diff options
| author | Mitja Felicijan <mitja.felicijan@gmail.com> | 2024-10-25 00:47:47 +0200 |
|---|---|---|
| committer | Mitja Felicijan <mitja.felicijan@gmail.com> | 2024-10-25 00:47:47 +0200 |
| commit | c6cc0108ca7738023b45e0eeac0fa2390532dd93 (patch) | |
| tree | 36890e6cd3091bbab8efbe686cc56f467f645bfd /vendor/github.com/DavidBelicza/TextRank/v2/parse | |
| parent | 0130404a1dc663d4aa68d780c9bcb23a4243e68d (diff) | |
| download | jbmafp-c6cc0108ca7738023b45e0eeac0fa2390532dd93.tar.gz | |
Diffstat (limited to 'vendor/github.com/DavidBelicza/TextRank/v2/parse')
3 files changed, 159 insertions, 0 deletions
diff --git a/vendor/github.com/DavidBelicza/TextRank/v2/parse/rule.go b/vendor/github.com/DavidBelicza/TextRank/v2/parse/rule.go new file mode 100644 index 0000000..0f6ec91 --- /dev/null +++ b/vendor/github.com/DavidBelicza/TextRank/v2/parse/rule.go @@ -0,0 +1,52 @@ +package parse + +// Rule interface and its methods make possible the polimorf usage of process +// how Rule retrieve tokens from text. +type Rule interface { + IsWordSeparator(rune rune) bool + IsSentenceSeparator(rune rune) bool +} + +// RuleDefault struct implements the Rule interface. It contains the separator +// characters and can decide a character is separator or not. +type RuleDefault struct { + wordSeparators [21]string + sentenceSeparators [3]string +} + +// NewRule constructor retrieves a RuleDefault pointer. +func NewRule() *RuleDefault { + return &RuleDefault{ + [21]string{" ", ",", "'", "’", "\"", ")", "(", "[", "]", "{", "}", "\"", ";", "\n", ">", "<", "%", "@", "&", "=", "#"}, + [3]string{"!", ".", "?"}, + } +} + +// IsWordSeparator method retrieves true when a character is a kind of special +// character and possibly it separates to words from each other. It also checks +// for sentence separator by IsSentenceSeparator method. +func (r *RuleDefault) IsWordSeparator(rune rune) bool { + chr := string(rune) + + for _, val := range r.wordSeparators { + if chr == val { + return true + } + } + + return r.IsSentenceSeparator(rune) +} + +// IsSentenceSeparator method retrieves true when a character is a kind of +// special character and possibly it separates to words from each other. +func (r *RuleDefault) IsSentenceSeparator(rune rune) bool { + chr := string(rune) + + for _, val := range r.sentenceSeparators { + if chr == val { + return true + } + } + + return false +} diff --git a/vendor/github.com/DavidBelicza/TextRank/v2/parse/text.go b/vendor/github.com/DavidBelicza/TextRank/v2/parse/text.go new file mode 100644 index 0000000..aab27c3 --- /dev/null +++ b/vendor/github.com/DavidBelicza/TextRank/v2/parse/text.go @@ -0,0 +1,44 @@ +package parse + +// Text struct contains a parsed text. +type Text struct { + parsedSentences []ParsedSentence +} + +// ParsedSentence struct contains the original raw sentences and their words. +type ParsedSentence struct { + original string + words []string +} + +// Append method creates a sentence and its words and append them to the Text +// object. +func (text *Text) Append(rawSentence string, words []string) { + if len(words) > 0 { + parsedSentence := ParsedSentence{ + original: rawSentence, + words: words, + } + + text.parsedSentences = append( + text.parsedSentences, + parsedSentence, + ) + } +} + +// GetSentences method returns ParsedSentence slice from Text struct. +func (text *Text) GetSentences() []ParsedSentence { + return text.parsedSentences +} + +// GetWords methods returns the words string slice of ParsedSentence struct. +func (parsedSentence *ParsedSentence) GetWords() []string { + return parsedSentence.words +} + +// GetOriginal method returns the original sentence as a string from a +// ParsedSentence struct. +func (parsedSentence *ParsedSentence) GetOriginal() string { + return parsedSentence.original +} diff --git a/vendor/github.com/DavidBelicza/TextRank/v2/parse/tokenizer.go b/vendor/github.com/DavidBelicza/TextRank/v2/parse/tokenizer.go new file mode 100644 index 0000000..003460e --- /dev/null +++ b/vendor/github.com/DavidBelicza/TextRank/v2/parse/tokenizer.go @@ -0,0 +1,63 @@ +package parse + +import ( + "strings" +) + +// TokenizeText function use the given raw text and parses by a Rule object and +// retrieves the parsed text in a Text struct object. +func TokenizeText(rawText string, rule Rule) Text { + return findSentences(rawText, rule) +} + +func findSentences(rawText string, rule Rule) Text { + text := Text{} + + var sentence string + var i int + slen := len(rawText) + + for j, chr := range rawText { + j += len(string(chr)) + //when separator or the last + if rule.IsSentenceSeparator(chr) || j == slen { + sentence = rawText[i:j] + if len(sentence) > 0 { + text.Append(sentence, findWords(sentence, rule)) + } + + sentence = "" + i = j + } + } + + return text +} + +func findWords(rawSentence string, rule Rule) (words []string) { + words = []string{} + + var word string + var i int + slen := len(rawSentence) + + for j, chr := range rawSentence { + chrlen := len(string(chr)) + j += chrlen + //when separator or the last + if sep := rule.IsWordSeparator(chr); sep || j == slen { + if sep { + word = rawSentence[i : j-chrlen] + } else { + word = rawSentence[i:j] + } + if len(word) > 0 { + words = append(words, strings.ToLower(word)) + } + word = "" + i = j + } + } + + return +} |
