diff options
| author | Mitja Felicijan <mitja.felicijan@gmail.com> | 2024-10-25 00:47:47 +0200 |
|---|---|---|
| committer | Mitja Felicijan <mitja.felicijan@gmail.com> | 2024-10-25 00:47:47 +0200 |
| commit | c6cc0108ca7738023b45e0eeac0fa2390532dd93 (patch) | |
| tree | 36890e6cd3091bbab8efbe686cc56f467f645bfd /vendor/github.com/DavidBelicza/TextRank/v2/parse/tokenizer.go | |
| parent | 0130404a1dc663d4aa68d780c9bcb23a4243e68d (diff) | |
| download | jbmafp-c6cc0108ca7738023b45e0eeac0fa2390532dd93.tar.gz | |
Diffstat (limited to 'vendor/github.com/DavidBelicza/TextRank/v2/parse/tokenizer.go')
| -rw-r--r-- | vendor/github.com/DavidBelicza/TextRank/v2/parse/tokenizer.go | 63 |
1 files changed, 63 insertions, 0 deletions
diff --git a/vendor/github.com/DavidBelicza/TextRank/v2/parse/tokenizer.go b/vendor/github.com/DavidBelicza/TextRank/v2/parse/tokenizer.go new file mode 100644 index 0000000..003460e --- /dev/null +++ b/vendor/github.com/DavidBelicza/TextRank/v2/parse/tokenizer.go @@ -0,0 +1,63 @@ +package parse + +import ( + "strings" +) + +// TokenizeText function use the given raw text and parses by a Rule object and +// retrieves the parsed text in a Text struct object. +func TokenizeText(rawText string, rule Rule) Text { + return findSentences(rawText, rule) +} + +func findSentences(rawText string, rule Rule) Text { + text := Text{} + + var sentence string + var i int + slen := len(rawText) + + for j, chr := range rawText { + j += len(string(chr)) + //when separator or the last + if rule.IsSentenceSeparator(chr) || j == slen { + sentence = rawText[i:j] + if len(sentence) > 0 { + text.Append(sentence, findWords(sentence, rule)) + } + + sentence = "" + i = j + } + } + + return text +} + +func findWords(rawSentence string, rule Rule) (words []string) { + words = []string{} + + var word string + var i int + slen := len(rawSentence) + + for j, chr := range rawSentence { + chrlen := len(string(chr)) + j += chrlen + //when separator or the last + if sep := rule.IsWordSeparator(chr); sep || j == slen { + if sep { + word = rawSentence[i : j-chrlen] + } else { + word = rawSentence[i:j] + } + if len(word) > 0 { + words = append(words, strings.ToLower(word)) + } + word = "" + i = j + } + } + + return +} |
