summaryrefslogtreecommitdiff
path: root/vendor/github.com/DavidBelicza/TextRank/v2/parse/tokenizer.go
diff options
context:
space:
mode:
authorMitja Felicijan <mitja.felicijan@gmail.com>2024-10-25 00:47:47 +0200
committerMitja Felicijan <mitja.felicijan@gmail.com>2024-10-25 00:47:47 +0200
commitc6cc0108ca7738023b45e0eeac0fa2390532dd93 (patch)
tree36890e6cd3091bbab8efbe686cc56f467f645bfd /vendor/github.com/DavidBelicza/TextRank/v2/parse/tokenizer.go
parent0130404a1dc663d4aa68d780c9bcb23a4243e68d (diff)
downloadjbmafp-master.tar.gz
Added vendor lock on depsHEADmaster
Diffstat (limited to 'vendor/github.com/DavidBelicza/TextRank/v2/parse/tokenizer.go')
-rw-r--r--vendor/github.com/DavidBelicza/TextRank/v2/parse/tokenizer.go63
1 files changed, 63 insertions, 0 deletions
diff --git a/vendor/github.com/DavidBelicza/TextRank/v2/parse/tokenizer.go b/vendor/github.com/DavidBelicza/TextRank/v2/parse/tokenizer.go
new file mode 100644
index 0000000..003460e
--- /dev/null
+++ b/vendor/github.com/DavidBelicza/TextRank/v2/parse/tokenizer.go
@@ -0,0 +1,63 @@
+package parse
+
+import (
+ "strings"
+)
+
+// TokenizeText function use the given raw text and parses by a Rule object and
+// retrieves the parsed text in a Text struct object.
+func TokenizeText(rawText string, rule Rule) Text {
+ return findSentences(rawText, rule)
+}
+
+func findSentences(rawText string, rule Rule) Text {
+ text := Text{}
+
+ var sentence string
+ var i int
+ slen := len(rawText)
+
+ for j, chr := range rawText {
+ j += len(string(chr))
+ //when separator or the last
+ if rule.IsSentenceSeparator(chr) || j == slen {
+ sentence = rawText[i:j]
+ if len(sentence) > 0 {
+ text.Append(sentence, findWords(sentence, rule))
+ }
+
+ sentence = ""
+ i = j
+ }
+ }
+
+ return text
+}
+
+func findWords(rawSentence string, rule Rule) (words []string) {
+ words = []string{}
+
+ var word string
+ var i int
+ slen := len(rawSentence)
+
+ for j, chr := range rawSentence {
+ chrlen := len(string(chr))
+ j += chrlen
+ //when separator or the last
+ if sep := rule.IsWordSeparator(chr); sep || j == slen {
+ if sep {
+ word = rawSentence[i : j-chrlen]
+ } else {
+ word = rawSentence[i:j]
+ }
+ if len(word) > 0 {
+ words = append(words, strings.ToLower(word))
+ }
+ word = ""
+ i = j
+ }
+ }
+
+ return
+}