Added vendor lock on depsHEAD master

author: Mitja Felicijan <mitja.felicijan@gmail.com> 2024-10-25 00:47:47 +0200
committer: Mitja Felicijan <mitja.felicijan@gmail.com> 2024-10-25 00:47:47 +0200
commit: c6cc0108ca7738023b45e0eeac0fa2390532dd93 (patch)
tree: 36890e6cd3091bbab8efbe686cc56f467f645bfd /vendor/github.com/DavidBelicza/TextRank/v2/parse/tokenizer.go
parent: 0130404a1dc663d4aa68d780c9bcb23a4243e68d (diff)
download: jbmafp-c6cc0108ca7738023b45e0eeac0fa2390532dd93.tar.gz
1 files changed, 63 insertions, 0 deletions
diff --git a/vendor/github.com/DavidBelicza/TextRank/v2/parse/tokenizer.go b/vendor/github.com/DavidBelicza/TextRank/v2/parse/tokenizer.go
new file mode 100644
index 0000000..003460e
--- /dev/null
+++ b/vendor/github.com/DavidBelicza/TextRank/v2/parse/tokenizer.go
@@ -0,0 +1,63 @@
+package parse
+
+import (
+	"strings"
+)
+
+// TokenizeText parses rawText according to rule and returns the result as a
+// Text value. All of the work is delegated to findSentences.
+func TokenizeText(rawText string, rule Rule) Text {
+	return findSentences(rawText, rule)
+}
+
+func findSentences(rawText string, rule Rule) Text {
+	text := Text{}
+
+	var sentence string
+	var i int
+	slen := len(rawText)
+
+	for j, chr := range rawText {
+		j += len(string(chr))
+		//when separator or the last
+		if rule.IsSentenceSeparator(chr) || j == slen {
+			sentence = rawText[i:j]
+			if len(sentence) > 0 {
+				text.Append(sentence, findWords(sentence, rule))
+			}
+
+			sentence = ""
+			i = j
+		}
+	}
+
+	return text
+}
+
+func findWords(rawSentence string, rule Rule) (words []string) {
+	words = []string{}
+
+	var word string
+	var i int
+	slen := len(rawSentence)
+
+	for j, chr := range rawSentence {
+		chrlen := len(string(chr))
+		j += chrlen
+		//when separator or the last
+		if sep := rule.IsWordSeparator(chr); sep || j == slen {
+			if sep {
+				word = rawSentence[i : j-chrlen]
+			} else {
+				word = rawSentence[i:j]
+			}
+			if len(word) > 0 {
+				words = append(words, strings.ToLower(word))
+			}
+			word = ""
+			i = j
+		}
+	}
+
+	return
+}
author	Mitja Felicijan <mitja.felicijan@gmail.com>	2024-10-25 00:47:47 +0200
committer	Mitja Felicijan <mitja.felicijan@gmail.com>	2024-10-25 00:47:47 +0200
commit	c6cc0108ca7738023b45e0eeac0fa2390532dd93 (patch)
tree	36890e6cd3091bbab8efbe686cc56f467f645bfd /vendor/github.com/DavidBelicza/TextRank/v2/parse/tokenizer.go
parent	0130404a1dc663d4aa68d780c9bcb23a4243e68d (diff)
download	jbmafp-c6cc0108ca7738023b45e0eeac0fa2390532dd93.tar.gz