From c6cc0108ca7738023b45e0eeac0fa2390532dd93 Mon Sep 17 00:00:00 2001 From: Mitja Felicijan Date: Fri, 25 Oct 2024 00:47:47 +0200 Subject: Added vendor lock on deps --- .../DavidBelicza/TextRank/v2/parse/tokenizer.go | 63 ++++++++++++++++++++++ 1 file changed, 63 insertions(+) create mode 100644 vendor/github.com/DavidBelicza/TextRank/v2/parse/tokenizer.go (limited to 'vendor/github.com/DavidBelicza/TextRank/v2/parse/tokenizer.go') diff --git a/vendor/github.com/DavidBelicza/TextRank/v2/parse/tokenizer.go b/vendor/github.com/DavidBelicza/TextRank/v2/parse/tokenizer.go new file mode 100644 index 0000000..003460e --- /dev/null +++ b/vendor/github.com/DavidBelicza/TextRank/v2/parse/tokenizer.go @@ -0,0 +1,63 @@ +package parse + +import ( + "strings" +) + +// TokenizeText function use the given raw text and parses by a Rule object and +// retrieves the parsed text in a Text struct object. +func TokenizeText(rawText string, rule Rule) Text { + return findSentences(rawText, rule) +} + +func findSentences(rawText string, rule Rule) Text { + text := Text{} + + var sentence string + var i int + slen := len(rawText) + + for j, chr := range rawText { + j += len(string(chr)) + //when separator or the last + if rule.IsSentenceSeparator(chr) || j == slen { + sentence = rawText[i:j] + if len(sentence) > 0 { + text.Append(sentence, findWords(sentence, rule)) + } + + sentence = "" + i = j + } + } + + return text +} + +func findWords(rawSentence string, rule Rule) (words []string) { + words = []string{} + + var word string + var i int + slen := len(rawSentence) + + for j, chr := range rawSentence { + chrlen := len(string(chr)) + j += chrlen + //when separator or the last + if sep := rule.IsWordSeparator(chr); sep || j == slen { + if sep { + word = rawSentence[i : j-chrlen] + } else { + word = rawSentence[i:j] + } + if len(word) > 0 { + words = append(words, strings.ToLower(word)) + } + word = "" + i = j + } + } + + return +} -- cgit v1.2.3