summaryrefslogtreecommitdiff
path: root/vendor/github.com/DavidBelicza/TextRank/v2/parse
diff options
context:
space:
mode:
author Mitja Felicijan <mitja.felicijan@gmail.com> 2024-10-25 00:47:47 +0200
committer Mitja Felicijan <mitja.felicijan@gmail.com> 2024-10-25 00:47:47 +0200
commit c6cc0108ca7738023b45e0eeac0fa2390532dd93 (patch)
tree 36890e6cd3091bbab8efbe686cc56f467f645bfd /vendor/github.com/DavidBelicza/TextRank/v2/parse
parent 0130404a1dc663d4aa68d780c9bcb23a4243e68d (diff)
download jbmafp-c6cc0108ca7738023b45e0eeac0fa2390532dd93.tar.gz
Added vendor lock on deps (HEAD, master)
Diffstat (limited to 'vendor/github.com/DavidBelicza/TextRank/v2/parse')
-rw-r--r-- vendor/github.com/DavidBelicza/TextRank/v2/parse/rule.go 52
-rw-r--r-- vendor/github.com/DavidBelicza/TextRank/v2/parse/text.go 44
-rw-r--r-- vendor/github.com/DavidBelicza/TextRank/v2/parse/tokenizer.go 63
3 files changed, 159 insertions, 0 deletions
diff --git a/vendor/github.com/DavidBelicza/TextRank/v2/parse/rule.go b/vendor/github.com/DavidBelicza/TextRank/v2/parse/rule.go
new file mode 100644
index 0000000..0f6ec91
--- /dev/null
+++ b/vendor/github.com/DavidBelicza/TextRank/v2/parse/rule.go
@@ -0,0 +1,52 @@
+package parse
+
+// Rule abstracts how text is split into tokens: an implementation decides
+// which runes separate words and which runes separate sentences.
+type Rule interface {
+ IsWordSeparator(rune rune) bool
+ IsSentenceSeparator(rune rune) bool
+}
+
+// RuleDefault is the Rule implementation shipped with this package. It keeps
+// its separator characters as strings in two fixed-size arrays.
+type RuleDefault struct {
+ wordSeparators [21]string // characters that end a word
+ sentenceSeparators [3]string // characters that end a sentence
+}
+
+// NewRule returns a pointer to a RuleDefault filled with the default word and
+// sentence separators. The double quote is listed twice among the word
+// separators; the duplicate is harmless and only pads the array to 21 entries.
+func NewRule() *RuleDefault {
+	return &RuleDefault{
+		wordSeparators:     [21]string{" ", ",", "'", "’", "\"", ")", "(", "[", "]", "{", "}", "\"", ";", "\n", ">", "<", "%", "@", "&", "=", "#"},
+		sentenceSeparators: [3]string{"!", ".", "?"},
+	}
+}
+
+// IsWordSeparator reports whether the given character ends a word. It returns
+// true when the character is listed among the word separators and otherwise
+// falls back to IsSentenceSeparator, so every sentence separator also counts
+// as a word separator.
+func (r *RuleDefault) IsWordSeparator(rune rune) bool {
+	target := string(rune)
+
+	for idx := range r.wordSeparators {
+		if r.wordSeparators[idx] == target {
+			return true
+		}
+	}
+
+	return r.IsSentenceSeparator(rune)
+}
+
+// IsSentenceSeparator reports whether the given character is one of the
+// configured sentence separators, i.e. whether it ends a sentence.
+func (r *RuleDefault) IsSentenceSeparator(rune rune) bool {
+	target := string(rune)
+
+	for idx := range r.sentenceSeparators {
+		if r.sentenceSeparators[idx] == target {
+			return true
+		}
+	}
+
+	return false
+}
diff --git a/vendor/github.com/DavidBelicza/TextRank/v2/parse/text.go b/vendor/github.com/DavidBelicza/TextRank/v2/parse/text.go
new file mode 100644
index 0000000..aab27c3
--- /dev/null
+++ b/vendor/github.com/DavidBelicza/TextRank/v2/parse/text.go
@@ -0,0 +1,44 @@
+package parse
+
+// Text holds the result of parsing: the sentences collected through Append.
+type Text struct {
+ parsedSentences []ParsedSentence
+}
+
+// ParsedSentence pairs one raw sentence with the words extracted from it.
+type ParsedSentence struct {
+ original string // sentence text exactly as handed to Text.Append
+ words []string // words exactly as handed to Text.Append
+}
+
+// Append stores rawSentence together with its words as a new ParsedSentence at
+// the end of the Text. A call with an empty words slice is ignored, so the
+// Text never holds a sentence without words.
+func (text *Text) Append(rawSentence string, words []string) {
+	if len(words) == 0 {
+		return
+	}
+
+	entry := ParsedSentence{original: rawSentence, words: words}
+	text.parsedSentences = append(text.parsedSentences, entry)
+}
+
+// GetSentences returns the ParsedSentence slice stored in the Text, not a copy.
+func (text *Text) GetSentences() []ParsedSentence {
+ return text.parsedSentences
+}
+
+// GetWords returns the words slice held by the ParsedSentence, not a copy.
+func (parsedSentence *ParsedSentence) GetWords() []string {
+ return parsedSentence.words
+}
+
+// GetOriginal returns the raw sentence text that the ParsedSentence was
+// created with.
+func (parsedSentence *ParsedSentence) GetOriginal() string {
+ return parsedSentence.original
+}
diff --git a/vendor/github.com/DavidBelicza/TextRank/v2/parse/tokenizer.go b/vendor/github.com/DavidBelicza/TextRank/v2/parse/tokenizer.go
new file mode 100644
index 0000000..003460e
--- /dev/null
+++ b/vendor/github.com/DavidBelicza/TextRank/v2/parse/tokenizer.go
@@ -0,0 +1,63 @@
+package parse
+
+import (
+ "strings"
+)
+
+// TokenizeText parses rawText with the given Rule and returns the result as a
+// Text. It simply forwards both arguments to findSentences.
+func TokenizeText(rawText string, rule Rule) Text {
+ return findSentences(rawText, rule)
+}
+
+func findSentences(rawText string, rule Rule) Text {
+ text := Text{}
+
+ var sentence string
+ var i int
+ slen := len(rawText)
+
+ for j, chr := range rawText {
+ j += len(string(chr))
+ //when separator or the last
+ if rule.IsSentenceSeparator(chr) || j == slen {
+ sentence = rawText[i:j]
+ if len(sentence) > 0 {
+ text.Append(sentence, findWords(sentence, rule))
+ }
+
+ sentence = ""
+ i = j
+ }
+ }
+
+ return text
+}
+
+func findWords(rawSentence string, rule Rule) (words []string) {
+ words = []string{}
+
+ var word string
+ var i int
+ slen := len(rawSentence)
+
+ for j, chr := range rawSentence {
+ chrlen := len(string(chr))
+ j += chrlen
+ //when separator or the last
+ if sep := rule.IsWordSeparator(chr); sep || j == slen {
+ if sep {
+ word = rawSentence[i : j-chrlen]
+ } else {
+ word = rawSentence[i:j]
+ }
+ if len(word) > 0 {
+ words = append(words, strings.ToLower(word))
+ }
+ word = ""
+ i = j
+ }
+ }
+
+ return
+}