aboutsummaryrefslogtreecommitdiff
path: root/vendor/github.com/DavidBelicza/TextRank/v2/textrank.go
diff options
context:
space:
mode:
Diffstat (limited to 'vendor/github.com/DavidBelicza/TextRank/v2/textrank.go')
-rw-r--r--vendor/github.com/DavidBelicza/TextRank/v2/textrank.go194
1 files changed, 194 insertions, 0 deletions
diff --git a/vendor/github.com/DavidBelicza/TextRank/v2/textrank.go b/vendor/github.com/DavidBelicza/TextRank/v2/textrank.go
new file mode 100644
index 0000000..ed48ce3
--- /dev/null
+++ b/vendor/github.com/DavidBelicza/TextRank/v2/textrank.go
@@ -0,0 +1,194 @@
1package textrank
2
3import (
4 "github.com/DavidBelicza/TextRank/v2/convert"
5 "github.com/DavidBelicza/TextRank/v2/parse"
6 "github.com/DavidBelicza/TextRank/v2/rank"
7)
8
9// TextRank structure contains the Rank data object. This structure is a wrapper
10// around the whole text ranking functionality.
11type TextRank struct {
12 rank *rank.Rank
13}
14
15// NewTextRank constructor retrieves a TextRank pointer. This is the 1th step to
16// use TextRank.
17func NewTextRank() *TextRank {
18 return &TextRank{
19 rank.NewRank(),
20 }
21}
22
23// NewDefaultRule function retrieves a default Rule object what works in the
24// most cases in English or similar Latin languages like French or Spanish. The
25// Rule defines raw text how should be split to sentences and words. Because
26// Rule is an interface it's possible modify the ranking by inject different
27// Rule implementation. This is the 2nd step to use TextRank.
28func NewDefaultRule() *parse.RuleDefault {
29 return parse.NewRule()
30}
31
32// NewDefaultLanguage function retrieves a default Language object. It defines
33// what words are real and what words are just Stop Words or useless Junk Words.
34// It uses the default English Stop Words, but it's possible to set different
35// Stop Words in English or any other languages. Because Language is an
36// interface it's possible to modify the ranking by inject different Language
37// implementation. This is the 3rd step to use TextRank.
38func NewDefaultLanguage() *convert.LanguageDefault {
39 return convert.NewLanguage()
40}
41
42// NewDefaultAlgorithm function retrieves an Algorithm object. It defines how
43// should work the text ranking algorithm, the weighting. This is the general
44// text rank by weighting the connection between the words to find the strongest
45// phrases. Because Algorithm is an interface it's possible to modify the
46// ranking algorithm by inject different implementation. This is the 4th step to
47// use TextRank.
48func NewDefaultAlgorithm() *rank.AlgorithmDefault {
49 return rank.NewAlgorithmDefault()
50}
51
52// NewChainAlgorithm function retrieves an Algorithm object. It defines how
53// should work the text ranking algorithm, the weighting. This is an alternative
54// way to ranking words by weighting the number of the words. Because Algorithm
55// is an interface it's possible to modify the ranking algorithm by inject
56// different implementation. This is the 4th step to use TextRank.
57func NewChainAlgorithm() *rank.AlgorithmChain {
58 return rank.NewAlgorithmChain()
59}
60
61// Populate method adds a raw text to the text-ranking graph. It parses,
62// tokenize the raw text and prepares it to weighting and scoring. It's possible
63// to append a new raw text to an existing one even if the previously text is
64// already ranked. This is 5th step to use TextRank.
65//
66// text string must be a plain text from TXT or PDF or any document, it can
67// contain new lines, break lines or any unnecessary text parts, but it should
68// not contain HTML tags or codes.
69//
70// lang Language object can be loaded from NewDefaultLanguage function.
71//
72// rule Rule object can be loaded from NewDefaultRule function.
73func (textRank *TextRank) Populate(
74 text string,
75 lang convert.Language,
76 rule parse.Rule,
77) {
78 parsedText := parse.TokenizeText(text, rule)
79
80 for _, sentence := range parsedText.GetSentences() {
81 convert.TextToRank(sentence, lang, textRank.rank)
82 }
83}
84
85// Ranking method counts the words and connections between the words, then it
86// weights the numbers then normalize them in type float32 between 0.00 and
87// 1.00. This is the 6th step to use TextRank.
88//
89// algorithm Algorithm is the object of the weighting and scoring methods.
90func (textRank *TextRank) Ranking(algorithm rank.Algorithm) {
91 rank.Calculate(textRank.rank, algorithm)
92}
93
94// GetRankData method retrieves the Rank data to that case if the developer want
95// access to the whole graph and sentences, words, weights and all of the data
96// to analyze it or just implement a new search logic or finder method.
97func (textRank *TextRank) GetRankData() *rank.Rank {
98 return textRank.rank
99}
100
101// FindPhrases function retrieves a slice of Phrase structures by TextRank
102// object. The return value contains the sorted phrases with IDs, words, weights
103// and quantities by weight from 1 to 0. Weight is calculated from quantities of
104// relation between two words. A single phrase is from two words - not less and
105// more. (But it's possible to find chain of phrases by
106// FindSentencesByPhraseChain function.)
107func FindPhrases(textRank *TextRank) []rank.Phrase {
108 return rank.FindPhrases(textRank.rank)
109}
110
111// FindSingleWords function retrieves a slice of SingleWord structures by
112// TextRank object. The return value contains the sorted words with IDs, words,
113// weights and quantities by weight from 1 to 0. Weight is calculated from
114// quantities of word.
115func FindSingleWords(textRank *TextRank) []rank.SingleWord {
116 return rank.FindSingleWords(textRank.rank)
117}
118
119// FindSentencesByRelationWeight function retrieves a slice of Sentence
120// structures by TextRank object. The return value contains the ID of the
121// sentence and the sentence text itself. The slice is sorted by weight of
122// phrases from 1 to 0.
123func FindSentencesByRelationWeight(
124 textRank *TextRank,
125 limit int,
126) []rank.Sentence {
127
128 return rank.FindSentences(textRank.rank, rank.ByRelation, limit)
129}
130
131// FindSentencesByWordQtyWeight function retrieves a slice of Sentence
132// structures by TextRank object. The return value contains the ID of the
133// sentence and the sentence text itself. The slice is sorted by weight of word
134// quantities from 1 to 0.
135func FindSentencesByWordQtyWeight(
136 textRank *TextRank,
137 limit int,
138) []rank.Sentence {
139
140 return rank.FindSentences(textRank.rank, rank.ByQty, limit)
141}
142
143// FindSentencesByPhraseChain function retrieves a slice of Sentence structures
144// by TextRank object and slice of phrases. The return value contains the ID of
145// the sentence and the sentence text itself. The slice is sorted by weight of
146// word quantities from 1 to 0.
147//
148// textRank TextRank is the object of the TextRank.
149//
150// phrases []string is a slice of phrases. A single phrase is from two words, so
151// when the slice contains 3 words the inner method will search for two phrases.
152// The search algorithm seeks for "len(phrases)!". In case of three item the
153// possible combination is 3 factorial (3!) = 3 * 2 * 1.
154//
155// rawText := "Long raw text, lorem ipsum..."
156// rule := NewDefaultRule()
157// language := NewDefaultLanguage()
158// algorithm := NewDefaultAlgorithm()
159//
160// Append(rawText, language, rule, 1)
161// Ranking(1, algorithm)
162//
163// FindSentencesByPhraseChain(1, []string{
164// "captain",
165// "james",
166// "kirk",
167// })
168//
169// The above code searches for captain james kirk, captain kirk james, james
170// kirk captain, james captain kirk, kirk james captain and james kirk captain
171// combinations in the graph. The 3 of words have to be related to each other
172// in the same sentence but the search algorithm ignores the stop words. So if
173// there is a sentence "James Kirk is the Captain of the Enterprise." the
174// sentence will be returned because the words "is" and "the" are stop words.
175func FindSentencesByPhraseChain(
176 textRank *TextRank,
177 phrases []string,
178) []rank.Sentence {
179
180 return rank.FindSentencesByPhrases(textRank.rank, phrases)
181}
182
183// FindSentencesFrom function retrieves a slice of Sentence structures by
184// TextRank object and by ID of the sentence. The return value contains the
185// sentence text itself. The returned slice contains sentences sorted by their
186// IDs started from the given sentence ID in ascending sort.
187func FindSentencesFrom(
188 textRank *TextRank,
189 sentenceID int,
190 limit int,
191) []rank.Sentence {
192
193 return rank.FindSentencesFrom(textRank.rank, sentenceID, limit)
194}