aboutsummaryrefslogtreecommitdiff
path: root/vendor/github.com/DavidBelicza
diff options
context:
space:
mode:
Diffstat (limited to 'vendor/github.com/DavidBelicza')
-rw-r--r--vendor/github.com/DavidBelicza/TextRank/v2/.gitignore6
-rw-r--r--vendor/github.com/DavidBelicza/TextRank/v2/.travis.yml10
-rw-r--r--vendor/github.com/DavidBelicza/TextRank/v2/Dockerfile9
-rw-r--r--vendor/github.com/DavidBelicza/TextRank/v2/LICENSE21
-rw-r--r--vendor/github.com/DavidBelicza/TextRank/v2/README.md543
-rw-r--r--vendor/github.com/DavidBelicza/TextRank/v2/convert/builder.go43
-rw-r--r--vendor/github.com/DavidBelicza/TextRank/v2/convert/language.go71
-rw-r--r--vendor/github.com/DavidBelicza/TextRank/v2/convert/stop_word.go332
-rw-r--r--vendor/github.com/DavidBelicza/TextRank/v2/doc.go445
-rw-r--r--vendor/github.com/DavidBelicza/TextRank/v2/install.example.sh4
-rw-r--r--vendor/github.com/DavidBelicza/TextRank/v2/parse/rule.go52
-rw-r--r--vendor/github.com/DavidBelicza/TextRank/v2/parse/text.go44
-rw-r--r--vendor/github.com/DavidBelicza/TextRank/v2/parse/tokenizer.go63
-rw-r--r--vendor/github.com/DavidBelicza/TextRank/v2/rank/algorithm.go99
-rw-r--r--vendor/github.com/DavidBelicza/TextRank/v2/rank/rank.go147
-rw-r--r--vendor/github.com/DavidBelicza/TextRank/v2/rank/ranking.go66
-rw-r--r--vendor/github.com/DavidBelicza/TextRank/v2/rank/relation.go77
-rw-r--r--vendor/github.com/DavidBelicza/TextRank/v2/rank/sorting.go202
-rw-r--r--vendor/github.com/DavidBelicza/TextRank/v2/textrank.go194
19 files changed, 2428 insertions, 0 deletions
diff --git a/vendor/github.com/DavidBelicza/TextRank/v2/.gitignore b/vendor/github.com/DavidBelicza/TextRank/v2/.gitignore
new file mode 100644
index 0000000..f83ccd4
--- /dev/null
+++ b/vendor/github.com/DavidBelicza/TextRank/v2/.gitignore
@@ -0,0 +1,6 @@
1/.vscode
2/.idea
3/pkg
4/bin
5/install.sh
6/vendor
diff --git a/vendor/github.com/DavidBelicza/TextRank/v2/.travis.yml b/vendor/github.com/DavidBelicza/TextRank/v2/.travis.yml
new file mode 100644
index 0000000..899b6a3
--- /dev/null
+++ b/vendor/github.com/DavidBelicza/TextRank/v2/.travis.yml
@@ -0,0 +1,10 @@
1language: go
2sudo: false
3
4matrix:
5 include:
6 - go: "1.15"
7
8script:
9 - go mod vendor
10 - go test ./...
diff --git a/vendor/github.com/DavidBelicza/TextRank/v2/Dockerfile b/vendor/github.com/DavidBelicza/TextRank/v2/Dockerfile
new file mode 100644
index 0000000..d31839a
--- /dev/null
+++ b/vendor/github.com/DavidBelicza/TextRank/v2/Dockerfile
@@ -0,0 +1,9 @@
FROM golang:1.15
# MAINTAINER is deprecated; the maintainer is recorded as a label instead.
LABEL maintainer="David Belicza"

ADD ./ /go/src/github.com/DavidBelicza/TextRank

WORKDIR /go/src/github.com/DavidBelicza/TextRank

# Vendor the dependencies while the image is built. Only the last CMD in a
# Dockerfile takes effect, so this step has to be RUN; as a CMD it never ran.
RUN go mod vendor
CMD /bin/bash
diff --git a/vendor/github.com/DavidBelicza/TextRank/v2/LICENSE b/vendor/github.com/DavidBelicza/TextRank/v2/LICENSE
new file mode 100644
index 0000000..960d66b
--- /dev/null
+++ b/vendor/github.com/DavidBelicza/TextRank/v2/LICENSE
@@ -0,0 +1,21 @@
1The MIT License (MIT)
2
3Copyright 2018 David Belicza
4
5Permission is hereby granted, free of charge, to any person obtaining a copy of
6this software and associated documentation files (the "Software"), to deal in
7the Software without restriction, including without limitation the rights to
8use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
9of the Software, and to permit persons to whom the Software is furnished to do
10so, subject to the following conditions:
11
12The above copyright notice and this permission notice shall be included in all
13copies or substantial portions of the Software.
14
15THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21SOFTWARE.
diff --git a/vendor/github.com/DavidBelicza/TextRank/v2/README.md b/vendor/github.com/DavidBelicza/TextRank/v2/README.md
new file mode 100644
index 0000000..7b17b64
--- /dev/null
+++ b/vendor/github.com/DavidBelicza/TextRank/v2/README.md
@@ -0,0 +1,543 @@
1<h1 align="center">
2TextRank on Go
3</h1>
4
5<p align="center">
6 <a href="https://godoc.org/github.com/DavidBelicza/TextRank">
7 <img src="https://godoc.org/github.com/DavidBelicza/TextRank?status.svg" alt="GoDoc" />
8 </a>
9 <a href="https://github.com/DavidBelicza/TextRank/blob/master/LICENSE">
10 <img src="https://img.shields.io/badge/License-MIT-ee00ee.svg" alt="License: MIT" />
11 </a>
12 <a href="https://travis-ci.org/DavidBelicza/TextRank">
13 <img src="https://travis-ci.org/DavidBelicza/TextRank.svg?branch=master" alt="Build Status" />
14 </a>
15 <a href="https://goreportcard.com/report/github.com/DavidBelicza/TextRank">
16 <img src="https://goreportcard.com/badge/github.com/DavidBelicza/TextRank" alt="Go Report Card" />
17 </a>
18 <a href="https://coveralls.io/github/DavidBelicza/TextRank?branch=master">
19 <img src="https://coveralls.io/repos/github/DavidBelicza/TextRank/badge.svg?branch=master" alt="Coverage Status" />
20 </a>
21 <a href="https://github.com/DavidBelicza/TextRank/releases/latest">
22 <img src="https://img.shields.io/github/release/DavidBelicza/TextRank.svg?colorB=269aca" alt="Release" />
23 </a>
24
25</p>
26
27<p align="center">
28This source code is an implementation of textrank algorithm, under MIT licence.
29<br />The minimum required Go version is 1.8.
30</p>
31<br />
32
33## MOTIVATION
34
35If there was a program that could rank the words, phrases and sentences of book-sized texts continuously on multiple threads, that was open to modification through objects, that was written in a simple, secure, static language and that was very well documented... Now, here it is.
36
37## FEATURES
38
39* Find the most important phrases.
40* Find the most important words.
41* Find the most important N sentences.
42 * Importance by phrase weights.
43 * Importance by word occurrence.
44* Find the first N sentences, start from Xth sentence.
45* Find sentences by phrase chains ordered by position in text.
46* Access to the whole ranked data.
47* Support more languages.
48* Algorithm for weighting can be modified by interface implementation.
49* Parser can be modified by interface implementation.
50* Multi thread support.
51
52## INSTALL
53
54You can install TextRank by Go's get:
55
56```go get github.com/DavidBelicza/TextRank/v2```
57
58TextRank uses the default Go *mod* as vendoring tool, so you can install the dependencies with this command:
59
60```go mod vendor```
61
62## DOCKER
63
64Using Docker with TextRank isn't necessary; it's just an option.
65
66Build image from the repository's root directory:
67
68```docker build -t go_text_rank_image .```
69
70Create container from the image:
71
72```docker run -dit --name textrank go_text_rank_image:latest```
73
74Run the **go test -v .** code inside the container:
75
76```docker exec -i -t textrank go test -v .```
77
78Stop, start or remove the container:
79
80* ```docker stop textrank```
81* ```docker start textrank```
82* ```docker rm textrank```
83
84## HOW DOES IT WORK
85
86To see how it works, the easiest way is to use the sample text. The sample text can be found in the [textrank_test.go file at this line](https://github.com/DavidBelicza/TextRank/blob/master/textrank_test.go#L12). It's a short text about Gnome Shell.
87
88* TextRank reads the text,
89 * parse it,
90 * remove the unnecessary stop words,
91 * tokenize it
92* and counting the occurrence of the words and phrases
93* and then it starts weighting
94 * by the occurrence of words and phrases and their relations.
95* After weights are done, TextRank normalizes the weights to between 0 and 1.
96* Then the different finder methods are able to find the most important words, phrases or sentences.
97
98The most important phrases from the sample text are:
99
100Phrase | Occurrence | Weight
101--- | --- | ---
102gnome - shell | 5 | 1
103extension - gnome | 3 | 0.50859946
104icons - tray | 3 | 0.49631447
105gnome - caffeine | 2 | 0.27027023
106
107The **gnome** is the most often used word in this text and **shell** is also used multiple times. Two of them are used together as a phrase 5 times. This is the highest occurrence in this text, so this is the most important phrase.
108
109The following two important phrases have the same occurrence, 3, however they are not equal. This is because the **extension gnome** phrase contains the word **gnome**, the most popular word in the text, and it increases the phrase's weight. It increases the weight of any word that is related to it, but not so much that it overcomes other important phrases that don't contain the word **gnome**.
110
111The exact algorithm can be found in the [algorithm.go file at this line](https://github.com/DavidBelicza/TextRank/blob/master/rank/algorithm.go#L65).
112
113## TEXTRANK OR AUTOMATIC SUMMARIZATION
114> Automatic summarization is the process of reducing a text document with a computer program in order to create a summary that retains the most important points of the original document. Technologies that can make a coherent summary take into account variables such as length, writing style and syntax. Automatic data summarization is part of machine learning and data mining. The main idea of summarization is to find a representative subset of the data, which contains the information of the entire set. Summarization technologies are used in a large number of sectors in industry today. - Wikipedia
115
116## EXAMPLES
117
118### Find the most important phrases
119
120This is the most basic and simplest usage of textrank.
121
122```go
123package main
124
125import (
126 "fmt"
127
128 "github.com/DavidBelicza/TextRank/v2"
129)
130
131func main() {
132 rawText := "Your long raw text, it could be a book. Lorem ipsum..."
133 // TextRank object
134 tr := textrank.NewTextRank()
135 // Default Rule for parsing.
136 rule := textrank.NewDefaultRule()
137 // Default Language for filtering stop words.
138 language := textrank.NewDefaultLanguage()
139 // Default algorithm for ranking text.
140 algorithmDef := textrank.NewDefaultAlgorithm()
141
142 // Add text.
143 tr.Populate(rawText, language, rule)
144 // Run the ranking.
145 tr.Ranking(algorithmDef)
146
147 // Get all phrases by weight.
148 rankedPhrases := textrank.FindPhrases(tr)
149
150 // Most important phrase.
151 fmt.Println(rankedPhrases[0])
152 // Second important phrase.
153 fmt.Println(rankedPhrases[1])
154}
155```
156
157### All possible pre-defined finder queries
158
159After ranking, the graph contains a lot of valuable data. There are functions in the textrank package that contain the logic to retrieve this data from the graph.
160
161```go
162package main
163
164import (
165 "fmt"
166
167 "github.com/DavidBelicza/TextRank/v2"
168)
169
170func main() {
171 rawText := "Your long raw text, it could be a book. Lorem ipsum..."
172 // TextRank object
173 tr := textrank.NewTextRank()
174 // Default Rule for parsing.
175 rule := textrank.NewDefaultRule()
176 // Default Language for filtering stop words.
177 language := textrank.NewDefaultLanguage()
178 // Default algorithm for ranking text.
179 algorithmDef := textrank.NewDefaultAlgorithm()
180
181 // Add text.
182 tr.Populate(rawText, language, rule)
183 // Run the ranking.
184 tr.Ranking(algorithmDef)
185
186 // Get all phrases order by weight.
187 rankedPhrases := textrank.FindPhrases(tr)
188 // Most important phrase.
189 fmt.Println(rankedPhrases[0])
190
191 // Get all words order by weight.
192 words := textrank.FindSingleWords(tr)
193 // Most important word.
194 fmt.Println(words[0])
195
196 // Get the most important 10 sentences. Importance by phrase weights.
197 sentences := textrank.FindSentencesByRelationWeight(tr, 10)
198 // Found sentences
199 fmt.Println(sentences)
200
201 // Get the most important 10 sentences. Importance by word occurrence.
202 sentences = textrank.FindSentencesByWordQtyWeight(tr, 10)
203 // Found sentences
204 fmt.Println(sentences)
205
206 // Get the first 10 sentences, start from 5th sentence.
207 sentences = textrank.FindSentencesFrom(tr, 5, 10)
208 // Found sentences
209 fmt.Println(sentences)
210
211 // Get sentences by phrase/word chains order by position in text.
212 sentencesPh := textrank.FindSentencesByPhraseChain(tr, []string{"gnome", "shell", "extension"})
213 // Found sentence.
214 fmt.Println(sentencesPh[0])
215}
216```
217
218### Access to everything
219
220After ranking, the graph contains a lot of valuable data. The GetRank function allows access to the graph and every data can be retrieved from this structure.
221
222```go
223package main
224
225import (
226 "fmt"
227
228 "github.com/DavidBelicza/TextRank/v2"
229)
230
231func main() {
232 rawText := "Your long raw text, it could be a book. Lorem ipsum..."
233 // TextRank object
234 tr := textrank.NewTextRank()
235 // Default Rule for parsing.
236 rule := textrank.NewDefaultRule()
237 // Default Language for filtering stop words.
238 language := textrank.NewDefaultLanguage()
239 // Default algorithm for ranking text.
240 algorithmDef := textrank.NewDefaultAlgorithm()
241
242 // Add text.
243 tr.Populate(rawText, language, rule)
244 // Run the ranking.
245 tr.Ranking(algorithmDef)
246
247 // Get the rank graph.
248 rankData := tr.GetRankData()
249
250 // Get word ID by token/word.
251 wordId := rankData.WordValID["gnome"]
252
253 // Word's weight.
254 fmt.Println(rankData.Words[wordId].Weight)
255 // Word's quantity/occurrence.
256 fmt.Println(rankData.Words[wordId].Qty)
257 // All sentences that contain this word.
258 fmt.Println(rankData.Words[wordId].SentenceIDs)
259 // All other words that are related to this word on the left side.
260 fmt.Println(rankData.Words[wordId].ConnectionLeft)
261 // All other words that are related to this word on the right side.
262 fmt.Println(rankData.Words[wordId].ConnectionRight)
263 // The node of this word, it contains the related words and the relation weight.
264 fmt.Println(rankData.Relation.Node[wordId])
265}
266```
267
268### Adding text continuously
269
270It is possible to add more text after other texts have already been added. The Ranking function can merge these multiple texts and it can recalculate the weights and all related data.
271
272```go
273package main
274
275import (
276 "fmt"
277
278 "github.com/DavidBelicza/TextRank/v2"
279)
280
281func main() {
282 rawText := "Your long raw text, it could be a book. Lorem ipsum..."
283 // TextRank object
284 tr := textrank.NewTextRank()
285 // Default Rule for parsing.
286 rule := textrank.NewDefaultRule()
287 // Default Language for filtering stop words.
288 language := textrank.NewDefaultLanguage()
289 // Default algorithm for ranking text.
290 algorithmDef := textrank.NewDefaultAlgorithm()
291
292 // Add text.
293 tr.Populate(rawText, language, rule)
294 // Run the ranking.
295 tr.Ranking(algorithmDef)
296
297 rawText2 := "Another book or article..."
298 rawText3 := "Third another book or article..."
299
300 // Add text to the previously added text.
301 tr.Populate(rawText2, language, rule)
302 // Add text to the previously added text.
303 tr.Populate(rawText3, language, rule)
304 // Run the ranking to the whole composed text.
305 tr.Ranking(algorithmDef)
306
307 // Get all phrases by weight.
308 rankedPhrases := textrank.FindPhrases(tr)
309
310 // Most important phrase.
311 fmt.Println(rankedPhrases[0])
312 // Second important phrase.
313 fmt.Println(rankedPhrases[1])
314}
315```
316
317### Using different algorithm to ranking text
318
319Two algorithms are implemented. It is also possible to write a custom algorithm by implementing the Algorithm interface and to use it instead of the defaults.
320
321```go
322package main
323
324import (
325 "fmt"
326
327 "github.com/DavidBelicza/TextRank/v2"
328)
329
330func main() {
331 rawText := "Your long raw text, it could be a book. Lorem ipsum..."
332 // TextRank object
333 tr := textrank.NewTextRank()
334 // Default Rule for parsing.
335 rule := textrank.NewDefaultRule()
336 // Default Language for filtering stop words.
337 language := textrank.NewDefaultLanguage()
338 // Using a little bit more complex algorithm to ranking text.
339 algorithmChain := textrank.NewChainAlgorithm()
340
341 // Add text.
342 tr.Populate(rawText, language, rule)
343 // Run the ranking.
344 tr.Ranking(algorithmChain)
345
346 // Get all phrases by weight.
347 rankedPhrases := textrank.FindPhrases(tr)
348
349 // Most important phrase.
350 fmt.Println(rankedPhrases[0])
351 // Second important phrase.
352 fmt.Println(rankedPhrases[1])
353}
354```
355
356### Using multiple graphs
357
358Graph ID exists because it is possible to run multiple independent text ranking processes.
359
360```go
361package main
362
363import (
364 "fmt"
365
366 "github.com/DavidBelicza/TextRank/v2"
367)
368
369func main() {
370 rawText := "Your long raw text, it could be a book. Lorem ipsum..."
371 // 1st TextRank object
372 tr1 := textrank.NewTextRank()
373 // Default Rule for parsing.
374 rule := textrank.NewDefaultRule()
375 // Default Language for filtering stop words.
376 language := textrank.NewDefaultLanguage()
377 // Default algorithm for ranking text.
378 algorithmDef := textrank.NewDefaultAlgorithm()
379
380 // Add text.
381 tr1.Populate(rawText, language, rule)
382 // Run the ranking.
383 tr1.Ranking(algorithmDef)
384
385 // 2nd TextRank object
386 tr2 := textrank.NewTextRank()
387
388 // Using a little bit more complex algorithm to ranking text.
389 algorithmChain := textrank.NewChainAlgorithm()
390
391 // Add text to the second graph.
392 tr2.Populate(rawText, language, rule)
393 // Run the ranking on the second graph.
394 tr2.Ranking(algorithmChain)
395
396 // Get all phrases by weight from first graph.
397 rankedPhrases := textrank.FindPhrases(tr1)
398
399 // Most important phrase from first graph.
400 fmt.Println(rankedPhrases[0])
401 // Second important phrase from first graph.
402 fmt.Println(rankedPhrases[1])
403
404 // Get all phrases by weight from second graph.
405 rankedPhrases2 := textrank.FindPhrases(tr2)
406
407 // Most important phrase from second graph.
408 fmt.Println(rankedPhrases2[0])
409 // Second important phrase from second graph.
410 fmt.Println(rankedPhrases2[1])
411}
412```
413
414### Using different non-English languages
415
416English is used by default but it is possible to add any language. To use other languages a stop word list is required, which you can find here: https://github.com/stopwords-iso
417
418```go
419package main
420
421import (
422 "fmt"
423
424 "github.com/DavidBelicza/TextRank/v2"
425)
426
427func main() {
428 rawText := "Your long raw text, it could be a book. Lorem ipsum..."
429 // TextRank object
430 tr := textrank.NewTextRank()
431 // Default Rule for parsing.
432 rule := textrank.NewDefaultRule()
433 // Default Language for filtering stop words.
434 language := textrank.NewDefaultLanguage()
435
436 // Add Spanish stop words (just some example).
437 language.SetWords("es", []string{"uno", "dos", "tres", "yo", "es", "eres"})
438 // Activate Spanish.
439 language.SetActiveLanguage("es")
440
441 // Default algorithm for ranking text.
442 algorithmDef := textrank.NewDefaultAlgorithm()
443
444 // Add text.
445 tr.Populate(rawText, language, rule)
446 // Run the ranking.
447 tr.Ranking(algorithmDef)
448
449 // Get all phrases by weight.
450 rankedPhrases := textrank.FindPhrases(tr)
451
452 // Most important phrase.
453 fmt.Println(rankedPhrases[0])
454 // Second important phrase.
455 fmt.Println(rankedPhrases[1])
456}
457```
458
459### Asynchronous usage by goroutines
460
461It is thread safe. Independent graphs can receive texts at the same time and can be extended with more text, also at the same time.
462
463```go
464package main
465
466import (
467 "fmt"
468 "time"
469
470 "github.com/DavidBelicza/TextRank/v2"
471)
472
473func main() {
474 // A flag when program has to stop.
475 stopProgram := false
476 // Channel.
477 stream := make(chan string)
478 // TextRank object.
479 tr := textrank.NewTextRank()
480
481 // Open new thread/routine
482 go func(tr *textrank.TextRank) {
483 // 3 texts.
484 rawTexts := []string{
485 "Very long text...",
486 "Another very long text...",
487 "Second another very long text...",
488 }
489
490 // Add 3 texts to the stream channel, one by one.
491 for _, rawText := range rawTexts {
492 stream <- rawText
493 }
494 }(tr)
495
496 // Open new thread/routine
497 go func() {
498 // Counter how many times texts added to the ranking.
499 i := 1
500
501 for {
502 // Get text from stream channel when it got a new one.
503 rawText := <-stream
504
505 // Default Rule for parsing.
506 rule := textrank.NewDefaultRule()
507 // Default Language for filtering stop words.
508 language := textrank.NewDefaultLanguage()
509 // Default algorithm for ranking text.
510 algorithm := textrank.NewDefaultAlgorithm()
511
512 // Add text.
513 tr.Populate(rawText, language, rule)
514 // Run the ranking.
515 tr.Ranking(algorithm)
516
517 // Set stopProgram flag to true when all 3 text have been added.
518 if i == 3 {
519 stopProgram = true
520 }
521
522 i++
523 }
524 }()
525
526 // The main thread has to run while go-routines run. When stopProgram is
527 // true then the loop has to finish.
528 for !stopProgram {
529 time.Sleep(time.Second * 1)
530 }
531
532 // Get all phrases by weight.
533 phrases := textrank.FindPhrases(tr)
534 // Most important phrase.
535 fmt.Println(phrases[0])
536}
537```
538
539## A SIMPLE VISUAL REPRESENTATION
540
541The image below shows how the simplest text ranking algorithm works. This algorithm can be replaced with another one by injecting a different Algorithm interface implementation.
542
543<img src="https://i.imgur.com/RUdDfBz.jpg" />
diff --git a/vendor/github.com/DavidBelicza/TextRank/v2/convert/builder.go b/vendor/github.com/DavidBelicza/TextRank/v2/convert/builder.go
new file mode 100644
index 0000000..db94cfc
--- /dev/null
+++ b/vendor/github.com/DavidBelicza/TextRank/v2/convert/builder.go
@@ -0,0 +1,43 @@
1package convert
2
3import (
4 "github.com/DavidBelicza/TextRank/v2/parse"
5 "github.com/DavidBelicza/TextRank/v2/rank"
6)
7
8// TextToRank function converts a ParsedSentence object to Rank object, it is
9// the preparing process to later text ranking.
10func TextToRank(sentence parse.ParsedSentence, lang Language, ranks *rank.Rank) {
11 sentenceId := addSentence(ranks, sentence)
12 addWord(ranks, sentence.GetWords(), lang, sentenceId)
13}
14
15func addWord(ranks *rank.Rank, words []string, lang Language, sentenceID int) {
16 prevWordID := -1
17 var curWordID int
18
19 for _, word := range words {
20 if !lang.IsStopWord(word) {
21 if found, rootWord := lang.FindRootWord(word); found {
22 word = rootWord
23 }
24
25 if !ranks.IsWordExist(word) {
26 curWordID = ranks.AddNewWord(word, prevWordID, sentenceID)
27 } else {
28 curWordID = ranks.UpdateWord(word, prevWordID, sentenceID)
29 }
30
31 ranks.Relation.AddRelation(curWordID, prevWordID, sentenceID)
32 ranks.UpdateRightConnection(prevWordID, curWordID)
33
34 prevWordID = curWordID
35 }
36 }
37}
38
39func addSentence(ranks *rank.Rank, sentence parse.ParsedSentence) int {
40 ranks.SentenceMap[len(ranks.SentenceMap)] = sentence.GetOriginal()
41
42 return len(ranks.SentenceMap) - 1
43}
diff --git a/vendor/github.com/DavidBelicza/TextRank/v2/convert/language.go b/vendor/github.com/DavidBelicza/TextRank/v2/convert/language.go
new file mode 100644
index 0000000..fdad698
--- /dev/null
+++ b/vendor/github.com/DavidBelicza/TextRank/v2/convert/language.go
@@ -0,0 +1,71 @@
1package convert
2
3import "unicode/utf8"
4
// Language describes the language specific features used while text is
// converted. Because it is an interface, custom implementations can be used
// polymorphically wherever a Language is accepted.
type Language interface {
	// IsStopWord reports whether the given word is a stop word.
	IsStopWord(word string) bool
	// FindRootWord returns true and the root word when a root of the given
	// word has been found.
	FindRootWord(word string) (bool, string)
	// SetActiveLanguage switches to the language with the given code.
	SetActiveLanguage(code string)
	// SetWords sets the stop words that belong to the given language code.
	SetWords(code string, words []string)
}
13
// LanguageDefault is the built-in implementation of the Language interface.
// It keeps the stop words of every loaded language and looks tokens up in the
// list of the active one.
type LanguageDefault struct {
	// defaultLang is the code of the active language.
	defaultLang string
	// languages holds the stop words, keyed by language code.
	languages map[string][]string
}
20
21// NewLanguage constructor of the LanguageDefault Retrieves a pointer
22// LanguageDefault. It has setup to English by default.
23func NewLanguage() *LanguageDefault {
24 lang := &LanguageDefault{
25 "en",
26 make(map[string][]string),
27 }
28
29 words := getDefaultEnglish()
30
31 lang.SetWords("en", words)
32
33 return lang
34}
35
36// IsStopWord method retrieves true when the given word is in the stop word
37// list or when the word has less character then 2.
38func (lang *LanguageDefault) IsStopWord(word string) bool {
39 if utf8.RuneCountInString(word) <= 2 {
40 return true
41 }
42
43 if stopWords, ok := lang.languages[lang.defaultLang]; ok {
44 for _, val := range stopWords {
45 if val == word {
46 return true
47 }
48 }
49 }
50
51 return false
52}
53
54// FindRootWord method gets a word as an input, "apples" for example and it
55// retrieves the root-word of this given word, "apple" for example. The first
56// return parameter is true when a word-root has found, otherwise it's false.
57func (lang *LanguageDefault) FindRootWord(word string) (bool, string) {
58 return false, ""
59}
60
61// SetActiveLanguage method switch between languages by the language's code. The
62// language code is not standard, it can be anything.
63func (lang *LanguageDefault) SetActiveLanguage(code string) {
64 lang.defaultLang = code
65}
66
67// SetWords method set stop words into the LanguageDefault struct by the
68// language's code.
69func (lang *LanguageDefault) SetWords(code string, words []string) {
70 lang.languages[code] = words
71}
diff --git a/vendor/github.com/DavidBelicza/TextRank/v2/convert/stop_word.go b/vendor/github.com/DavidBelicza/TextRank/v2/convert/stop_word.go
new file mode 100644
index 0000000..8977a2d
--- /dev/null
+++ b/vendor/github.com/DavidBelicza/TextRank/v2/convert/stop_word.go
@@ -0,0 +1,332 @@
1package convert
2
// getDefaultEnglish returns the built-in English stop word list. Every word
// appears once, in alphabetical order. A new slice is built on each call.
func getDefaultEnglish() []string {
	return []string{
		"a", "about", "above", "across", "after", "afterwards", "again",
		"against", "all", "almost", "alone", "along", "already", "also",
		"although", "always", "am", "among", "amongst", "amount", "an", "and",
		"another", "any", "anyhow", "anyone", "anything", "anyway", "anywhere",
		"are", "around", "as", "at",
		"back", "be", "became", "because", "become", "becomes", "becoming",
		"been", "before", "beforehand", "behind", "being", "below", "beside",
		"besides", "between", "beyond", "bill", "both", "bottom", "but", "by",
		"call", "can", "cannot", "cant", "co", "con", "could", "couldn't",
		"cry",
		"de", "describe", "detail", "did", "didn't", "do", "does", "doesn't",
		"done", "don't", "down", "due", "during",
		"each", "eg", "eight", "either", "eleven", "else", "elsewhere", "empty",
		"enough", "etc", "even", "ever", "every", "everyone", "everything",
		"everywhere", "except",
		"few", "fifteen", "fifty", "fill", "find", "fire", "first", "five",
		"for", "former", "formerly", "forty", "found", "four", "from", "front",
		"full", "further",
		"get", "give", "go",
		"had", "has", "hasnt", "have", "he", "hence", "her", "here",
		"hereafter", "hereby", "herein", "hereupon", "hers", "herself", "him",
		"himself", "his", "how", "however", "hundred",
		"i", "ie", "if", "in", "inc", "indeed", "interest", "into", "is", "it",
		"its", "itself",
		"keep",
		"last", "latter", "latterly", "least", "less", "ltd",
		"made", "many", "may", "me", "meanwhile", "might", "mill", "mine",
		"more", "moreover", "most", "mostly", "move", "much", "must", "my",
		"myself",
		"name", "namely", "neither", "never", "nevertheless", "next", "nine",
		"no", "nobody", "none", "noone", "nor", "not", "nothing", "now",
		"nowhere",
		"of", "off", "often", "oh", "on", "once", "one", "only", "onto", "or",
		"other", "others", "otherwise", "our", "ours", "ourselves", "out",
		"over", "own",
		"part", "per", "perhaps", "please", "put",
		"rather", "re",
		"same", "see", "seem", "seemed", "seeming", "seems", "serious",
		"several", "she", "should", "show", "side", "since", "sincere", "six",
		"sixty", "so", "some", "somehow", "someone", "something", "sometime",
		"sometimes", "somewhere", "still", "such", "system",
		"take", "ten", "than", "that", "the", "their", "them", "themselves",
		"then", "thence", "there", "thereafter", "thereby", "therefore",
		"therein", "thereupon", "these", "they", "thick", "thin", "third",
		"this", "those", "though", "three", "through", "throughout", "thru",
		"thus", "to", "together", "too", "top", "toward", "towards", "twelve",
		"twenty", "two",
		"un", "under", "until", "up", "upon", "us",
		"very", "via",
		"was", "we", "well", "were", "what", "whatever", "when", "whence",
		"whenever", "where", "whereafter", "whereas", "whereby", "wherein",
		"whereupon", "wherever", "whether", "which", "while", "whither", "who",
		"whoever", "whole", "whom", "whose", "why", "will", "with", "within",
		"without", "would",
		"yes", "yet", "you", "your", "yours", "yourself", "yourselves",
	}
}
diff --git a/vendor/github.com/DavidBelicza/TextRank/v2/doc.go b/vendor/github.com/DavidBelicza/TextRank/v2/doc.go
new file mode 100644
index 0000000..51c8cc6
--- /dev/null
+++ b/vendor/github.com/DavidBelicza/TextRank/v2/doc.go
@@ -0,0 +1,445 @@
1/*
2Package textrank is an implementation of Text Rank algorithm in Go with
3extendable features (automatic summarization, phrase extraction). It supports
4multithreading by goroutines. The package is under The MIT Licence.
5
6MOTIVATION
7
8If only there were a program that could rank the words, phrases and sentences
9of book-sized texts continuously on multiple threads, that was open to
10modification through objects, written in a simple, secure, static language,
11and very well documented... Now, here it is.
12
13FEATURES
14
15- Find the most important phrases.
16- Find the most important words.
17- Find the most important N sentences.
18- Importance by phrase weights.
19- Importance by word occurrence.
20- Find the first N sentences, start from Xth sentence.
21- Find sentences by phrase chains ordered by position in text.
22- Access to the whole ranked data.
23- Support more languages.
24- Algorithm for weighting can be modified by interface implementation.
25- Parser can be modified by interface implementation.
26- Multi thread support.
27
28EXAMPLES
29
30Find the most important phrases:
31
32This is the most basic and simplest usage of textrank.
33
34 package main
35
36 import (
37 "fmt"
38
39 "github.com/DavidBelicza/TextRank/v2"
40 )
41
42 func main() {
43 rawText := "Your long raw text, it could be a book. Lorem ipsum..."
44 // TextRank object
45 tr := textrank.NewTextRank()
46 // Default Rule for parsing.
47 rule := textrank.NewDefaultRule()
48 // Default Language for filtering stop words.
49 language := textrank.NewDefaultLanguage()
50 // Default algorithm for ranking text.
51 algorithmDef := textrank.NewDefaultAlgorithm()
52
53 // Add text.
54 tr.Populate(rawText, language, rule)
55 // Run the ranking.
56 tr.Ranking(algorithmDef)
57
58 // Get all phrases by weight.
59 rankedPhrases := textrank.FindPhrases(tr)
60
61 // Most important phrase.
62 fmt.Println(rankedPhrases[0])
63 // Second important phrase.
64 fmt.Println(rankedPhrases[1])
65 }
66
67All possible pre-defined finder queries:
68
69After ranking, the graph contains a lot of valuable data. The textrank package
70provides functions that contain the logic to retrieve this data from the graph.
71
72 package main
73
74 import (
75 "fmt"
76
77 "github.com/DavidBelicza/TextRank/v2"
78 )
79
80 func main() {
81 rawText := "Your long raw text, it could be a book. Lorem ipsum..."
82 // TextRank object
83 tr := textrank.NewTextRank()
84 // Default Rule for parsing.
85 rule := textrank.NewDefaultRule()
86 // Default Language for filtering stop words.
87 language := textrank.NewDefaultLanguage()
88 // Default algorithm for ranking text.
89 algorithmDef := textrank.NewDefaultAlgorithm()
90
91 // Add text.
92 tr.Populate(rawText, language, rule)
93 // Run the ranking.
94 tr.Ranking(algorithmDef)
95
96 // Get all phrases order by weight.
97 rankedPhrases := textrank.FindPhrases(tr)
98 // Most important phrase.
99 fmt.Println(rankedPhrases[0])
100
101 // Get all words order by weight.
102 words := textrank.FindSingleWords(tr)
103 // Most important word.
104 fmt.Println(words[0])
105
106 // Get the most important 10 sentences. Importance by phrase weights.
107 sentences := textrank.FindSentencesByRelationWeight(tr, 10)
108 // Found sentences
109 fmt.Println(sentences)
110
111 // Get the most important 10 sentences. Importance by word occurrence.
112 sentences = textrank.FindSentencesByWordQtyWeight(tr, 10)
113 // Found sentences
114 fmt.Println(sentences)
115
116 // Get the first 10 sentences, start from 5th sentence.
117 sentences = textrank.FindSentencesFrom(tr, 5, 10)
118 // Found sentences
119 fmt.Println(sentences)
120
121 // Get sentences by phrase/word chains order by position in text.
122 sentencesPh := textrank.FindSentencesByPhraseChain(tr, []string{"gnome", "shell", "extension"})
123 // Found sentence.
124 fmt.Println(sentencesPh[0])
125 }
126
127Access to everything
128
129After ranking, the graph contains a lot of valuable data. The GetRank function
130allows access to the graph and every data can be retrieved from this structure.
131
132 package main
133
134 import (
135 "fmt"
136
137 "github.com/DavidBelicza/TextRank/v2"
138 )
139
140 func main() {
141 rawText := "Your long raw text, it could be a book. Lorem ipsum..."
142 // TextRank object
143 tr := textrank.NewTextRank()
144 // Default Rule for parsing.
145 rule := textrank.NewDefaultRule()
146 // Default Language for filtering stop words.
147 language := textrank.NewDefaultLanguage()
148 // Default algorithm for ranking text.
149 algorithmDef := textrank.NewDefaultAlgorithm()
150
151 // Add text.
152 tr.Populate(rawText, language, rule)
153 // Run the ranking.
154 tr.Ranking(algorithmDef)
155
156 // Get the rank graph.
157 rankData := tr.GetRankData()
158
159 // Get word ID by token/word.
160 wordId := rankData.WordValID["gnome"]
161
162 // Word's weight.
163 fmt.Println(rankData.Words[wordId].Weight)
164 // Word's quantity/occurrence.
165 fmt.Println(rankData.Words[wordId].Qty)
166 // All sentences that contain this word.
167 fmt.Println(rankData.Words[wordId].SentenceIDs)
168 // All other words what are related to this word on left side.
169 fmt.Println(rankData.Words[wordId].ConnectionLeft)
170 // All other words what are related to this word on right side.
171 fmt.Println(rankData.Words[wordId].ConnectionRight)
172 // The node of this word, it contains the related words and the
173 // relation weight.
174 fmt.Println(rankData.Relation.Node[wordId])
175 }
176
177Adding text continuously:
178
179It is possible to add more text after other texts have already been added. The
180Ranking function can merge these multiple texts and it can recalculate the
181weights and all related data.
182
183 package main
184
185 import (
186 "fmt"
187
188 "github.com/DavidBelicza/TextRank/v2"
189 )
190
191 func main() {
192 rawText := "Your long raw text, it could be a book. Lorem ipsum..."
193 // TextRank object
194 tr := textrank.NewTextRank()
195 // Default Rule for parsing.
196 rule := textrank.NewDefaultRule()
197 // Default Language for filtering stop words.
198 language := textrank.NewDefaultLanguage()
199 // Default algorithm for ranking text.
200 algorithmDef := textrank.NewDefaultAlgorithm()
201
202 // Add text.
203 tr.Populate(rawText, language, rule)
204 // Run the ranking.
205 tr.Ranking(algorithmDef)
206
207 rawText2 := "Another book or article..."
208 rawText3 := "Third another book or article..."
209
210 // Add text to the previously added text.
211 tr.Populate(rawText2, language, rule)
212 // Add text to the previously added text.
213 tr.Populate(rawText3, language, rule)
214 // Run the ranking to the whole composed text.
215 tr.Ranking(algorithmDef)
216
217 // Get all phrases by weight.
218 rankedPhrases := textrank.FindPhrases(tr)
219
220 // Most important phrase.
221 fmt.Println(rankedPhrases[0])
222 // Second important phrase.
223 fmt.Println(rankedPhrases[1])
224 }
225
226Using different algorithm to ranking text:
227
228Two algorithms have been implemented. It is also possible to write a custom
229algorithm through the Algorithm interface and use it instead of the defaults.
230
231 package main
232
233 import (
234 "fmt"
235
236 "github.com/DavidBelicza/TextRank/v2"
237 )
238
239 func main() {
240 rawText := "Your long raw text, it could be a book. Lorem ipsum..."
241 // TextRank object
242 tr := textrank.NewTextRank()
243 // Default Rule for parsing.
244 rule := textrank.NewDefaultRule()
245 // Default Language for filtering stop words.
246 language := textrank.NewDefaultLanguage()
247 // Using a little bit more complex algorithm to ranking text.
248 algorithmChain := textrank.NewChainAlgorithm()
249
250 // Add text.
251 tr.Populate(rawText, language, rule)
252 // Run the ranking.
253 tr.Ranking(algorithmChain)
254
255 // Get all phrases by weight.
256 rankedPhrases := textrank.FindPhrases(tr)
257
258 // Most important phrase.
259 fmt.Println(rankedPhrases[0])
260 // Second important phrase.
261 fmt.Println(rankedPhrases[1])
262 }
263
264Using multiple graphs:
265
266Each TextRank object holds its own graph, so it is possible to run multiple
267independent text ranking processes.
268
269 package main
270
271 import (
272 "fmt"
273
274 "github.com/DavidBelicza/TextRank/v2"
275 )
276
277 func main() {
278 rawText := "Your long raw text, it could be a book. Lorem ipsum..."
279 // 1st TextRank object
280 tr1 := textrank.NewTextRank()
281 // Default Rule for parsing.
282 rule := textrank.NewDefaultRule()
283 // Default Language for filtering stop words.
284 language := textrank.NewDefaultLanguage()
285 // Default algorithm for ranking text.
286 algorithmDef := textrank.NewDefaultAlgorithm()
287
288 // Add text.
289 tr1.Populate(rawText, language, rule)
290 // Run the ranking.
291 tr1.Ranking(algorithmDef)
292
293 // 2nd TextRank object
294 tr2 := textrank.NewTextRank()
295
296 // Using a little bit more complex algorithm to ranking text.
297 algorithmChain := textrank.NewChainAlgorithm()
298
299 // Add text to the second graph.
300 tr2.Populate(rawText, language, rule)
301 // Run the ranking on the second graph.
302 tr2.Ranking(algorithmChain)
303
304 // Get all phrases by weight from first graph.
305 rankedPhrases := textrank.FindPhrases(tr1)
306
307 // Most important phrase from first graph.
308 fmt.Println(rankedPhrases[0])
309 // Second important phrase from first graph.
310 fmt.Println(rankedPhrases[1])
311
312 // Get all phrases by weight from second graph.
313 rankedPhrases2 := textrank.FindPhrases(tr2)
314
315 // Most important phrase from second graph.
316 fmt.Println(rankedPhrases2[0])
317 // Second important phrase from second graph.
318 fmt.Println(rankedPhrases2[1])
319 }
320
321Using different non-English languages:
322
323English is used by default but it is possible to add any language. To use
324other languages a stop word list is required, which you can find here:
325https://github.com/stopwords-iso
326
327 package main
328
329 import (
330 "fmt"
331
332 "github.com/DavidBelicza/TextRank/v2"
333 )
334
335 func main() {
336 rawText := "Your long raw text, it could be a book. Lorem ipsum..."
337 // TextRank object
338 tr := textrank.NewTextRank()
339 // Default Rule for parsing.
340 rule := textrank.NewDefaultRule()
341 // Default Language for filtering stop words.
342 language := textrank.NewDefaultLanguage()
343
344 // Add Spanish stop words (just some example).
345 language.SetWords("es", []string{"uno", "dos", "tres", "yo", "es", "eres"})
346 // Active the Spanish.
347 language.SetActiveLanguage("es")
348
349 // Default algorithm for ranking text.
350 algorithmDef := textrank.NewDefaultAlgorithm()
351
352 // Add text.
353 tr.Populate(rawText, language, rule)
354 // Run the ranking.
355 tr.Ranking(algorithmDef)
356
357 // Get all phrases by weight.
358 rankedPhrases := textrank.FindPhrases(tr)
359
360 // Most important phrase.
361 fmt.Println(rankedPhrases[0])
362 // Second important phrase.
363 fmt.Println(rankedPhrases[1])
364 }
365
366Asynchronous usage by goroutines:
367
368It is thread safe. Independent graphs can receive texts at the same time and
369can also be extended with more text at the same time.
370
371 package main
372
373 import (
374 "fmt"
375 "time"
376
377 "github.com/DavidBelicza/TextRank/v2"
378 )
379
380 func main() {
381 // A flag when program has to stop.
382 stopProgram := false
383 // Channel.
384 stream := make(chan string)
385 // TextRank object.
386 tr := textrank.NewTextRank()
387
388 // Open new thread/routine
389 go func(tr *textrank.TextRank) {
390 // 3 texts.
391 rawTexts := []string{
392 "Very long text...",
393 "Another very long text...",
394 "Second another very long text...",
395 }
396
397 // Add 3 texts to the stream channel, one by one.
398 for _, rawText := range rawTexts {
399 stream <- rawText
400 }
401 }(tr)
402
403 // Open new thread/routine
404 go func() {
405 // Counter how many times texts added to the ranking.
406 i := 1
407
408 for {
409 // Get text from stream channel when it got a new one.
410 rawText := <-stream
411
412 // Default Rule for parsing.
413 rule := textrank.NewDefaultRule()
414 // Default Language for filtering stop words.
415 language := textrank.NewDefaultLanguage()
416 // Default algorithm for ranking text.
417 algorithm := textrank.NewDefaultAlgorithm()
418
419 // Add text.
420 tr.Populate(rawText, language, rule)
421 // Run the ranking.
422 tr.Ranking(algorithm)
423
424 // Set stopProgram flag to true when all 3 text have been added.
425 if i == 3 {
426 stopProgram = true
427 }
428
429 i++
430 }
431 }()
432
433 // The main thread has to keep running while the goroutines run. When
434 // stopProgram becomes true the loop finishes.
435 for !stopProgram {
436 time.Sleep(time.Second * 1)
437 }
438
439 // Get all phrases by weight.
440 phrases := textrank.FindPhrases(tr)
441 // Most important phrase.
442 fmt.Println(phrases[0])
443 }
444*/
445package textrank
diff --git a/vendor/github.com/DavidBelicza/TextRank/v2/install.example.sh b/vendor/github.com/DavidBelicza/TextRank/v2/install.example.sh
new file mode 100644
index 0000000..84e2d1b
--- /dev/null
+++ b/vendor/github.com/DavidBelicza/TextRank/v2/install.example.sh
@@ -0,0 +1,4 @@
#!/usr/bin/env bash
#
# Example install script: vendors the module dependencies and then runs the
# whole test suite.

# Abort on the first failing command (and on unset variables or failed pipes)
# so the tests never run against a half-populated vendor directory.
set -euo pipefail

go mod vendor
go test ./...
diff --git a/vendor/github.com/DavidBelicza/TextRank/v2/parse/rule.go b/vendor/github.com/DavidBelicza/TextRank/v2/parse/rule.go
new file mode 100644
index 0000000..0f6ec91
--- /dev/null
+++ b/vendor/github.com/DavidBelicza/TextRank/v2/parse/rule.go
@@ -0,0 +1,52 @@
1package parse
2
3// Rule is the interface that decides which characters separate words and
4// which separate sentences; any implementation can be handed to the tokenizer.
5type Rule interface {
6 IsWordSeparator(rune rune) bool
7 IsSentenceSeparator(rune rune) bool
8}
9
// RuleDefault is the stock implementation of the Rule interface. It stores
// fixed lists of separator characters and reports whether a given character
// is one of them.
type RuleDefault struct {
	wordSeparators     [21]string
	sentenceSeparators [3]string
}

// NewRule constructor retrieves a RuleDefault pointer loaded with the default
// word and sentence separators.
func NewRule() *RuleDefault {
	rule := new(RuleDefault)
	rule.wordSeparators = [21]string{" ", ",", "'", "’", "\"", ")", "(", "[", "]", "{", "}", "\"", ";", "\n", ">", "<", "%", "@", "&", "=", "#"}
	rule.sentenceSeparators = [3]string{"!", ".", "?"}

	return rule
}

// IsWordSeparator method retrieves true when the character separates two
// words from each other. Every sentence separator counts as a word separator
// as well.
func (r *RuleDefault) IsWordSeparator(rune rune) bool {
	if r.IsSentenceSeparator(rune) {
		return true
	}

	candidate := string(rune)

	for i := range r.wordSeparators {
		if r.wordSeparators[i] == candidate {
			return true
		}
	}

	return false
}

// IsSentenceSeparator method retrieves true when the character terminates a
// sentence.
func (r *RuleDefault) IsSentenceSeparator(rune rune) bool {
	candidate := string(rune)

	for i := range r.sentenceSeparators {
		if r.sentenceSeparators[i] == candidate {
			return true
		}
	}

	return false
}
diff --git a/vendor/github.com/DavidBelicza/TextRank/v2/parse/text.go b/vendor/github.com/DavidBelicza/TextRank/v2/parse/text.go
new file mode 100644
index 0000000..aab27c3
--- /dev/null
+++ b/vendor/github.com/DavidBelicza/TextRank/v2/parse/text.go
@@ -0,0 +1,44 @@
1package parse
2
// Text struct holds a parsed text as a list of parsed sentences.
type Text struct {
	parsedSentences []ParsedSentence
}

// ParsedSentence struct holds one original raw sentence together with the
// words that were found in it.
type ParsedSentence struct {
	original string
	words    []string
}

// Append method stores a sentence and its words in the Text object. A sentence
// without any words is ignored.
func (text *Text) Append(rawSentence string, words []string) {
	if len(words) == 0 {
		return
	}

	entry := ParsedSentence{original: rawSentence, words: words}
	text.parsedSentences = append(text.parsedSentences, entry)
}

// GetSentences method returns the ParsedSentence slice of the Text struct.
func (text *Text) GetSentences() []ParsedSentence {
	sentences := text.parsedSentences

	return sentences
}

// GetWords method returns the words string slice of the ParsedSentence struct.
func (parsedSentence *ParsedSentence) GetWords() []string {
	words := parsedSentence.words

	return words
}

// GetOriginal method returns the original sentence of the ParsedSentence
// struct as a string.
func (parsedSentence *ParsedSentence) GetOriginal() string {
	original := parsedSentence.original

	return original
}
diff --git a/vendor/github.com/DavidBelicza/TextRank/v2/parse/tokenizer.go b/vendor/github.com/DavidBelicza/TextRank/v2/parse/tokenizer.go
new file mode 100644
index 0000000..003460e
--- /dev/null
+++ b/vendor/github.com/DavidBelicza/TextRank/v2/parse/tokenizer.go
@@ -0,0 +1,63 @@
1package parse
2
3import (
4 "strings"
5)
6
7// TokenizeText function use the given raw text and parses by a Rule object and
8// retrieves the parsed text in a Text struct object.
9func TokenizeText(rawText string, rule Rule) Text {
10 return findSentences(rawText, rule)
11}
12
13func findSentences(rawText string, rule Rule) Text {
14 text := Text{}
15
16 var sentence string
17 var i int
18 slen := len(rawText)
19
20 for j, chr := range rawText {
21 j += len(string(chr))
22 //when separator or the last
23 if rule.IsSentenceSeparator(chr) || j == slen {
24 sentence = rawText[i:j]
25 if len(sentence) > 0 {
26 text.Append(sentence, findWords(sentence, rule))
27 }
28
29 sentence = ""
30 i = j
31 }
32 }
33
34 return text
35}
36
37func findWords(rawSentence string, rule Rule) (words []string) {
38 words = []string{}
39
40 var word string
41 var i int
42 slen := len(rawSentence)
43
44 for j, chr := range rawSentence {
45 chrlen := len(string(chr))
46 j += chrlen
47 //when separator or the last
48 if sep := rule.IsWordSeparator(chr); sep || j == slen {
49 if sep {
50 word = rawSentence[i : j-chrlen]
51 } else {
52 word = rawSentence[i:j]
53 }
54 if len(word) > 0 {
55 words = append(words, strings.ToLower(word))
56 }
57 word = ""
58 i = j
59 }
60 }
61
62 return
63}
diff --git a/vendor/github.com/DavidBelicza/TextRank/v2/rank/algorithm.go b/vendor/github.com/DavidBelicza/TextRank/v2/rank/algorithm.go
new file mode 100644
index 0000000..8f9345f
--- /dev/null
+++ b/vendor/github.com/DavidBelicza/TextRank/v2/rank/algorithm.go
@@ -0,0 +1,99 @@
1package rank
2
3import (
4 "math"
5)
6
7// Algorithm is the interface of the weighting process. Implementations decide
8// how a relation of two words and how a single word are weighted.
9type Algorithm interface {
10 WeightingRelation(
11 word1ID int,
12 word2ID int,
13 rank *Rank,
14 ) float32
15
16 WeightingHits(
17 wordID int,
18 rank *Rank,
19 ) float32
20}
21
22// AlgorithmDefault struct is the basic implementation of Algorithm. It can
23// weight a word or phrase by comparing them.
24type AlgorithmDefault struct{}
25
26// NewAlgorithmDefault constructor retrieves an AlgorithmDefault pointer.
27func NewAlgorithmDefault() *AlgorithmDefault {
28 return &AlgorithmDefault{}
29}
30
31// WeightingRelation method is the traditional algorithm of text rank to
32// weighting a phrase.
33func (a *AlgorithmDefault) WeightingRelation(
34 word1ID int,
35 word2ID int,
36 rank *Rank,
37) float32 {
38 relationQty := rank.Relation.Node[word1ID][word2ID].Qty
39
40 return float32(relationQty)
41}
42
43// WeightingHits method ranks the words by their occurrence.
44func (a *AlgorithmDefault) WeightingHits(
45 wordID int,
46 rank *Rank,
47) float32 {
48 weight := rank.Words[wordID].Qty
49
50 return float32(weight)
51}
52
53// AlgorithmChain struct is the combined implementation of Algorithm. It is a
54// good example how weighting can be changed by a different implementations. It
55// can weight a word or phrase by comparing them.
56type AlgorithmChain struct{}
57
58// NewAlgorithmChain constructor retrieves an AlgorithmChain pointer.
59func NewAlgorithmChain() *AlgorithmChain {
60 return &AlgorithmChain{}
61}
62
63// WeightingRelation method is a combined algorithm of text rank and word
64// occurrence, it weights a phrase.
65func (a *AlgorithmChain) WeightingRelation(
66 word1ID int,
67 word2ID int,
68 rank *Rank,
69) float32 {
70 relationQty := rank.Relation.Node[word1ID][word2ID].Qty
71 word1Qty := rank.Words[word1ID].Qty
72 word2Qty := rank.Words[word2ID].Qty
73
74 qDiff := float32(math.Abs(float64(word1Qty)-float64(word2Qty))) / 100
75 weight := float32(relationQty) + qDiff
76
77 return weight
78}
79
80// WeightingHits method ranks the words by their occurrence.
81func (a *AlgorithmChain) WeightingHits(
82 wordID int,
83 rank *Rank,
84) float32 {
85 word := rank.Words[wordID]
86 qty := 0
87
88 for leftWordID, leftWordQty := range word.ConnectionLeft {
89 qty += rank.Words[leftWordID].Qty * leftWordQty
90 }
91
92 for rightWordID, rightWordQty := range word.ConnectionRight {
93 qty += rank.Words[rightWordID].Qty * rightWordQty
94 }
95
96 weight := float32(word.Qty) + (float32(qty))
97
98 return float32(weight)
99}
diff --git a/vendor/github.com/DavidBelicza/TextRank/v2/rank/rank.go b/vendor/github.com/DavidBelicza/TextRank/v2/rank/rank.go
new file mode 100644
index 0000000..3bcef7c
--- /dev/null
+++ b/vendor/github.com/DavidBelicza/TextRank/v2/rank/rank.go
@@ -0,0 +1,147 @@
1package rank
2
3// Rank struct contains all original raw sentences, words, tokens, phrases,
4// indexes, word hits, phrase hits and minimum-maximum values.
5//
6// Max is the largest raw word weight found while ranking.
7//
8// Min is the smallest raw word weight found while ranking.
9//
10// Relation is the Relation object, it contains the phrases.
11//
12// SentenceMap contains raw sentences. Index is the sentence ID, value is the
13// sentence itself.
14//
15// Words contains Word objects. Index is the word ID, value is a pointer to
16// the Word.
17//
18// WordValID contains words. Index is the word/token, value is the ID.
19type Rank struct {
20 Max float32
21 Min float32
22 Relation Relation
23 SentenceMap map[int]string
24 Words map[int]*Word
25 WordValID map[string]int
26}
27
28// Word struct contains all data about a word.
29//
30// If a word occurs multiple times in the text then all occurrences point to
31// the same ID, so a Word is unique.
32//
33// SentenceIDs contains the IDs of all sentences that contain the word.
34//
35// ConnectionLeft contains all words that are connected to this word on the
36// left side. The map index is the ID of the related word and its value is the
37// occurrence.
38//
39// ConnectionRight contains all words that are connected to this word on the
40// right side. The map index is the ID of the related word and its value is the
41// occurrence.
42//
43// Token is the word itself, but not the original, it is tokenized.
44//
45// Qty is the number of occurrences of the word.
46//
47// Weight is the weight of the word between 0.00 and 1.00.
48type Word struct {
49 ID int
50 SentenceIDs []int
51 ConnectionLeft map[int]int
52 ConnectionRight map[int]int
53 Token string
54 Qty int
55 Weight float32
56}
57
58// NewRank constructor retrieves a Rank pointer.
59func NewRank() *Rank {
60 return &Rank{
61 0,
62 0,
63 Relation{
64 0,
65 0,
66 make(map[int]map[int]Score),
67 },
68 make(map[int]string),
69 make(map[int]*Word),
70 make(map[string]int),
71 }
72}
73
74// IsWordExist method retrieves true when the given word is already in the rank.
75func (rank *Rank) IsWordExist(word string) bool {
76 _, find := rank.WordValID[word]
77
78 return find
79}
80
81// AddNewWord method adds a new word to the rank object and it defines its ID.
82func (rank *Rank) AddNewWord(word string, prevWordIdx int, sentenceID int) (wordID int) {
83 wordID = len(rank.Words)
84 connectionLeft := make(map[int]int)
85
86 if prevWordIdx >= 0 {
87 connectionLeft[prevWordIdx] = 1
88 }
89
90 newWord := &Word{
91 ID: wordID,
92 SentenceIDs: []int{sentenceID},
93 ConnectionLeft: connectionLeft,
94 ConnectionRight: make(map[int]int),
95 Token: word,
96 Qty: 1,
97 Weight: 0,
98 }
99
100 rank.Words[wordID] = newWord
101 rank.WordValID[word] = wordID
102
103 return
104}
105
106// UpdateWord method update a word what already exists in the rank object. It
107// retrieves its ID.
108func (rank *Rank) UpdateWord(word string, prevWordIdx int, sentenceID int) (wordID int) {
109 wordID = rank.WordValID[word]
110
111 found := false
112
113 for _, oldSentenceID := range rank.Words[wordID].SentenceIDs {
114 if sentenceID == oldSentenceID {
115 found = true
116 break
117 }
118 }
119
120 if !found {
121 rank.Words[wordID].SentenceIDs = append(
122 rank.Words[wordID].SentenceIDs,
123 sentenceID,
124 )
125 }
126
127 rank.Words[wordID].Qty++
128
129 if prevWordIdx >= 0 {
130 rank.Words[wordID].ConnectionLeft[prevWordIdx]++
131 }
132
133 return
134}
135
136// UpdateRightConnection method adds the right connection to the word. It always
137// can be used after a word has added and the next word is known.
138func (rank *Rank) UpdateRightConnection(wordID int, rightWordID int) {
139 if wordID >= 0 {
140 rank.Words[wordID].ConnectionRight[rightWordID]++
141 }
142}
143
144// GetWordData method retrieves all words as a pointer.
145func (rank *Rank) GetWordData() map[int]*Word {
146 return rank.Words
147}
diff --git a/vendor/github.com/DavidBelicza/TextRank/v2/rank/ranking.go b/vendor/github.com/DavidBelicza/TextRank/v2/rank/ranking.go
new file mode 100644
index 0000000..5fd2dfa
--- /dev/null
+++ b/vendor/github.com/DavidBelicza/TextRank/v2/rank/ranking.go
@@ -0,0 +1,66 @@
1package rank
2
3// Calculate function ranking words by the given algorithm implementation.
4func Calculate(ranks *Rank, algorithm Algorithm) {
5 updateRanks(ranks, algorithm)
6}
7
8func updateRanks(ranks *Rank, algorithm Algorithm) {
9 for _, word := range ranks.Words {
10 weight := algorithm.WeightingHits(word.ID, ranks)
11 word.Weight = weight
12
13 if ranks.Max < word.Weight {
14 ranks.Max = word.Weight
15 }
16
17 if ranks.Min > word.Weight || ranks.Min == 0 {
18 ranks.Min = word.Weight
19 }
20 }
21
22 for _, word := range ranks.Words {
23 word.Weight = normalize(word.Weight, ranks.Min, ranks.Max)
24 }
25
26 for x, xMap := range ranks.Relation.Node {
27 for y := range xMap {
28 sentenceIDs := ranks.Relation.Node[x][y].SentenceIDs
29 weight := algorithm.WeightingRelation(x, y, ranks)
30
31 ranks.Relation.Node[x][y] = Score{
32 ranks.Relation.Node[x][y].Qty,
33 weight,
34 sentenceIDs,
35 }
36
37 if ranks.Relation.Max < weight {
38 ranks.Relation.Max = weight
39 }
40
41 if ranks.Relation.Min > weight || ranks.Relation.Min == 0 {
42 ranks.Relation.Min = weight
43 }
44 }
45 }
46
47 for x, xMap := range ranks.Relation.Node {
48 for y := range xMap {
49 weight := normalize(
50 ranks.Relation.Node[x][y].Weight,
51 ranks.Relation.Min,
52 ranks.Relation.Max,
53 )
54
55 ranks.Relation.Node[x][y] = Score{
56 ranks.Relation.Node[x][y].Qty,
57 weight,
58 ranks.Relation.Node[x][y].SentenceIDs,
59 }
60 }
61 }
62}
63
// normalize maps weight from the range [min, max] onto [0, 1] by min-max
// scaling.
//
// When max equals min the range is empty (a single word or phrase, or all
// weights equal) and the plain formula would divide zero by zero and return
// NaN, which cannot be sorted reliably or encoded as JSON. In that case the
// result is 0, the value the numerator already has.
func normalize(weight float32, min float32, max float32) float32 {
	if max == min {
		return 0
	}

	return (weight - min) / (max - min)
}
diff --git a/vendor/github.com/DavidBelicza/TextRank/v2/rank/relation.go b/vendor/github.com/DavidBelicza/TextRank/v2/rank/relation.go
new file mode 100644
index 0000000..cb8b97e
--- /dev/null
+++ b/vendor/github.com/DavidBelicza/TextRank/v2/rank/relation.go
@@ -0,0 +1,77 @@
1package rank
2
// Relation struct contains the phrase data.
//
// Max is the largest raw phrase weight found while ranking.
//
// Min is the smallest raw phrase weight found while ranking.
//
// Node contains the Scores. The first ID is word 1, the second ID is word 2
// and the value is the Score that contains the data about their relation. A
// pair of words is stored in one direction only.
type Relation struct {
	Max  float32
	Min  float32
	Node map[int]map[int]Score
}

// Score struct contains data about a relation of two words.
//
// Qty is the occurrence of the phrase.
//
// Weight is the weight of the phrase between 0.00 and 1.00.
//
// SentenceIDs contains the IDs of all sentences that contain the phrase, each
// ID once.
type Score struct {
	Qty         int
	Weight      float32
	SentenceIDs []int
}

// AddRelation method records that wordID and relatedWordID were seen next to
// each other in the sentence sentenceID. An existing relation (in either
// direction) is updated, otherwise a new one is stored. A relatedWordID of -1
// means "no related word" and is ignored.
func (relation *Relation) AddRelation(wordID int, relatedWordID int, sentenceID int) {
	if relatedWordID == -1 {
		return
	}

	if relation.updateRelation(relatedWordID, wordID, true, sentenceID) {
		return
	}

	if relation.extendRelation(wordID, relatedWordID, true, sentenceID) {
		return
	}

	relation.createRelation(wordID, relatedWordID, sentenceID)
}

// updateRelation increases the occurrence of the relation x-y when it exists
// and reports whether it did. With r set, the opposite direction y-x is tried
// as well.
//
// The sentence ID is stored only once per relation, the same way
// Rank.UpdateWord handles the sentence IDs of a word; a phrase repeated
// inside one sentence must not make that sentence count several times.
func (relation *Relation) updateRelation(x int, y int, r bool, sentenceID int) bool {
	if score, ok := relation.Node[x][y]; ok {
		score.Qty++

		known := false

		for _, id := range score.SentenceIDs {
			if id == sentenceID {
				known = true
				break
			}
		}

		if !known {
			score.SentenceIDs = append(score.SentenceIDs, sentenceID)
		}

		relation.Node[x][y] = score

		return true
	}

	if r {
		return relation.updateRelation(y, x, false, sentenceID)
	}

	return false
}

// extendRelation stores the new relation x-y under the already existing node
// x and reports whether that node existed. With r set, the node y is tried as
// well (storing y-x).
func (relation *Relation) extendRelation(x int, y int, r bool, sentenceID int) bool {
	if _, ok := relation.Node[x]; ok {
		relation.Node[x][y] = Score{1, 0, []int{sentenceID}}

		return true
	}

	if r {
		return relation.extendRelation(y, x, false, sentenceID)
	}

	return false
}

// createRelation creates the node x and stores the new relation x-y in it.
func (relation *Relation) createRelation(x int, y int, sentenceID int) {
	relation.Node[x] = map[int]Score{}
	relation.Node[x][y] = Score{1, 0, []int{sentenceID}}
}
diff --git a/vendor/github.com/DavidBelicza/TextRank/v2/rank/sorting.go b/vendor/github.com/DavidBelicza/TextRank/v2/rank/sorting.go
new file mode 100644
index 0000000..6d00a97
--- /dev/null
+++ b/vendor/github.com/DavidBelicza/TextRank/v2/rank/sorting.go
@@ -0,0 +1,202 @@
1package rank
2
3import (
4 "sort"
5)
6
7// Phrase struct contains a single phrase (a pair of words) and its data.
8//
9// LeftID is the ID of word 1.
10//
11// RightID is the ID of word 2.
12//
13// Left is the token of word 1.
14//
15// Right is the token of word 2.
16//
17// Weight is the weight of the phrase between 0.00 and 1.00.
18//
19// Qty is the occurrence of the phrase.
20type Phrase struct {
21 LeftID int
22 RightID int
23 Left string
24 Right string
25 Weight float32
26 Qty int
27}
28
29// FindPhrases function has wrapper textrank.FindPhrases. Use the wrapper
30// instead.
31func FindPhrases(ranks *Rank) []Phrase {
32 var phrases []Phrase
33
34 for x, xMap := range ranks.Relation.Node {
35 for y := range xMap {
36 phrases = append(phrases, Phrase{
37 ranks.Words[x].ID,
38 ranks.Words[y].ID,
39 ranks.Words[x].Token,
40 ranks.Words[y].Token,
41 ranks.Relation.Node[x][y].Weight,
42 ranks.Relation.Node[x][y].Qty,
43 })
44 }
45 }
46
47 sort.Slice(phrases, func(i, j int) bool {
48 return phrases[i].Weight > phrases[j].Weight
49 })
50
51 return phrases
52}
53
54// SingleWord struct contains a single word and its data.
55//
56// ID is the ID of the word.
57//
58// Word is the word itself, the token.
59//
60// Weight is the weight of the word between 0.00 and 1.00.
61//
62// Qty is the occurrence of the word.
63type SingleWord struct {
64 ID int
65 Word string
66 Weight float32
67 Qty int
68}
69
70// FindSingleWords function has wrapper textrank.FindSingleWords. Use the
71// wrapper instead.
72func FindSingleWords(ranks *Rank) []SingleWord {
73 var singleWords []SingleWord
74
75 for _, word := range ranks.Words {
76 singleWords = append(singleWords, SingleWord{
77 word.ID,
78 word.Token,
79 word.Weight,
80 word.Qty,
81 })
82 }
83
84 sort.Slice(singleWords, func(i, j int) bool {
85 return singleWords[i].Weight > singleWords[j].Weight
86 })
87
88 return singleWords
89}
90
91// Sentence struct contains a single sentence: its ID and its raw text.
92type Sentence struct {
93 ID int
94 Value string
95}
96
// Kinds of filtering accepted by FindSentences.
const (
	// ByQty filter by occurrence of word.
	ByQty = 0

	// ByRelation filter by phrase weight.
	ByRelation = 1
)
102
103// FindSentences function has wrappers textrank.FindSentencesByRelationWeight
104// and textrank.FindSentencesByWordQtyWeight. Use the wrappers instead.
105func FindSentences(ranks *Rank, kind int, limit int) []Sentence {
106 var sentences []Sentence
107
108 cache := make(map[int]bool)
109
110 collect := func(sentenceIDs []int) bool {
111 for _, id := range sentenceIDs {
112 if len(sentences) >= limit {
113 return true
114 }
115
116 if !cache[id] {
117 sentences = append(sentences, Sentence{id, ranks.SentenceMap[id]})
118 cache[id] = true
119 }
120 }
121
122 return false
123 }
124
125 if kind == ByQty {
126 singleWords := FindSingleWords(ranks)
127
128 for _, singleWord := range singleWords {
129 sentenceIDs := ranks.Words[singleWord.ID].SentenceIDs
130
131 if collect(sentenceIDs) {
132 return sentences
133 }
134 }
135 } else if kind == ByRelation {
136 phrases := FindPhrases(ranks)
137
138 for _, phrase := range phrases {
139 sentenceIDs := ranks.Relation.Node[phrase.LeftID][phrase.RightID].SentenceIDs
140
141 if collect(sentenceIDs) {
142 return sentences
143 }
144 }
145 }
146
147 return sentences
148}
149
150// FindSentencesByPhrases function has wrapper
151// textrank.FindSentencesByPhraseChain. Use the wrapper instead.
152func FindSentencesByPhrases(ranks *Rank, words []string) []Sentence {
153 var sentences []Sentence
154
155 reqMatch := len(words) - 1
156 sentenceIDs := make(map[int]int)
157
158 for _, i := range words {
159 for _, j := range words {
160 x := ranks.WordValID[i]
161 y := ranks.WordValID[j]
162
163 if _, ok := ranks.Relation.Node[x][y]; ok {
164 curSentenceIDs := ranks.Relation.Node[x][y].SentenceIDs
165
166 for _, id := range curSentenceIDs {
167 if _, ok := sentenceIDs[id]; ok {
168 sentenceIDs[id]++
169 } else {
170 sentenceIDs[id] = 1
171 }
172 }
173 }
174 }
175 }
176
177 for sentenceID, v := range sentenceIDs {
178 if v >= reqMatch {
179 sentences = append(sentences, Sentence{sentenceID, ranks.SentenceMap[sentenceID]})
180 }
181 }
182
183 sort.Slice(sentences, func(i, j int) bool {
184 return sentences[i].ID < sentences[j].ID
185 })
186
187 return sentences
188}
189
190// FindSentencesFrom function has wrapper textrank.FindSentencesFrom. Use the
191// wrapper instead.
192func FindSentencesFrom(ranks *Rank, id int, limit int) []Sentence {
193 var sentences []Sentence
194
195 limit = id + limit - 1
196
197 for i := id; i <= limit; i++ {
198 sentences = append(sentences, Sentence{i, ranks.SentenceMap[i]})
199 }
200
201 return sentences
202}
diff --git a/vendor/github.com/DavidBelicza/TextRank/v2/textrank.go b/vendor/github.com/DavidBelicza/TextRank/v2/textrank.go
new file mode 100644
index 0000000..ed48ce3
--- /dev/null
+++ b/vendor/github.com/DavidBelicza/TextRank/v2/textrank.go
@@ -0,0 +1,194 @@
1package textrank
2
3import (
4 "github.com/DavidBelicza/TextRank/v2/convert"
5 "github.com/DavidBelicza/TextRank/v2/parse"
6 "github.com/DavidBelicza/TextRank/v2/rank"
7)
8
9// TextRank structure contains the Rank data object. This structure is a wrapper
10// around the whole text ranking functionality.
11type TextRank struct {
12 rank *rank.Rank
13}
14
15// NewTextRank constructor retrieves a TextRank pointer. This is the 1th step to
16// use TextRank.
17func NewTextRank() *TextRank {
18 return &TextRank{
19 rank.NewRank(),
20 }
21}
22
23// NewDefaultRule function retrieves a default Rule object what works in the
24// most cases in English or similar Latin languages like French or Spanish. The
25// Rule defines raw text how should be split to sentences and words. Because
26// Rule is an interface it's possible modify the ranking by inject different
27// Rule implementation. This is the 2nd step to use TextRank.
28func NewDefaultRule() *parse.RuleDefault {
29 return parse.NewRule()
30}
31
32// NewDefaultLanguage function retrieves a default Language object. It defines
33// what words are real and what words are just Stop Words or useless Junk Words.
34// It uses the default English Stop Words, but it's possible to set different
35// Stop Words in English or any other languages. Because Language is an
36// interface it's possible to modify the ranking by inject different Language
37// implementation. This is the 3rd step to use TextRank.
38func NewDefaultLanguage() *convert.LanguageDefault {
39 return convert.NewLanguage()
40}
41
42// NewDefaultAlgorithm function retrieves an Algorithm object. It defines how
43// should work the text ranking algorithm, the weighting. This is the general
44// text rank by weighting the connection between the words to find the strongest
45// phrases. Because Algorithm is an interface it's possible to modify the
46// ranking algorithm by inject different implementation. This is the 4th step to
47// use TextRank.
48func NewDefaultAlgorithm() *rank.AlgorithmDefault {
49 return rank.NewAlgorithmDefault()
50}
51
52// NewChainAlgorithm function retrieves an Algorithm object. It defines how
53// should work the text ranking algorithm, the weighting. This is an alternative
54// way to ranking words by weighting the number of the words. Because Algorithm
55// is an interface it's possible to modify the ranking algorithm by inject
56// different implementation. This is the 4th step to use TextRank.
57func NewChainAlgorithm() *rank.AlgorithmChain {
58 return rank.NewAlgorithmChain()
59}
60
61// Populate method adds a raw text to the text-ranking graph. It parses,
62// tokenize the raw text and prepares it to weighting and scoring. It's possible
63// to append a new raw text to an existing one even if the previously text is
64// already ranked. This is 5th step to use TextRank.
65//
66// text string must be a plain text from TXT or PDF or any document, it can
67// contain new lines, break lines or any unnecessary text parts, but it should
68// not contain HTML tags or codes.
69//
70// lang Language object can be loaded from NewDefaultLanguage function.
71//
72// rule Rule object can be loaded from NewDefaultRule function.
73func (textRank *TextRank) Populate(
74 text string,
75 lang convert.Language,
76 rule parse.Rule,
77) {
78 parsedText := parse.TokenizeText(text, rule)
79
80 for _, sentence := range parsedText.GetSentences() {
81 convert.TextToRank(sentence, lang, textRank.rank)
82 }
83}
84
85// Ranking method counts the words and connections between the words, then it
86// weights the numbers then normalize them in type float32 between 0.00 and
87// 1.00. This is the 6th step to use TextRank.
88//
89// algorithm Algorithm is the object of the weighting and scoring methods.
90func (textRank *TextRank) Ranking(algorithm rank.Algorithm) {
91 rank.Calculate(textRank.rank, algorithm)
92}
93
94// GetRankData method retrieves the Rank data to that case if the developer want
95// access to the whole graph and sentences, words, weights and all of the data
96// to analyze it or just implement a new search logic or finder method.
97func (textRank *TextRank) GetRankData() *rank.Rank {
98 return textRank.rank
99}
100
101// FindPhrases function retrieves a slice of Phrase structures by TextRank
102// object. The return value contains the sorted phrases with IDs, words, weights
103// and quantities by weight from 1 to 0. Weight is calculated from quantities of
104// relation between two words. A single phrase is from two words - not less and
105// more. (But it's possible to find chain of phrases by
106// FindSentencesByPhraseChain function.)
107func FindPhrases(textRank *TextRank) []rank.Phrase {
108 return rank.FindPhrases(textRank.rank)
109}
110
111// FindSingleWords function retrieves a slice of SingleWord structures by
112// TextRank object. The return value contains the sorted words with IDs, words,
113// weights and quantities by weight from 1 to 0. Weight is calculated from
114// quantities of word.
115func FindSingleWords(textRank *TextRank) []rank.SingleWord {
116 return rank.FindSingleWords(textRank.rank)
117}
118
119// FindSentencesByRelationWeight function retrieves a slice of Sentence
120// structures by TextRank object. The return value contains the ID of the
121// sentence and the sentence text itself. The slice is sorted by weight of
122// phrases from 1 to 0.
123func FindSentencesByRelationWeight(
124 textRank *TextRank,
125 limit int,
126) []rank.Sentence {
127
128 return rank.FindSentences(textRank.rank, rank.ByRelation, limit)
129}
130
131// FindSentencesByWordQtyWeight function retrieves a slice of Sentence
132// structures by TextRank object. The return value contains the ID of the
133// sentence and the sentence text itself. The slice is sorted by weight of word
134// quantities from 1 to 0.
135func FindSentencesByWordQtyWeight(
136 textRank *TextRank,
137 limit int,
138) []rank.Sentence {
139
140 return rank.FindSentences(textRank.rank, rank.ByQty, limit)
141}
142
143// FindSentencesByPhraseChain function retrieves a slice of Sentence structures
144// by TextRank object and slice of phrases. The return value contains the ID of
145// the sentence and the sentence text itself. The slice is sorted by weight of
146// word quantities from 1 to 0.
147//
148// textRank TextRank is the object of the TextRank.
149//
150// phrases []string is a slice of phrases. A single phrase is from two words, so
151// when the slice contains 3 words the inner method will search for two phrases.
152// The search algorithm seeks for "len(phrases)!". In case of three item the
153// possible combination is 3 factorial (3!) = 3 * 2 * 1.
154//
155// rawText := "Long raw text, lorem ipsum..."
156// rule := NewDefaultRule()
157// language := NewDefaultLanguage()
158// algorithm := NewDefaultAlgorithm()
159//
160// Append(rawText, language, rule, 1)
161// Ranking(1, algorithm)
162//
163// FindSentencesByPhraseChain(1, []string{
164// "captain",
165// "james",
166// "kirk",
167// })
168//
169// The above code searches for captain james kirk, captain kirk james, james
170// kirk captain, james captain kirk, kirk james captain and james kirk captain
171// combinations in the graph. The 3 of words have to be related to each other
172// in the same sentence but the search algorithm ignores the stop words. So if
173// there is a sentence "James Kirk is the Captain of the Enterprise." the
174// sentence will be returned because the words "is" and "the" are stop words.
175func FindSentencesByPhraseChain(
176 textRank *TextRank,
177 phrases []string,
178) []rank.Sentence {
179
180 return rank.FindSentencesByPhrases(textRank.rank, phrases)
181}
182
183// FindSentencesFrom function retrieves a slice of Sentence structures by
184// TextRank object and by ID of the sentence. The return value contains the
185// sentence text itself. The returned slice contains sentences sorted by their
186// IDs started from the given sentence ID in ascending sort.
187func FindSentencesFrom(
188 textRank *TextRank,
189 sentenceID int,
190 limit int,
191) []rank.Sentence {
192
193 return rank.FindSentencesFrom(textRank.rank, sentenceID, limit)
194}