1/*
2Package textrank is an implementation of Text Rank algorithm in Go with
3extendable features (automatic summarization, phrase extraction). It supports
4multithreading by goroutines. The package is under The MIT Licence.
5
6MOTIVATION
7
8If there were a program that could rank the words, phrases and sentences of
9book-sized texts continuously on multiple threads, that was open to modification
10through objects, that was written in a simple, secure, statically typed language,
11and that was very well documented... Now, here it is.
12
13FEATURES
14
15- Find the most important phrases.
16- Find the most important words.
17- Find the most important N sentences.
18- Importance by phrase weights.
19- Importance by word occurrence.
20- Find the first N sentences, starting from the Xth sentence.
21- Find sentences by phrase chains ordered by position in text.
22- Access to the whole ranked data.
23- Support more languages.
24- Algorithm for weighting can be modified by interface implementation.
25- Parser can be modified by interface implementation.
26- Multi thread support.
27
28EXAMPLES
29
30Find the most important phrases:
31
32This is the most basic and simplest usage of textrank.
33
34 package main
35
36 import (
37 "fmt"
38
39 "github.com/DavidBelicza/TextRank"
40 )
41
42 func main() {
43 rawText := "Your long raw text, it could be a book. Lorem ipsum..."
44 // TextRank object
45 tr := textrank.NewTextRank()
46 // Default Rule for parsing.
47 rule := textrank.NewDefaultRule()
48 // Default Language for filtering stop words.
49 language := textrank.NewDefaultLanguage()
50 // Default algorithm for ranking text.
51 algorithmDef := textrank.NewDefaultAlgorithm()
52
53 // Add text.
54 tr.Populate(rawText, language, rule)
55 // Run the ranking.
56 tr.Ranking(algorithmDef)
57
58 // Get all phrases by weight.
59 rankedPhrases := textrank.FindPhrases(tr)
60
61 // Most important phrase.
62 fmt.Println(rankedPhrases[0])
63 // Second important phrase.
64 fmt.Println(rankedPhrases[1])
65 }
66
67All possible pre-defined finder queries:
68
69After ranking, the graph contains a lot of valuable data. The textrank package
70provides functions that contain the logic to retrieve this data from the graph.
71
72 package main
73
74 import (
75 "fmt"
76
77 "github.com/DavidBelicza/TextRank"
78 )
79
80 func main() {
81 rawText := "Your long raw text, it could be a book. Lorem ipsum..."
82 // TextRank object
83 tr := textrank.NewTextRank()
84 // Default Rule for parsing.
85 rule := textrank.NewDefaultRule()
86 // Default Language for filtering stop words.
87 language := textrank.NewDefaultLanguage()
88 // Default algorithm for ranking text.
89 algorithmDef := textrank.NewDefaultAlgorithm()
90
91 // Add text.
92 tr.Populate(rawText, language, rule)
93 // Run the ranking.
94 tr.Ranking(algorithmDef)
95
96 // Get all phrases ordered by weight.
97 rankedPhrases := textrank.FindPhrases(tr)
98 // Most important phrase.
99 fmt.Println(rankedPhrases[0])
100
101 // Get all words ordered by weight.
102 words := textrank.FindSingleWords(tr)
103 // Most important word.
104 fmt.Println(words[0])
105
106 // Get the most important 10 sentences. Importance by phrase weights.
107 sentences := textrank.FindSentencesByRelationWeight(tr, 10)
108 // Found sentences
109 fmt.Println(sentences)
110
111 // Get the most important 10 sentences. Importance by word occurrence.
112 sentences = textrank.FindSentencesByWordQtyWeight(tr, 10)
113 // Found sentences
114 fmt.Println(sentences)
115
116 // Get the first 10 sentences, starting from the 5th sentence.
117 sentences = textrank.FindSentencesFrom(tr, 5, 10)
118 // Found sentences
119 fmt.Println(sentences)
120
121 // Get sentences by phrase/word chains, ordered by position in text.
122 sentencesPh := textrank.FindSentencesByPhraseChain(tr, []string{"gnome", "shell", "extension"})
123 // Found sentence.
124 fmt.Println(sentencesPh[0])
125 }
126
127Access to everything:
128
129After ranking, the graph contains a lot of valuable data. The GetRankData method
130gives access to the graph, and all data can be retrieved from this structure.
131
132 package main
133
134 import (
135 "fmt"
136
137 "github.com/DavidBelicza/TextRank"
138 )
139
140 func main() {
141 rawText := "Your long raw text, it could be a book. Lorem ipsum..."
142 // TextRank object
143 tr := textrank.NewTextRank()
144 // Default Rule for parsing.
145 rule := textrank.NewDefaultRule()
146 // Default Language for filtering stop words.
147 language := textrank.NewDefaultLanguage()
148 // Default algorithm for ranking text.
149 algorithmDef := textrank.NewDefaultAlgorithm()
150
151 // Add text.
152 tr.Populate(rawText, language, rule)
153 // Run the ranking.
154 tr.Ranking(algorithmDef)
155
156 // Get the rank graph.
157 rankData := tr.GetRankData()
158
159 // Get word ID by token/word.
160 wordId := rankData.WordValID["gnome"]
161
162 // Word's weight.
163 fmt.Println(rankData.Words[wordId].Weight)
164 // Word's quantity/occurrence.
165 fmt.Println(rankData.Words[wordId].Qty)
166 // All sentences that contain this word.
167 fmt.Println(rankData.Words[wordId].SentenceIDs)
168 // All other words that are related to this word on the left side.
169 fmt.Println(rankData.Words[wordId].ConnectionLeft)
170 // All other words that are related to this word on the right side.
171 fmt.Println(rankData.Words[wordId].ConnectionRight)
172 // The node of this word, it contains the related words and the
173 // relation weight.
174 fmt.Println(rankData.Relation.Node[wordId])
175 }
176
177Adding text continuously:
178
179It is possible to add more text after other texts have already been added. The
180Ranking function can merge these multiple texts and it can recalculate the
181weights and all related data.
182
183 package main
184
185 import (
186 "fmt"
187
188 "github.com/DavidBelicza/TextRank"
189 )
190
191 func main() {
192 rawText := "Your long raw text, it could be a book. Lorem ipsum..."
193 // TextRank object
194 tr := textrank.NewTextRank()
195 // Default Rule for parsing.
196 rule := textrank.NewDefaultRule()
197 // Default Language for filtering stop words.
198 language := textrank.NewDefaultLanguage()
199 // Default algorithm for ranking text.
200 algorithmDef := textrank.NewDefaultAlgorithm()
201
202 // Add text.
203 tr.Populate(rawText, language, rule)
204 // Run the ranking.
205 tr.Ranking(algorithmDef)
206
207 rawText2 := "Another book or article..."
208 rawText3 := "Third another book or article..."
209
210 // Add text to the previously added text.
211 tr.Populate(rawText2, language, rule)
212 // Add text to the previously added text.
213 tr.Populate(rawText3, language, rule)
214 // Run the ranking to the whole composed text.
215 tr.Ranking(algorithmDef)
216
217 // Get all phrases by weight.
218 rankedPhrases := textrank.FindPhrases(tr)
219
220 // Most important phrase.
221 fmt.Println(rankedPhrases[0])
222 // Second important phrase.
223 fmt.Println(rankedPhrases[1])
224 }
225
226Using a different algorithm to rank text:
227
228Two algorithms are implemented. It is also possible to write a custom algorithm
229by implementing the Algorithm interface and use it instead of the defaults.
230
231 package main
232
233 import (
234 "fmt"
235
236 "github.com/DavidBelicza/TextRank"
237 )
238
239 func main() {
240 rawText := "Your long raw text, it could be a book. Lorem ipsum..."
241 // TextRank object
242 tr := textrank.NewTextRank()
243 // Default Rule for parsing.
244 rule := textrank.NewDefaultRule()
245 // Default Language for filtering stop words.
246 language := textrank.NewDefaultLanguage()
247 // Using a slightly more complex algorithm to rank text.
248 algorithmChain := textrank.NewChainAlgorithm()
249
250 // Add text.
251 tr.Populate(rawText, language, rule)
252 // Run the ranking.
253 tr.Ranking(algorithmChain)
254
255 // Get all phrases by weight.
256 rankedPhrases := textrank.FindPhrases(tr)
257
258 // Most important phrase.
259 fmt.Println(rankedPhrases[0])
260 // Second important phrase.
261 fmt.Println(rankedPhrases[1])
262 }
263
264Using multiple graphs:
265
266Graph ID exists because it is possible to run multiple independent text ranking
267processes.
268
269 package main
270
271 import (
272 "fmt"
273
274 "github.com/DavidBelicza/TextRank"
275 )
276
277 func main() {
278 rawText := "Your long raw text, it could be a book. Lorem ipsum..."
279 // 1st TextRank object
280 tr1 := textrank.NewTextRank()
281 // Default Rule for parsing.
282 rule := textrank.NewDefaultRule()
283 // Default Language for filtering stop words.
284 language := textrank.NewDefaultLanguage()
285 // Default algorithm for ranking text.
286 algorithmDef := textrank.NewDefaultAlgorithm()
287
288 // Add text.
289 tr1.Populate(rawText, language, rule)
290 // Run the ranking.
291 tr1.Ranking(algorithmDef)
292
293 // 2nd TextRank object
294 tr2 := textrank.NewTextRank()
295
296 // Using a slightly more complex algorithm to rank text.
297 algorithmChain := textrank.NewChainAlgorithm()
298
299 // Add text to the second graph.
300 tr2.Populate(rawText, language, rule)
301 // Run the ranking on the second graph.
302 tr2.Ranking(algorithmChain)
303
304 // Get all phrases by weight from first graph.
305 rankedPhrases := textrank.FindPhrases(tr1)
306
307 // Most important phrase from first graph.
308 fmt.Println(rankedPhrases[0])
309 // Second important phrase from first graph.
310 fmt.Println(rankedPhrases[1])
311
312 // Get all phrases by weight from second graph.
313 rankedPhrases2 := textrank.FindPhrases(tr2)
314
315 // Most important phrase from second graph.
316 fmt.Println(rankedPhrases2[0])
317 // Second important phrase from second graph.
318 fmt.Println(rankedPhrases2[1])
319 }
320
321Using different non-English languages:
322
323English is used by default, but it is possible to add any language. Using another
324language requires a stop word list, which you can find here:
325https://github.com/stopwords-iso
326
327 package main
328
329 import (
330 "fmt"
331
332 "github.com/DavidBelicza/TextRank"
333 )
334
335 func main() {
336 rawText := "Your long raw text, it could be a book. Lorem ipsum..."
337 // TextRank object
338 tr := textrank.NewTextRank()
339 // Default Rule for parsing.
340 rule := textrank.NewDefaultRule()
341 // Default Language for filtering stop words.
342 language := textrank.NewDefaultLanguage()
343
344 // Add Spanish stop words (just some example).
345 language.SetWords("es", []string{"uno", "dos", "tres", "yo", "es", "eres"})
346 // Activate Spanish.
347 language.SetActiveLanguage("es")
348
349 // Default algorithm for ranking text.
350 algorithmDef := textrank.NewDefaultAlgorithm()
351
352 // Add text.
353 tr.Populate(rawText, language, rule)
354 // Run the ranking.
355 tr.Ranking(algorithmDef)
356
357 // Get all phrases by weight.
358 rankedPhrases := textrank.FindPhrases(tr)
359
360 // Most important phrase.
361 fmt.Println(rankedPhrases[0])
362 // Second important phrase.
363 fmt.Println(rankedPhrases[1])
364 }
365
366Asynchronous usage by goroutines:
367
368It is thread safe. Independent graphs can receive texts at the same time and can
369be extended with more text, also at the same time.
370
371 package main
372
373 import (
374 "fmt"
375 "time"
376
377 "github.com/DavidBelicza/TextRank"
378 )
379
380 func main() {
381 // A flag when program has to stop.
382 stopProgram := false
383 // Channel.
384 stream := make(chan string)
385 // TextRank object.
386 tr := textrank.NewTextRank()
387
388 // Open new thread/routine
389 go func(tr *textrank.TextRank) {
390 // 3 texts.
391 rawTexts := []string{
392 "Very long text...",
393 "Another very long text...",
394 "Second another very long text...",
395 }
396
397 // Add 3 texts to the stream channel, one by one.
398 for _, rawText := range rawTexts {
399 stream <- rawText
400 }
401 }(tr)
402
403 // Open new thread/routine
404 go func() {
405 // Counts how many texts have been added to the ranking.
406 i := 1
407
408 for {
409 // Get text from stream channel when it got a new one.
410 rawText := <-stream
411
412 // Default Rule for parsing.
413 rule := textrank.NewDefaultRule()
414 // Default Language for filtering stop words.
415 language := textrank.NewDefaultLanguage()
416 // Default algorithm for ranking text.
417 algorithm := textrank.NewDefaultAlgorithm()
418
419 // Add text.
420 tr.Populate(rawText, language, rule)
421 // Run the ranking.
422 tr.Ranking(algorithm)
423
424 // Set stopProgram flag to true when all 3 texts have been added.
425 if i == 3 {
426 stopProgram = true
427 }
428
429 i++
430 }
431 }()
432
433 // The main goroutine has to keep running while the other goroutines run.
434 // When stopProgram is true, the loop finishes.
435 for !stopProgram {
436 time.Sleep(time.Second * 1)
437 }
438
439 // Get all phrases by weight.
440 phrases := textrank.FindPhrases(tr)
441 // Most important phrase.
442 fmt.Println(phrases[0])
443 }
444*/
445package textrank