1/*
Package textrank is an implementation of the TextRank algorithm in Go with
extendable features (automatic summarization, phrase extraction). It supports
multithreading via goroutines. The package is released under the MIT License.
  5
  6MOTIVATION
  7
If there were a program that could rank the words, phrases and sentences of
book-sized texts continuously on multiple threads, that was open to modification
through custom objects, that was written in a simple, secure, static language,
and that was very well documented... Now, here it is.
 12
 13FEATURES
 14
 15- Find the most important phrases.
 16- Find the most important words.
 17- Find the most important N sentences.
 18- Importance by phrase weights.
 19- Importance by word occurrence.
- Find the first N sentences, starting from the Xth sentence.
 21- Find sentences by phrase chains ordered by position in text.
 22- Access to the whole ranked data.
 23- Support more languages.
 24- Algorithm for weighting can be modified by interface implementation.
 25- Parser can be modified by interface implementation.
 26- Multi thread support.
 27
 28EXAMPLES
 29
 30Find the most important phrases:
 31
 32This is the most basic and simplest usage of textrank.
 33
 34	package main
 35
 36	import (
 37		"fmt"
 38
 39		"github.com/DavidBelicza/TextRank"
 40	)
 41
 42	func main() {
 43		rawText := "Your long raw text, it could be a book. Lorem ipsum..."
 44		// TextRank object
 45		tr := textrank.NewTextRank()
 46		// Default Rule for parsing.
 47		rule := textrank.NewDefaultRule()
 48		// Default Language for filtering stop words.
 49		language := textrank.NewDefaultLanguage()
 50		// Default algorithm for ranking text.
 51		algorithmDef := textrank.NewDefaultAlgorithm()
 52
 53		// Add text.
 54		tr.Populate(rawText, language, rule)
 55		// Run the ranking.
 56		tr.Ranking(algorithmDef)
 57
 58		// Get all phrases by weight.
 59		rankedPhrases := textrank.FindPhrases(tr)
 60
 61		// Most important phrase.
 62		fmt.Println(rankedPhrases[0])
 63		// Second important phrase.
 64		fmt.Println(rankedPhrases[1])
 65	}
 66
 67All possible pre-defined finder queries:
 68
After ranking, the graph contains a lot of valuable data. The textrank package
provides functions that contain the logic to retrieve this data from the graph.
 71
 72	package main
 73
 74	import (
 75		"fmt"
 76
 77		"github.com/DavidBelicza/TextRank"
 78	)
 79
 80	func main() {
 81		rawText := "Your long raw text, it could be a book. Lorem ipsum..."
 82		// TextRank object
 83		tr := textrank.NewTextRank()
 84		// Default Rule for parsing.
 85		rule := textrank.NewDefaultRule()
 86		// Default Language for filtering stop words.
 87		language := textrank.NewDefaultLanguage()
 88		// Default algorithm for ranking text.
 89		algorithmDef := textrank.NewDefaultAlgorithm()
 90
 91		// Add text.
 92		tr.Populate(rawText, language, rule)
 93		// Run the ranking.
 94		tr.Ranking(algorithmDef)
 95
		// Get all phrases ordered by weight.
 97		rankedPhrases := textrank.FindPhrases(tr)
 98		// Most important phrase.
 99		fmt.Println(rankedPhrases[0])
100
		// Get all words ordered by weight.
102		words := textrank.FindSingleWords(tr)
103		// Most important word.
104		fmt.Println(words[0])
105
106		// Get the most important 10 sentences. Importance by phrase weights.
107		sentences := textrank.FindSentencesByRelationWeight(tr, 10)
108		// Found sentences
109		fmt.Println(sentences)
110
111		// Get the most important 10 sentences. Importance by word occurrence.
112		sentences = textrank.FindSentencesByWordQtyWeight(tr, 10)
113		// Found sentences
114		fmt.Println(sentences)
115
		// Get the first 10 sentences, starting from the 5th sentence.
117		sentences = textrank.FindSentencesFrom(tr, 5, 10)
118		// Found sentences
119		fmt.Println(sentences)
120
		// Get sentences by phrase/word chains, ordered by position in text.
122		sentencesPh := textrank.FindSentencesByPhraseChain(tr, []string{"gnome", "shell", "extension"})
123		// Found sentence.
124		fmt.Println(sentencesPh[0])
125	}
126
Access to everything:
128
After ranking, the graph contains a lot of valuable data. The GetRankData method
allows access to the graph, and all data can be retrieved from this structure.
131
132	package main
133
134	import (
135		"fmt"
136
137		"github.com/DavidBelicza/TextRank"
138	)
139
140	func main() {
141		rawText := "Your long raw text, it could be a book. Lorem ipsum..."
142		// TextRank object
143		tr := textrank.NewTextRank()
144		// Default Rule for parsing.
145		rule := textrank.NewDefaultRule()
146		// Default Language for filtering stop words.
147		language := textrank.NewDefaultLanguage()
148		// Default algorithm for ranking text.
149		algorithmDef := textrank.NewDefaultAlgorithm()
150
151		// Add text.
152		tr.Populate(rawText, language, rule)
153		// Run the ranking.
154		tr.Ranking(algorithmDef)
155
156		// Get the rank graph.
157		rankData := tr.GetRankData()
158
159		// Get word ID by token/word.
160		wordId := rankData.WordValID["gnome"]
161
162		// Word's weight.
163		fmt.Println(rankData.Words[wordId].Weight)
164		// Word's quantity/occurrence.
165		fmt.Println(rankData.Words[wordId].Qty)
		// All sentences that contain this word.
167		fmt.Println(rankData.Words[wordId].SentenceIDs)
		// All other words that are related to this word on the left side.
169		fmt.Println(rankData.Words[wordId].ConnectionLeft)
		// All other words that are related to this word on the right side.
171		fmt.Println(rankData.Words[wordId].ConnectionRight)
172		// The node of this word, it contains the related words and the
173		// relation weight.
174		fmt.Println(rankData.Relation.Node[wordId])
175	}
176
177Adding text continuously:
178
It is possible to add more text after other texts have already been added. The
Ranking function can merge these multiple texts and it can recalculate the
weights and all related data.
182
183	package main
184
185	import (
186		"fmt"
187
188		"github.com/DavidBelicza/TextRank"
189	)
190
191	func main() {
192		rawText := "Your long raw text, it could be a book. Lorem ipsum..."
193		// TextRank object
194		tr := textrank.NewTextRank()
195		// Default Rule for parsing.
196		rule := textrank.NewDefaultRule()
197		// Default Language for filtering stop words.
198		language := textrank.NewDefaultLanguage()
199		// Default algorithm for ranking text.
200		algorithmDef := textrank.NewDefaultAlgorithm()
201
202		// Add text.
203		tr.Populate(rawText, language, rule)
204		// Run the ranking.
205		tr.Ranking(algorithmDef)
206
207		rawText2 := "Another book or article..."
208		rawText3 := "Third another book or article..."
209
210		// Add text to the previously added text.
211		tr.Populate(rawText2, language, rule)
212		// Add text to the previously added text.
213		tr.Populate(rawText3, language, rule)
214		// Run the ranking to the whole composed text.
215		tr.Ranking(algorithmDef)
216
217		// Get all phrases by weight.
218		rankedPhrases := textrank.FindPhrases(tr)
219
220		// Most important phrase.
221		fmt.Println(rankedPhrases[0])
222		// Second important phrase.
223		fmt.Println(rankedPhrases[1])
224	}
225
Using a different algorithm to rank text:
227
Two algorithms are implemented. It is also possible to write a custom algorithm
by implementing the Algorithm interface and to use it instead of the defaults.
230
231	package main
232
233	import (
234		"fmt"
235
236		"github.com/DavidBelicza/TextRank"
237	)
238
239	func main() {
240		rawText := "Your long raw text, it could be a book. Lorem ipsum..."
241		// TextRank object
242		tr := textrank.NewTextRank()
243		// Default Rule for parsing.
244		rule := textrank.NewDefaultRule()
245		// Default Language for filtering stop words.
246		language := textrank.NewDefaultLanguage()
		// Using a slightly more complex algorithm to rank text.
248		algorithmChain := textrank.NewChainAlgorithm()
249
250		// Add text.
251		tr.Populate(rawText, language, rule)
252		// Run the ranking.
253		tr.Ranking(algorithmChain)
254
255		// Get all phrases by weight.
256		rankedPhrases := textrank.FindPhrases(tr)
257
258		// Most important phrase.
259		fmt.Println(rankedPhrases[0])
260		// Second important phrase.
261		fmt.Println(rankedPhrases[1])
262	}
263
264Using multiple graphs:
265
Graph ID exists because it is possible to run multiple independent text ranking
processes.
268
269	package main
270
271	import (
272		"fmt"
273
274		"github.com/DavidBelicza/TextRank"
275	)
276
277	func main() {
278		rawText := "Your long raw text, it could be a book. Lorem ipsum..."
		// 1st TextRank object
280		tr1 := textrank.NewTextRank()
281		// Default Rule for parsing.
282		rule := textrank.NewDefaultRule()
283		// Default Language for filtering stop words.
284		language := textrank.NewDefaultLanguage()
285		// Default algorithm for ranking text.
286		algorithmDef := textrank.NewDefaultAlgorithm()
287
288		// Add text.
289		tr1.Populate(rawText, language, rule)
290		// Run the ranking.
291		tr1.Ranking(algorithmDef)
292
293		// 2nd TextRank object
294		tr2 := textrank.NewTextRank()
295
		// Using a slightly more complex algorithm to rank text.
297		algorithmChain := textrank.NewChainAlgorithm()
298
299		// Add text to the second graph.
300		tr2.Populate(rawText, language, rule)
301		// Run the ranking on the second graph.
302		tr2.Ranking(algorithmChain)
303
304		// Get all phrases by weight from first graph.
305		rankedPhrases := textrank.FindPhrases(tr1)
306
307		// Most important phrase from first graph.
308		fmt.Println(rankedPhrases[0])
309		// Second important phrase from first graph.
310		fmt.Println(rankedPhrases[1])
311
312		// Get all phrases by weight from second graph.
313		rankedPhrases2 := textrank.FindPhrases(tr2)
314
315		// Most important phrase from second graph.
316		fmt.Println(rankedPhrases2[0])
317		// Second important phrase from second graph.
318		fmt.Println(rankedPhrases2[1])
319	}
320
321Using different non-English languages:
322
English is used by default, but it is possible to add any language. Using other
languages requires a stop word list, which you can find here:
325https://github.com/stopwords-iso
326
327	package main
328
329	import (
330		"fmt"
331
332		"github.com/DavidBelicza/TextRank"
333	)
334
335	func main() {
336		rawText := "Your long raw text, it could be a book. Lorem ipsum..."
337		// TextRank object
338		tr := textrank.NewTextRank()
339		// Default Rule for parsing.
340		rule := textrank.NewDefaultRule()
341		// Default Language for filtering stop words.
342		language := textrank.NewDefaultLanguage()
343
344		// Add Spanish stop words (just some example).
345		language.SetWords("es", []string{"uno", "dos", "tres", "yo", "es", "eres"})
		// Activate Spanish.
347		language.SetActiveLanguage("es")
348
349		// Default algorithm for ranking text.
350		algorithmDef := textrank.NewDefaultAlgorithm()
351
352		// Add text.
353		tr.Populate(rawText, language, rule)
354		// Run the ranking.
355		tr.Ranking(algorithmDef)
356
357		// Get all phrases by weight.
358		rankedPhrases := textrank.FindPhrases(tr)
359
360		// Most important phrase.
361		fmt.Println(rankedPhrases[0])
362		// Second important phrase.
363		fmt.Println(rankedPhrases[1])
364	}
365
366Asynchronous usage by goroutines:
367
It is thread safe. Independent graphs can receive texts at the same time and can
also be extended with more text at the same time.
370
371	package main
372
373	import (
374		"fmt"
375		"time"
376
377		"github.com/DavidBelicza/TextRank"
378	)
379
380	func main() {
381		// A flag when program has to stop.
382		stopProgram := false
383		// Channel.
384		stream := make(chan string)
385		// TextRank object.
386		tr := textrank.NewTextRank()
387
388		// Open new thread/routine
389		go func(tr *textrank.TextRank) {
390			// 3 texts.
391			rawTexts := []string{
392				"Very long text...",
393				"Another very long text...",
394				"Second another very long text...",
395			}
396
397			// Add 3 texts to the stream channel, one by one.
398			for _, rawText := range rawTexts {
399				stream <- rawText
400			}
401		}(tr)
402
403		// Open new thread/routine
404		go func() {
405			// Counter how many times texts added to the ranking.
406			i := 1
407
408			for {
409				// Get text from stream channel when it got a new one.
410				rawText := <-stream
411
412				// Default Rule for parsing.
413				rule := textrank.NewDefaultRule()
414				// Default Language for filtering stop words.
415				language := textrank.NewDefaultLanguage()
416				// Default algorithm for ranking text.
417				algorithm := textrank.NewDefaultAlgorithm()
418
419				// Add text.
420				tr.Populate(rawText, language, rule)
421				// Run the ranking.
422				tr.Ranking(algorithm)
423
				// Set stopProgram flag to true when all 3 texts have been added.
425				if i == 3 {
426					stopProgram = true
427				}
428
429				i++
430			}
431		}()
432
		// The main thread has to run while the goroutines run. When stopProgram
		// is true, the loop finishes.
435		for !stopProgram {
436			time.Sleep(time.Second * 1)
437		}
438
		// Get all phrases by weight.
440		phrases := textrank.FindPhrases(tr)
		// Most important phrase.
442		fmt.Println(phrases[0])
443	}
444*/
445package textrank