diff options
| author | Mitja Felicijan <mitja.felicijan@gmail.com> | 2024-10-25 00:47:47 +0200 |
|---|---|---|
| committer | Mitja Felicijan <mitja.felicijan@gmail.com> | 2024-10-25 00:47:47 +0200 |
| commit | c6cc0108ca7738023b45e0eeac0fa2390532dd93 (patch) | |
| tree | 36890e6cd3091bbab8efbe686cc56f467f645bfd /vendor/github.com/DavidBelicza/TextRank | |
| parent | 0130404a1dc663d4aa68d780c9bcb23a4243e68d (diff) | |
| download | jbmafp-c6cc0108ca7738023b45e0eeac0fa2390532dd93.tar.gz | |
Diffstat (limited to 'vendor/github.com/DavidBelicza/TextRank')
19 files changed, 2428 insertions, 0 deletions
diff --git a/vendor/github.com/DavidBelicza/TextRank/v2/.gitignore b/vendor/github.com/DavidBelicza/TextRank/v2/.gitignore new file mode 100644 index 0000000..f83ccd4 --- /dev/null +++ b/vendor/github.com/DavidBelicza/TextRank/v2/.gitignore @@ -0,0 +1,6 @@ +/.vscode +/.idea +/pkg +/bin +/install.sh +/vendor diff --git a/vendor/github.com/DavidBelicza/TextRank/v2/.travis.yml b/vendor/github.com/DavidBelicza/TextRank/v2/.travis.yml new file mode 100644 index 0000000..899b6a3 --- /dev/null +++ b/vendor/github.com/DavidBelicza/TextRank/v2/.travis.yml @@ -0,0 +1,10 @@ +language: go +sudo: false + +matrix: + include: + - go: "1.15" + +script: + - go mod vendor + - go test ./... diff --git a/vendor/github.com/DavidBelicza/TextRank/v2/Dockerfile b/vendor/github.com/DavidBelicza/TextRank/v2/Dockerfile new file mode 100644 index 0000000..d31839a --- /dev/null +++ b/vendor/github.com/DavidBelicza/TextRank/v2/Dockerfile @@ -0,0 +1,9 @@ +FROM golang:1.15 +MAINTAINER David Belicza + +ADD ./ /go/src/github.com/DavidBelicza/TextRank + +WORKDIR /go/src/github.com/DavidBelicza/TextRank + +CMD go mod vendor +CMD /bin/bash diff --git a/vendor/github.com/DavidBelicza/TextRank/v2/LICENSE b/vendor/github.com/DavidBelicza/TextRank/v2/LICENSE new file mode 100644 index 0000000..960d66b --- /dev/null +++ b/vendor/github.com/DavidBelicza/TextRank/v2/LICENSE @@ -0,0 +1,21 @@ +The MIT License (MIT) + +Copyright 2018 David Belicza + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/vendor/github.com/DavidBelicza/TextRank/v2/README.md b/vendor/github.com/DavidBelicza/TextRank/v2/README.md new file mode 100644 index 0000000..7b17b64 --- /dev/null +++ b/vendor/github.com/DavidBelicza/TextRank/v2/README.md @@ -0,0 +1,543 @@ +<h1 align="center"> +TextRank on Go +</h1> + +<p align="center"> + <a href="https://godoc.org/github.com/DavidBelicza/TextRank"> + <img src="https://godoc.org/github.com/DavidBelicza/TextRank?status.svg" alt="GoDoc" /> + </a> + <a href="https://github.com/DavidBelicza/TextRank/blob/master/LICENSE"> + <img src="https://img.shields.io/badge/License-MIT-ee00ee.svg" alt="License: MIT" /> + </a> + <a href="https://travis-ci.org/DavidBelicza/TextRank"> + <img src="https://travis-ci.org/DavidBelicza/TextRank.svg?branch=master" alt="Build Status" /> + </a> + <a href="https://goreportcard.com/report/github.com/DavidBelicza/TextRank"> + <img src="https://goreportcard.com/badge/github.com/DavidBelicza/TextRank" alt="Go Report Card" /> + </a> + <a href="https://coveralls.io/github/DavidBelicza/TextRank?branch=master"> + <img src="https://coveralls.io/repos/github/DavidBelicza/TextRank/badge.svg?branch=master" alt="Coverage Status" /> + </a> + <a href="https://github.com/DavidBelicza/TextRank/releases/latest"> + <img src="https://img.shields.io/github/release/DavidBelicza/TextRank.svg?colorB=269aca" alt="Release" /> + </a> + +</p> + +<p align="center"> +This source code is an implementation of textrank algorithm, under MIT licence. 
+<br />The minimum required Go version is 1.8. +</p> +<br /> + +## MOTIVATION + +If there was a program that could rank book size text's words, phrases and sentences continuously on multiple threads and it would be open to modifying by objects, written in a simple, secure, static language and if it would be very well documented... Now, here it is. + +## FEATURES + +* Find the most important phrases. +* Find the most important words. +* Find the most important N sentences. + * Importance by phrase weights. + * Importance by word occurrence. +* Find the first N sentences, start from Xth sentence. +* Find sentences by phrase chains ordered by position in text. +* Access to the whole ranked data. +* Support more languages. +* Algorithm for weighting can be modified by interface implementation. +* Parser can be modified by interface implementation. +* Multi thread support. + +## INSTALL + +You can install TextRank by Go's get: + +```go get github.com/DavidBelicza/TextRank``` + +TextRank uses the default Go *mod* as vendoring tool, so you can install the dependencies with this command: + +```go mod vendor``` + +## DOCKER + +Using Docker for TextRank isn't necessary, it's just an option. + +Build image from the repository's root directory: + +```docker build -t go_text_rank_image .``` + +Create container from the image: + +```docker run -dit --name textrank go_text_rank_image:latest``` + +Run the **go test -v .** code inside the container: + +```docker exec -i -t textrank go test -v .``` + +Stop, start or remove the container: + +* ```docker stop textrank``` +* ```docker start textrank``` +* ```docker rm textrank``` + +## HOW DOES IT WORK + +To see how it works, the easiest way is to use the sample text. The sample text can be found in the [textrank_test.go file at this line](https://github.com/DavidBelicza/TextRank/blob/master/textrank_test.go#L12). It's a short size text about Gnome Shell. 
+ +* TextRank reads the text, + * parse it, + * remove the unnecessary stop words, + * tokenize it +* and counting the occurrence of the words and phrases +* and then it starts weighting + * by the occurrence of words and phrases and their relations. +* After weights are done, TextRank normalize weights to between 1 and 0. +* Then the different finder methods capable to find the most important words, phrases or sentences. + +The most important phrases from the sample text are: + +Phrase | Occurrence | Weight +--- | --- | --- +gnome - shell | 5 | 1 +extension - gnome | 3 | 0.50859946 +icons - tray | 3 | 0.49631447 +gnome - caffeine | 2 | 0.27027023 + +The **gnome** is the most often used word in this text and **shell** is also used multiple times. Two of them are used together as a phrase 5 times. This is the highest occurrence in this text, so this is the most important phrase. + +The following two important phrases have same occurrence 3, however they are not equal. This is because the **extension gnome** phrase contains the word **gnome**, the most popular word in the text, and it increases the phrase's weight. It increases the weight of any word what is related to it, but not too much to overcome other important phrases what don't contain the **gnome** word. + +The exact algorithm can be found in the [algorithm.go file at this line](https://github.com/DavidBelicza/TextRank/blob/master/rank/algorithm.go#L65). + +## TEXTRANK OR AUTOMATIC SUMMARIZATION +> Automatic summarization is the process of reducing a text document with a computer program in order to create a summary that retains the most important points of the original document. Technologies that can make a coherent summary take into account variables such as length, writing style and syntax. Automatic data summarization is part of machine learning and data mining. The main idea of summarization is to find a representative subset of the data, which contains the information of the entire set. 
Summarization technologies are used in a large number of sectors in industry today. - Wikipedia + +## EXAMPLES + +### Find the most important phrases + +This is the most basic and simplest usage of textrank. + +```go +package main + +import ( + "fmt" + + "github.com/DavidBelicza/TextRank/v2" +) + +func main() { + rawText := "Your long raw text, it could be a book. Lorem ipsum..." + // TextRank object + tr := textrank.NewTextRank() + // Default Rule for parsing. + rule := textrank.NewDefaultRule() + // Default Language for filtering stop words. + language := textrank.NewDefaultLanguage() + // Default algorithm for ranking text. + algorithmDef := textrank.NewDefaultAlgorithm() + + // Add text. + tr.Populate(rawText, language, rule) + // Run the ranking. + tr.Ranking(algorithmDef) + + // Get all phrases by weight. + rankedPhrases := textrank.FindPhrases(tr) + + // Most important phrase. + fmt.Println(rankedPhrases[0]) + // Second important phrase. + fmt.Println(rankedPhrases[1]) +} +``` + +### All possible pre-defined finder queries + +After ranking, the graph contains a lot of valuable data. There are functions in textrank package what contains logic to retrieve those data from the graph. + +```go +package main + +import ( + "fmt" + + "github.com/DavidBelicza/TextRank/v2" +) + +func main() { + rawText := "Your long raw text, it could be a book. Lorem ipsum..." + // TextRank object + tr := textrank.NewTextRank() + // Default Rule for parsing. + rule := textrank.NewDefaultRule() + // Default Language for filtering stop words. + language := textrank.NewDefaultLanguage() + // Default algorithm for ranking text. + algorithmDef := textrank.NewDefaultAlgorithm() + + // Add text. + tr.Populate(rawText, language, rule) + // Run the ranking. + tr.Ranking(algorithmDef) + + // Get all phrases order by weight. + rankedPhrases := textrank.FindPhrases(tr) + // Most important phrase. + fmt.Println(rankedPhrases[0]) + + // Get all words order by weight. 
+ words := textrank.FindSingleWords(tr) + // Most important word. + fmt.Println(words[0]) + + // Get the most important 10 sentences. Importance by phrase weights. + sentences := textrank.FindSentencesByRelationWeight(tr, 10) + // Found sentences + fmt.Println(sentences) + + // Get the most important 10 sentences. Importance by word occurrence. + sentences = textrank.FindSentencesByWordQtyWeight(tr, 10) + // Found sentences + fmt.Println(sentences) + + // Get the first 10 sentences, start from 5th sentence. + sentences = textrank.FindSentencesFrom(tr, 5, 10) + // Found sentences + fmt.Println(sentences) + + // Get sentences by phrase/word chains order by position in text. + sentencesPh := textrank.FindSentencesByPhraseChain(tr, []string{"gnome", "shell", "extension"}) + // Found sentence. + fmt.Println(sentencesPh[0]) +} +``` + +### Access to everything + +After ranking, the graph contains a lot of valuable data. The GetRank function allows access to the graph and every data can be retrieved from this structure. + +```go +package main + +import ( + "fmt" + + "github.com/DavidBelicza/TextRank/v2" +) + +func main() { + rawText := "Your long raw text, it could be a book. Lorem ipsum..." + // TextRank object + tr := textrank.NewTextRank() + // Default Rule for parsing. + rule := textrank.NewDefaultRule() + // Default Language for filtering stop words. + language := textrank.NewDefaultLanguage() + // Default algorithm for ranking text. + algorithmDef := textrank.NewDefaultAlgorithm() + + // Add text. + tr.Populate(rawText, language, rule) + // Run the ranking. + tr.Ranking(algorithmDef) + + // Get the rank graph. + rankData := tr.GetRankData() + + // Get word ID by token/word. + wordId := rankData.WordValID["gnome"] + + // Word's weight. + fmt.Println(rankData.Words[wordId].Weight) + // Word's quantity/occurrence. + fmt.Println(rankData.Words[wordId].Qty) + // All sentences what contain the this word. 
+ fmt.Println(rankData.Words[wordId].SentenceIDs) + // All other words what are related to this word on left side. + fmt.Println(rankData.Words[wordId].ConnectionLeft) + // All other words what are related to this word on right side. + fmt.Println(rankData.Words[wordId].ConnectionRight) + // The node of this word, it contains the related words and the relation weight. + fmt.Println(rankData.Relation.Node[wordId]) +} +``` + +### Adding text continuously + +It is possible to add more text after other texts have already been added. The Ranking function can merge these multiple texts and it can recalculate the weights and all related data. + +```go +package main + +import ( + "fmt" + + "github.com/DavidBelicza/TextRank/v2" +) + +func main() { + rawText := "Your long raw text, it could be a book. Lorem ipsum..." + // TextRank object + tr := textrank.NewTextRank() + // Default Rule for parsing. + rule := textrank.NewDefaultRule() + // Default Language for filtering stop words. + language := textrank.NewDefaultLanguage() + // Default algorithm for ranking text. + algorithmDef := textrank.NewDefaultAlgorithm() + + // Add text. + tr.Populate(rawText, language, rule) + // Run the ranking. + tr.Ranking(algorithmDef) + + rawText2 := "Another book or article..." + rawText3 := "Third another book or article..." + + // Add text to the previously added text. + tr.Populate(rawText2, language, rule) + // Add text to the previously added text. + tr.Populate(rawText3, language, rule) + // Run the ranking to the whole composed text. + tr.Ranking(algorithmDef) + + // Get all phrases by weight. + rankedPhrases := textrank.FindPhrases(tr) + + // Most important phrase. + fmt.Println(rankedPhrases[0]) + // Second important phrase. + fmt.Println(rankedPhrases[1]) +} +``` + +### Using different algorithm to ranking text + +Two algorithms are implemented; it is also possible to write a custom algorithm via the Algorithm interface and use it instead of the defaults. 
+ +```go +package main + +import ( + "fmt" + + "github.com/DavidBelicza/TextRank/v2" +) + +func main() { + rawText := "Your long raw text, it could be a book. Lorem ipsum..." + // TextRank object + tr := textrank.NewTextRank() + // Default Rule for parsing. + rule := textrank.NewDefaultRule() + // Default Language for filtering stop words. + language := textrank.NewDefaultLanguage() + // Using a little bit more complex algorithm to ranking text. + algorithmChain := textrank.NewChainAlgorithm() + + // Add text. + tr.Populate(rawText, language, rule) + // Run the ranking. + tr.Ranking(algorithmChain) + + // Get all phrases by weight. + rankedPhrases := textrank.FindPhrases(tr) + + // Most important phrase. + fmt.Println(rankedPhrases[0]) + // Second important phrase. + fmt.Println(rankedPhrases[1]) +} +``` + +### Using multiple graphs + +Graph ID exists because it is possible run multiple independent text ranking processes. + +```go +package main + +import ( + "fmt" + + "github.com/DavidBelicza/TextRank/v2" +) + +func main() { + rawText := "Your long raw text, it could be a book. Lorem ipsum..." + // 1th TextRank object + tr1 := textrank.NewTextRank() + // Default Rule for parsing. + rule := textrank.NewDefaultRule() + // Default Language for filtering stop words. + language := textrank.NewDefaultLanguage() + // Default algorithm for ranking text. + algorithmDef := textrank.NewDefaultAlgorithm() + + // Add text. + tr1.Populate(rawText, language, rule) + // Run the ranking. + tr1.Ranking(algorithmDef) + + // 2nd TextRank object + tr2 := textrank.NewTextRank() + + // Using a little bit more complex algorithm to ranking text. + algorithmChain := textrank.NewChainAlgorithm() + + // Add text to the second graph. + tr2.Populate(rawText, language, rule) + // Run the ranking on the second graph. + tr2.Ranking(algorithmChain) + + // Get all phrases by weight from first graph. + rankedPhrases := textrank.FindPhrases(tr1) + + // Most important phrase from first graph. 
+ fmt.Println(rankedPhrases[0]) + // Second important phrase from first graph. + fmt.Println(rankedPhrases[1]) + + // Get all phrases by weight from second graph. + rankedPhrases2 := textrank.FindPhrases(tr2) + + // Most important phrase from second graph. + fmt.Println(rankedPhrases2[0]) + // Second important phrase from second graph. + fmt.Println(rankedPhrases2[1]) +} +``` + +### Using different non-English languages + +English is used by default but it is possible to add any language. To use other languages a stop word list is required, which you can find here: https://github.com/stopwords-iso + +```go +package main + +import ( + "fmt" + + "github.com/DavidBelicza/TextRank/v2" +) + +func main() { + rawText := "Your long raw text, it could be a book. Lorem ipsum..." + // TextRank object + tr := textrank.NewTextRank() + // Default Rule for parsing. + rule := textrank.NewDefaultRule() + // Default Language for filtering stop words. + language := textrank.NewDefaultLanguage() + + // Add Spanish stop words (just some example). + language.SetWords("es", []string{"uno", "dos", "tres", "yo", "es", "eres"}) + // Activate Spanish. + language.SetActiveLanguage("es") + + // Default algorithm for ranking text. + algorithmDef := textrank.NewDefaultAlgorithm() + + // Add text. + tr.Populate(rawText, language, rule) + // Run the ranking. + tr.Ranking(algorithmDef) + + // Get all phrases by weight. + rankedPhrases := textrank.FindPhrases(tr) + + // Most important phrase. + fmt.Println(rankedPhrases[0]) + // Second important phrase. + fmt.Println(rankedPhrases[1]) +} +``` + +### Asynchronous usage by goroutines + +It is thread safe. Independent graphs can receive texts at the same time and can also be extended with more text at the same time. + +```go +package main + +import ( + "fmt" + "time" + + "github.com/DavidBelicza/TextRank/v2" +) + +func main() { + // A flag when program has to stop. + stopProgram := false + // Channel. + stream := make(chan string) + // TextRank object. 
+ tr := textrank.NewTextRank() + + // Open new thread/routine + go func(tr *textrank.TextRank) { + // 3 texts. + rawTexts := []string{ + "Very long text...", + "Another very long text...", + "Second another very long text...", + } + + // Add 3 texts to the stream channel, one by one. + for _, rawText := range rawTexts { + stream <- rawText + } + }(tr) + + // Open new thread/routine + go func() { + // Counter how many times texts added to the ranking. + i := 1 + + for { + // Get text from stream channel when it got a new one. + rawText := <-stream + + // Default Rule for parsing. + rule := textrank.NewDefaultRule() + // Default Language for filtering stop words. + language := textrank.NewDefaultLanguage() + // Default algorithm for ranking text. + algorithm := textrank.NewDefaultAlgorithm() + + // Add text. + tr.Populate(rawText, language, rule) + // Run the ranking. + tr.Ranking(algorithm) + + // Set stopProgram flag to true when all 3 text have been added. + if i == 3 { + stopProgram = true + } + + i++ + } + }() + + // The main thread has to run while go-routines run. When stopProgram is + // true then the loop has finish. + for !stopProgram { + time.Sleep(time.Second * 1) + } + + // Most important phrase. + phrases := textrank.FindPhrases(tr) + // Second important phrase. + fmt.Println(phrases[0]) +} +``` + +## A SIMPLE VISUAL REPRESENTATION + +The below image is a representation how works the simplest text ranking algorithm. This algorithm can be replaced by an another one by inject different Algorithm interface implementation. 
+ +<img src="https://i.imgur.com/RUdDfBz.jpg" /> diff --git a/vendor/github.com/DavidBelicza/TextRank/v2/convert/builder.go b/vendor/github.com/DavidBelicza/TextRank/v2/convert/builder.go new file mode 100644 index 0000000..db94cfc --- /dev/null +++ b/vendor/github.com/DavidBelicza/TextRank/v2/convert/builder.go @@ -0,0 +1,43 @@ +package convert + +import ( + "github.com/DavidBelicza/TextRank/v2/parse" + "github.com/DavidBelicza/TextRank/v2/rank" +) + +// TextToRank function converts a ParsedSentence object to Rank object, it is +// the preparing process to later text ranking. +func TextToRank(sentence parse.ParsedSentence, lang Language, ranks *rank.Rank) { + sentenceId := addSentence(ranks, sentence) + addWord(ranks, sentence.GetWords(), lang, sentenceId) +} + +func addWord(ranks *rank.Rank, words []string, lang Language, sentenceID int) { + prevWordID := -1 + var curWordID int + + for _, word := range words { + if !lang.IsStopWord(word) { + if found, rootWord := lang.FindRootWord(word); found { + word = rootWord + } + + if !ranks.IsWordExist(word) { + curWordID = ranks.AddNewWord(word, prevWordID, sentenceID) + } else { + curWordID = ranks.UpdateWord(word, prevWordID, sentenceID) + } + + ranks.Relation.AddRelation(curWordID, prevWordID, sentenceID) + ranks.UpdateRightConnection(prevWordID, curWordID) + + prevWordID = curWordID + } + } +} + +func addSentence(ranks *rank.Rank, sentence parse.ParsedSentence) int { + ranks.SentenceMap[len(ranks.SentenceMap)] = sentence.GetOriginal() + + return len(ranks.SentenceMap) - 1 +} diff --git a/vendor/github.com/DavidBelicza/TextRank/v2/convert/language.go b/vendor/github.com/DavidBelicza/TextRank/v2/convert/language.go new file mode 100644 index 0000000..fdad698 --- /dev/null +++ b/vendor/github.com/DavidBelicza/TextRank/v2/convert/language.go @@ -0,0 +1,71 @@ +package convert + +import "unicode/utf8" + +// Language interface and its methods make possible the polimorf usage of +// language specific features by custom 
implementations. +type Language interface { + IsStopWord(word string) bool + FindRootWord(word string) (bool, string) + SetActiveLanguage(code string) + SetWords(code string, words []string) +} + +// LanguageDefault struct is implementation of Language interface. It stores +// the stop words of loaded languages and can find stop words by tokens. +type LanguageDefault struct { + defaultLang string + languages map[string][]string +} + +// NewLanguage is the constructor of LanguageDefault. It returns a pointer to a +// LanguageDefault that is set up for English by default. +func NewLanguage() *LanguageDefault { + lang := &LanguageDefault{ + "en", + make(map[string][]string), + } + + words := getDefaultEnglish() + + lang.SetWords("en", words) + + return lang +} + +// IsStopWord method returns true when the given word is in the stop word +// list or when the word has 2 or fewer characters. +func (lang *LanguageDefault) IsStopWord(word string) bool { + if utf8.RuneCountInString(word) <= 2 { + return true + } + + if stopWords, ok := lang.languages[lang.defaultLang]; ok { + for _, val := range stopWords { + if val == word { + return true + } + } + } + + return false +} + +// FindRootWord method gets a word as an input, "apples" for example and it +// retrieves the root-word of this given word, "apple" for example. The first +// return parameter is true when a word-root has found, otherwise it's false. +func (lang *LanguageDefault) FindRootWord(word string) (bool, string) { + return false, "" +} + +// SetActiveLanguage method switch between languages by the language's code. The +// language code is not standard, it can be anything. +func (lang *LanguageDefault) SetActiveLanguage(code string) { + lang.defaultLang = code +} + +// SetWords method set stop words into the LanguageDefault struct by the +// language's code. 
+func (lang *LanguageDefault) SetWords(code string, words []string) { + lang.languages[code] = words +} diff --git a/vendor/github.com/DavidBelicza/TextRank/v2/convert/stop_word.go b/vendor/github.com/DavidBelicza/TextRank/v2/convert/stop_word.go new file mode 100644 index 0000000..8977a2d --- /dev/null +++ b/vendor/github.com/DavidBelicza/TextRank/v2/convert/stop_word.go @@ -0,0 +1,332 @@ +package convert + +func getDefaultEnglish() []string { + + return []string{ + "a", + "about", + "above", + "above", + "across", + "after", + "afterwards", + "again", + "against", + "all", + "almost", + "alone", + "along", + "already", + "also", + "although", + "always", + "am", + "among", + "amongst", + "amount", + "an", + "and", + "another", + "any", + "anyhow", + "anyone", + "anything", + "anyway", + "anywhere", + "are", + "around", + "as", + "at", + "back", + "be", + "became", + "because", + "become", + "becomes", + "becoming", + "been", + "before", + "beforehand", + "behind", + "being", + "below", + "beside", + "besides", + "between", + "beyond", + "bill", + "both", + "bottom", + "but", + "by", + "call", + "can", + "cannot", + "cant", + "co", + "con", + "could", + "couldn't", + "cry", + "de", + "describe", + "detail", + "did", + "didn't", + "do", + "does", + "doesn't", + "done", + "don't", + "down", + "due", + "during", + "each", + "eg", + "eight", + "either", + "eleven", + "else", + "elsewhere", + "empty", + "enough", + "etc", + "even", + "ever", + "every", + "everyone", + "everything", + "everywhere", + "except", + "few", + "fifteen", + "fify", + "fill", + "find", + "fire", + "first", + "five", + "for", + "former", + "formerly", + "forty", + "found", + "four", + "from", + "front", + "full", + "further", + "get", + "give", + "go", + "had", + "has", + "hasnt", + "have", + "he", + "hence", + "her", + "here", + "hereafter", + "hereby", + "herein", + "hereupon", + "hers", + "herself", + "him", + "himself", + "his", + "how", + "however", + "hundred", + "i", + "ie", + "if", + 
"in", + "inc", + "indeed", + "interest", + "into", + "is", + "it", + "its", + "itself", + "keep", + "last", + "latter", + "latterly", + "least", + "less", + "ltd", + "made", + "many", + "may", + "me", + "meanwhile", + "might", + "mill", + "mine", + "more", + "moreover", + "most", + "mostly", + "move", + "much", + "must", + "my", + "myself", + "name", + "namely", + "neither", + "never", + "nevertheless", + "next", + "nine", + "no", + "nobody", + "none", + "noone", + "nor", + "not", + "nothing", + "now", + "nowhere", + "of", + "off", + "often", + "oh", + "on", + "once", + "one", + "only", + "onto", + "or", + "other", + "others", + "otherwise", + "our", + "ours", + "ourselves", + "out", + "over", + "own", + "part", + "per", + "perhaps", + "please", + "put", + "rather", + "re", + "same", + "see", + "seem", + "seemed", + "seeming", + "seems", + "serious", + "several", + "she", + "should", + "show", + "side", + "since", + "sincere", + "six", + "sixty", + "so", + "some", + "somehow", + "someone", + "something", + "sometime", + "sometimes", + "somewhere", + "still", + "such", + "system", + "take", + "ten", + "than", + "that", + "the", + "their", + "them", + "themselves", + "then", + "thence", + "there", + "thereafter", + "thereby", + "therefore", + "therein", + "thereupon", + "these", + "they", + "thickv", + "thin", + "third", + "this", + "those", + "though", + "three", + "through", + "throughout", + "thru", + "thus", + "to", + "together", + "too", + "top", + "toward", + "towards", + "twelve", + "twenty", + "two", + "un", + "under", + "until", + "up", + "upon", + "us", + "very", + "via", + "was", + "we", + "well", + "were", + "what", + "whatever", + "when", + "whence", + "whenever", + "where", + "whereafter", + "whereas", + "whereby", + "wherein", + "whereupon", + "wherever", + "whether", + "which", + "while", + "whither", + "who", + "whoever", + "whole", + "whom", + "whose", + "why", + "will", + "with", + "within", + "without", + "would", + "yes", + "yet", + "you", + 
"your", + "yours", + "yourself", + "yourselves", + } +} diff --git a/vendor/github.com/DavidBelicza/TextRank/v2/doc.go b/vendor/github.com/DavidBelicza/TextRank/v2/doc.go new file mode 100644 index 0000000..51c8cc6 --- /dev/null +++ b/vendor/github.com/DavidBelicza/TextRank/v2/doc.go @@ -0,0 +1,445 @@ +/* +Package textrank is an implementation of Text Rank algorithm in Go with +extendable features (automatic summarization, phrase extraction). It supports +multithreading by goroutines. The package is under The MIT Licence. + +MOTIVATION + +If there was a program what could rank book size text's words, phrases and +sentences continuously on multiple threads and it would be opened to modifing by +objects, written in a simple, secure, static language and if it would be very +well documented... Now, here it is. + +FEATURES + +- Find the most important phrases. +- Find the most important words. +- Find the most important N sentences. +- Importance by phrase weights. +- Importance by word occurrence. +- Find the first N sentences, start from Xth sentence. +- Find sentences by phrase chains ordered by position in text. +- Access to the whole ranked data. +- Support more languages. +- Algorithm for weighting can be modified by interface implementation. +- Parser can be modified by interface implementation. +- Multi thread support. + +EXAMPLES + +Find the most important phrases: + +This is the most basic and simplest usage of textrank. + + package main + + import ( + "fmt" + + "github.com/DavidBelicza/TextRank" + ) + + func main() { + rawText := "Your long raw text, it could be a book. Lorem ipsum..." + // TextRank object + tr := textrank.NewTextRank() + // Default Rule for parsing. + rule := textrank.NewDefaultRule() + // Default Language for filtering stop words. + language := textrank.NewDefaultLanguage() + // Default algorithm for ranking text. + algorithmDef := textrank.NewDefaultAlgorithm() + + // Add text. + tr.Populate(rawText, language, rule) + // Run the ranking. 
+ tr.Ranking(algorithmDef) + + // Get all phrases by weight. + rankedPhrases := textrank.FindPhrases(tr) + + // Most important phrase. + fmt.Println(rankedPhrases[0]) + // Second important phrase. + fmt.Println(rankedPhrases[1]) + } + +All possible pre-defined finder queries: + +After ranking, the graph contains a lot of valuable data. There are functions in +textrank package what contains logic to retrieve those data from the graph. + + package main + + import ( + "fmt" + + "github.com/DavidBelicza/TextRank" + ) + + func main() { + rawText := "Your long raw text, it could be a book. Lorem ipsum..." + // TextRank object + tr := textrank.NewTextRank() + // Default Rule for parsing. + rule := textrank.NewDefaultRule() + // Default Language for filtering stop words. + language := textrank.NewDefaultLanguage() + // Default algorithm for ranking text. + algorithmDef := textrank.NewDefaultAlgorithm() + + // Add text. + tr.Populate(rawText, language, rule) + // Run the ranking. + tr.Ranking(algorithmDef) + + // Get all phrases order by weight. + rankedPhrases := textrank.FindPhrases(tr) + // Most important phrase. + fmt.Println(rankedPhrases[0]) + + // Get all words order by weight. + words := textrank.FindSingleWords(tr) + // Most important word. + fmt.Println(words[0]) + + // Get the most important 10 sentences. Importance by phrase weights. + sentences := textrank.FindSentencesByRelationWeight(tr, 10) + // Found sentences + fmt.Println(sentences) + + // Get the most important 10 sentences. Importance by word occurrence. + sentences = textrank.FindSentencesByWordQtyWeight(tr, 10) + // Found sentences + fmt.Println(sentences) + + // Get the first 10 sentences, start from 5th sentence. + sentences = textrank.FindSentencesFrom(tr, 5, 10) + // Found sentences + fmt.Println(sentences) + + // Get sentences by phrase/word chains order by position in text. + sentencesPh := textrank.FindSentencesByPhraseChain(tr, []string{"gnome", "shell", "extension"}) + // Found sentence. 
+ fmt.Println(sentencesPh[0]) + } + +Access to everything + +After ranking, the graph contains a lot of valuable data. The GetRank function +allows access to the graph and every data can be retrieved from this structure. + + package main + + import ( + "fmt" + + "github.com/DavidBelicza/TextRank" + ) + + func main() { + rawText := "Your long raw text, it could be a book. Lorem ipsum..." + // TextRank object + tr := textrank.NewTextRank() + // Default Rule for parsing. + rule := textrank.NewDefaultRule() + // Default Language for filtering stop words. + language := textrank.NewDefaultLanguage() + // Default algorithm for ranking text. + algorithmDef := textrank.NewDefaultAlgorithm() + + // Add text. + tr.Populate(rawText, language, rule) + // Run the ranking. + tr.Ranking(algorithmDef) + + // Get the rank graph. + rankData := tr.GetRankData() + + // Get word ID by token/word. + wordId := rankData.WordValID["gnome"] + + // Word's weight. + fmt.Println(rankData.Words[wordId].Weight) + // Word's quantity/occurrence. + fmt.Println(rankData.Words[wordId].Qty) + // All sentences what contain the this word. + fmt.Println(rankData.Words[wordId].SentenceIDs) + // All other words what are related to this word on left side. + fmt.Println(rankData.Words[wordId].ConnectionLeft) + // All other words what are related to this word on right side. + fmt.Println(rankData.Words[wordId].ConnectionRight) + // The node of this word, it contains the related words and the + // relation weight. + fmt.Println(rankData.Relation.Node[wordId]) + } + +Adding text continuously: + +It is possibe to add more text after another texts already have been added. The +Ranking function can merge these multiple texts and it can recalculate the +weights and all related data. + + package main + + import ( + "fmt" + + "github.com/DavidBelicza/TextRank" + ) + + func main() { + rawText := "Your long raw text, it could be a book. Lorem ipsum..." 
+ // TextRank object + tr := textrank.NewTextRank() + // Default Rule for parsing. + rule := textrank.NewDefaultRule() + // Default Language for filtering stop words. + language := textrank.NewDefaultLanguage() + // Default algorithm for ranking text. + algorithmDef := textrank.NewDefaultAlgorithm() + + // Add text. + tr.Populate(rawText, language, rule) + // Run the ranking. + tr.Ranking(algorithmDef) + + rawText2 := "Another book or article..." + rawText3 := "Third another book or article..." + + // Add text to the previously added text. + tr.Populate(rawText2, language, rule) + // Add text to the previously added text. + tr.Populate(rawText3, language, rule) + // Run the ranking on the whole composed text. + tr.Ranking(algorithmDef) + + // Get all phrases by weight. + rankedPhrases := textrank.FindPhrases(tr) + + // Most important phrase. + fmt.Println(rankedPhrases[0]) + // Second important phrase. + fmt.Println(rankedPhrases[1]) + } + +Using a different algorithm to rank text: + +Two algorithms are implemented. It is also possible to write a custom +algorithm via the Algorithm interface and use it instead of the defaults. + + package main + + import ( + "fmt" + + "github.com/DavidBelicza/TextRank" + ) + + func main() { + rawText := "Your long raw text, it could be a book. Lorem ipsum..." + // TextRank object + tr := textrank.NewTextRank() + // Default Rule for parsing. + rule := textrank.NewDefaultRule() + // Default Language for filtering stop words. + language := textrank.NewDefaultLanguage() + // Use a slightly more complex algorithm to rank text. + algorithmChain := textrank.NewChainAlgorithm() + + // Add text. + tr.Populate(rawText, language, rule) + // Run the ranking. + tr.Ranking(algorithmChain) + + // Get all phrases by weight. + rankedPhrases := textrank.FindPhrases(tr) + + // Most important phrase. + fmt.Println(rankedPhrases[0]) + // Second important phrase. 
+ fmt.Println(rankedPhrases[1]) + } + +Using multiple graphs: + +Each TextRank object holds its own graph, so it is possible to run multiple +independent text ranking processes. + + package main + + import ( + "fmt" + + "github.com/DavidBelicza/TextRank" + ) + + func main() { + rawText := "Your long raw text, it could be a book. Lorem ipsum..." + // 1st TextRank object + tr1 := textrank.NewTextRank() + // Default Rule for parsing. + rule := textrank.NewDefaultRule() + // Default Language for filtering stop words. + language := textrank.NewDefaultLanguage() + // Default algorithm for ranking text. + algorithmDef := textrank.NewDefaultAlgorithm() + + // Add text. + tr1.Populate(rawText, language, rule) + // Run the ranking. + tr1.Ranking(algorithmDef) + + // 2nd TextRank object + tr2 := textrank.NewTextRank() + + // Use a slightly more complex algorithm to rank text. + algorithmChain := textrank.NewChainAlgorithm() + + // Add text to the second graph. + tr2.Populate(rawText, language, rule) + // Run the ranking on the second graph. + tr2.Ranking(algorithmChain) + + // Get all phrases by weight from first graph. + rankedPhrases := textrank.FindPhrases(tr1) + + // Most important phrase from first graph. + fmt.Println(rankedPhrases[0]) + // Second important phrase from first graph. + fmt.Println(rankedPhrases[1]) + + // Get all phrases by weight from second graph. + rankedPhrases2 := textrank.FindPhrases(tr2) + + // Most important phrase from second graph. + fmt.Println(rankedPhrases2[0]) + // Second important phrase from second graph. + fmt.Println(rankedPhrases2[1]) + } + +Using different non-English languages: + +English is used by default but it is possible to add any language. To use other +languages a stop word list is required, which you can find here: +https://github.com/stopwords-iso + + package main + + import ( + "fmt" + + "github.com/DavidBelicza/TextRank" + ) + + func main() { + rawText := "Your long raw text, it could be a book. Lorem ipsum..." 
+ // TextRank object + tr := textrank.NewTextRank() + // Default Rule for parsing. + rule := textrank.NewDefaultRule() + // Default Language for filtering stop words. + language := textrank.NewDefaultLanguage() + + // Add Spanish stop words (just a few examples). + language.SetWords("es", []string{"uno", "dos", "tres", "yo", "es", "eres"}) + // Activate Spanish. + language.SetActiveLanguage("es") + + // Default algorithm for ranking text. + algorithmDef := textrank.NewDefaultAlgorithm() + + // Add text. + tr.Populate(rawText, language, rule) + // Run the ranking. + tr.Ranking(algorithmDef) + + // Get all phrases by weight. + rankedPhrases := textrank.FindPhrases(tr) + + // Most important phrase. + fmt.Println(rankedPhrases[0]) + // Second important phrase. + fmt.Println(rankedPhrases[1]) + } + +Asynchronous usage by goroutines: + +It is thread safe in the sense that independent graphs can receive texts at the +same time and can also be extended with more text at the same time. + + package main + + import ( + "fmt" + "time" + + "github.com/DavidBelicza/TextRank" + ) + + func main() { + // A flag that signals when the program has to stop. + stopProgram := false + // Channel. + stream := make(chan string) + // TextRank object. + tr := textrank.NewTextRank() + + // Open new thread/routine + go func(tr *textrank.TextRank) { + // 3 texts. + rawTexts := []string{ + "Very long text...", + "Another very long text...", + "Second another very long text...", + } + + // Add 3 texts to the stream channel, one by one. + for _, rawText := range rawTexts { + stream <- rawText + } + }(tr) + + // Open new thread/routine + go func() { + // Counts how many texts have been added to the ranking. + i := 1 + + for { + // Receive the next text from the stream channel. + rawText := <-stream + + // Default Rule for parsing. + rule := textrank.NewDefaultRule() + // Default Language for filtering stop words. + language := textrank.NewDefaultLanguage() + // Default algorithm for ranking text. 
+ algorithm := textrank.NewDefaultAlgorithm() + + // Add text. + tr.Populate(rawText, language, rule) + // Run the ranking. + tr.Ranking(algorithm) + + // Set stopProgram flag to true when all 3 texts have been added. + if i == 3 { + stopProgram = true + } + + i++ + } + }() + + // The main thread has to run while go-routines run. When stopProgram is + // true then the loop has to finish. + for !stopProgram { + time.Sleep(time.Second * 1) + } + + // Get all phrases by weight. + phrases := textrank.FindPhrases(tr) + // Most important phrase. + fmt.Println(phrases[0]) + } +*/ +package textrank diff --git a/vendor/github.com/DavidBelicza/TextRank/v2/install.example.sh b/vendor/github.com/DavidBelicza/TextRank/v2/install.example.sh new file mode 100644 index 0000000..84e2d1b --- /dev/null +++ b/vendor/github.com/DavidBelicza/TextRank/v2/install.example.sh @@ -0,0 +1,4 @@ +#!/usr/bin/env bash + +go mod vendor +go test ./... diff --git a/vendor/github.com/DavidBelicza/TextRank/v2/parse/rule.go b/vendor/github.com/DavidBelicza/TextRank/v2/parse/rule.go new file mode 100644 index 0000000..0f6ec91 --- /dev/null +++ b/vendor/github.com/DavidBelicza/TextRank/v2/parse/rule.go @@ -0,0 +1,52 @@ +package parse + +// Rule interface and its methods allow polymorphic implementations of the +// process that retrieves tokens from text. +type Rule interface { + IsWordSeparator(rune rune) bool + IsSentenceSeparator(rune rune) bool +} + +// RuleDefault struct implements the Rule interface. It contains the separator +// characters and can decide whether a character is a separator or not. +type RuleDefault struct { + wordSeparators [21]string + sentenceSeparators [3]string +} + +// NewRule constructor retrieves a RuleDefault pointer. 
+func NewRule() *RuleDefault { + return &RuleDefault{ + [21]string{" ", ",", "'", "’", "\"", ")", "(", "[", "]", "{", "}", "\"", ";", "\n", ">", "<", "%", "@", "&", "=", "#"}, + [3]string{"!", ".", "?"}, + } +} + +// IsWordSeparator method retrieves true when a character is a kind of special +// character and possibly it separates to words from each other. It also checks +// for sentence separator by IsSentenceSeparator method. +func (r *RuleDefault) IsWordSeparator(rune rune) bool { + chr := string(rune) + + for _, val := range r.wordSeparators { + if chr == val { + return true + } + } + + return r.IsSentenceSeparator(rune) +} + +// IsSentenceSeparator method retrieves true when a character is a kind of +// special character and possibly it separates to words from each other. +func (r *RuleDefault) IsSentenceSeparator(rune rune) bool { + chr := string(rune) + + for _, val := range r.sentenceSeparators { + if chr == val { + return true + } + } + + return false +} diff --git a/vendor/github.com/DavidBelicza/TextRank/v2/parse/text.go b/vendor/github.com/DavidBelicza/TextRank/v2/parse/text.go new file mode 100644 index 0000000..aab27c3 --- /dev/null +++ b/vendor/github.com/DavidBelicza/TextRank/v2/parse/text.go @@ -0,0 +1,44 @@ +package parse + +// Text struct contains a parsed text. +type Text struct { + parsedSentences []ParsedSentence +} + +// ParsedSentence struct contains the original raw sentences and their words. +type ParsedSentence struct { + original string + words []string +} + +// Append method creates a sentence and its words and append them to the Text +// object. +func (text *Text) Append(rawSentence string, words []string) { + if len(words) > 0 { + parsedSentence := ParsedSentence{ + original: rawSentence, + words: words, + } + + text.parsedSentences = append( + text.parsedSentences, + parsedSentence, + ) + } +} + +// GetSentences method returns ParsedSentence slice from Text struct. 
+func (text *Text) GetSentences() []ParsedSentence { + return text.parsedSentences +} + +// GetWords methods returns the words string slice of ParsedSentence struct. +func (parsedSentence *ParsedSentence) GetWords() []string { + return parsedSentence.words +} + +// GetOriginal method returns the original sentence as a string from a +// ParsedSentence struct. +func (parsedSentence *ParsedSentence) GetOriginal() string { + return parsedSentence.original +} diff --git a/vendor/github.com/DavidBelicza/TextRank/v2/parse/tokenizer.go b/vendor/github.com/DavidBelicza/TextRank/v2/parse/tokenizer.go new file mode 100644 index 0000000..003460e --- /dev/null +++ b/vendor/github.com/DavidBelicza/TextRank/v2/parse/tokenizer.go @@ -0,0 +1,63 @@ +package parse + +import ( + "strings" +) + +// TokenizeText function use the given raw text and parses by a Rule object and +// retrieves the parsed text in a Text struct object. +func TokenizeText(rawText string, rule Rule) Text { + return findSentences(rawText, rule) +} + +func findSentences(rawText string, rule Rule) Text { + text := Text{} + + var sentence string + var i int + slen := len(rawText) + + for j, chr := range rawText { + j += len(string(chr)) + //when separator or the last + if rule.IsSentenceSeparator(chr) || j == slen { + sentence = rawText[i:j] + if len(sentence) > 0 { + text.Append(sentence, findWords(sentence, rule)) + } + + sentence = "" + i = j + } + } + + return text +} + +func findWords(rawSentence string, rule Rule) (words []string) { + words = []string{} + + var word string + var i int + slen := len(rawSentence) + + for j, chr := range rawSentence { + chrlen := len(string(chr)) + j += chrlen + //when separator or the last + if sep := rule.IsWordSeparator(chr); sep || j == slen { + if sep { + word = rawSentence[i : j-chrlen] + } else { + word = rawSentence[i:j] + } + if len(word) > 0 { + words = append(words, strings.ToLower(word)) + } + word = "" + i = j + } + } + + return +} diff --git 
a/vendor/github.com/DavidBelicza/TextRank/v2/rank/algorithm.go b/vendor/github.com/DavidBelicza/TextRank/v2/rank/algorithm.go new file mode 100644 index 0000000..8f9345f --- /dev/null +++ b/vendor/github.com/DavidBelicza/TextRank/v2/rank/algorithm.go @@ -0,0 +1,99 @@ +package rank + +import ( + "math" +) + +// Algorithm interface and its methods make possible the polimorf usage of +// weighting process. +type Algorithm interface { + WeightingRelation( + word1ID int, + word2ID int, + rank *Rank, + ) float32 + + WeightingHits( + wordID int, + rank *Rank, + ) float32 +} + +// AlgorithmDefault struct is the basic implementation of Algorithm. It can +// weight a word or phrase by comparing them. +type AlgorithmDefault struct{} + +// NewAlgorithmDefault constructor retrieves an AlgorithmDefault pointer. +func NewAlgorithmDefault() *AlgorithmDefault { + return &AlgorithmDefault{} +} + +// WeightingRelation method is the traditional algorithm of text rank to +// weighting a phrase. +func (a *AlgorithmDefault) WeightingRelation( + word1ID int, + word2ID int, + rank *Rank, +) float32 { + relationQty := rank.Relation.Node[word1ID][word2ID].Qty + + return float32(relationQty) +} + +// WeightingHits method ranks the words by their occurrence. +func (a *AlgorithmDefault) WeightingHits( + wordID int, + rank *Rank, +) float32 { + weight := rank.Words[wordID].Qty + + return float32(weight) +} + +// AlgorithmChain struct is the combined implementation of Algorithm. It is a +// good example how weighting can be changed by a different implementations. It +// can weight a word or phrase by comparing them. +type AlgorithmChain struct{} + +// NewAlgorithmChain constructor retrieves an AlgorithmChain pointer. +func NewAlgorithmChain() *AlgorithmChain { + return &AlgorithmChain{} +} + +// WeightingRelation method is a combined algorithm of text rank and word +// occurrence, it weights a phrase. 
+func (a *AlgorithmChain) WeightingRelation( + word1ID int, + word2ID int, + rank *Rank, +) float32 { + relationQty := rank.Relation.Node[word1ID][word2ID].Qty + word1Qty := rank.Words[word1ID].Qty + word2Qty := rank.Words[word2ID].Qty + + qDiff := float32(math.Abs(float64(word1Qty)-float64(word2Qty))) / 100 + weight := float32(relationQty) + qDiff + + return weight +} + +// WeightingHits method ranks the words by their occurrence. +func (a *AlgorithmChain) WeightingHits( + wordID int, + rank *Rank, +) float32 { + word := rank.Words[wordID] + qty := 0 + + for leftWordID, leftWordQty := range word.ConnectionLeft { + qty += rank.Words[leftWordID].Qty * leftWordQty + } + + for rightWordID, rightWordQty := range word.ConnectionRight { + qty += rank.Words[rightWordID].Qty * rightWordQty + } + + weight := float32(word.Qty) + (float32(qty)) + + return float32(weight) +} diff --git a/vendor/github.com/DavidBelicza/TextRank/v2/rank/rank.go b/vendor/github.com/DavidBelicza/TextRank/v2/rank/rank.go new file mode 100644 index 0000000..3bcef7c --- /dev/null +++ b/vendor/github.com/DavidBelicza/TextRank/v2/rank/rank.go @@ -0,0 +1,147 @@ +package rank + +// Rank struct contains every original raw sentences, words, tokens, phrases, +// indexes, word hits, phrase hits and minimum-maximum values. +// +// Max is the occurrence of the most used word. +// +// Min is the occurrence of the less used word. It is always greater then 0. +// +// Relation is the Relation object, contains phrases. +// +// SentenceMap contains raw sentences. Index is the sentence ID, value is the +// sentence itself. +// +// Words contains Word objects. Index is the word ID, value is the word/token +// itself. +// +// WordValID contains words. Index is the word/token, value is the ID. +type Rank struct { + Max float32 + Min float32 + Relation Relation + SentenceMap map[int]string + Words map[int]*Word + WordValID map[string]int +} + +// Word struct contains all data about the words. 
+// +// If a word is multiple times in the text then the multiple words point to the +// same ID. So Word is unique. +// +// SentenceIDs contains all IDs of sentences what contain the word. +// +// ConnectionLeft contains all words what are connected to this word on the left +// side. The map index is the ID of the related word and its value is the +// occurrence. +// +// ConnectionRight contains all words what are connected to this word on the +// right side. The map index is the ID of the related word and its value is the +// occurrence. +// +// Token is the word itself, but not the original, it is tokenized. +// +// Qty is the number of occurrence of the word. +// +// Weight is the weight of the word between 0.00 and 1.00. +type Word struct { + ID int + SentenceIDs []int + ConnectionLeft map[int]int + ConnectionRight map[int]int + Token string + Qty int + Weight float32 +} + +// NewRank constructor retrieves a Rank pointer. +func NewRank() *Rank { + return &Rank{ + 0, + 0, + Relation{ + 0, + 0, + make(map[int]map[int]Score), + }, + make(map[int]string), + make(map[int]*Word), + make(map[string]int), + } +} + +// IsWordExist method retrieves true when the given word is already in the rank. +func (rank *Rank) IsWordExist(word string) bool { + _, find := rank.WordValID[word] + + return find +} + +// AddNewWord method adds a new word to the rank object and it defines its ID. +func (rank *Rank) AddNewWord(word string, prevWordIdx int, sentenceID int) (wordID int) { + wordID = len(rank.Words) + connectionLeft := make(map[int]int) + + if prevWordIdx >= 0 { + connectionLeft[prevWordIdx] = 1 + } + + newWord := &Word{ + ID: wordID, + SentenceIDs: []int{sentenceID}, + ConnectionLeft: connectionLeft, + ConnectionRight: make(map[int]int), + Token: word, + Qty: 1, + Weight: 0, + } + + rank.Words[wordID] = newWord + rank.WordValID[word] = wordID + + return +} + +// UpdateWord method update a word what already exists in the rank object. It +// retrieves its ID. 
+func (rank *Rank) UpdateWord(word string, prevWordIdx int, sentenceID int) (wordID int) { + wordID = rank.WordValID[word] + + found := false + + for _, oldSentenceID := range rank.Words[wordID].SentenceIDs { + if sentenceID == oldSentenceID { + found = true + break + } + } + + if !found { + rank.Words[wordID].SentenceIDs = append( + rank.Words[wordID].SentenceIDs, + sentenceID, + ) + } + + rank.Words[wordID].Qty++ + + if prevWordIdx >= 0 { + rank.Words[wordID].ConnectionLeft[prevWordIdx]++ + } + + return +} + +// UpdateRightConnection method adds the right connection to the word. It always +// can be used after a word has added and the next word is known. +func (rank *Rank) UpdateRightConnection(wordID int, rightWordID int) { + if wordID >= 0 { + rank.Words[wordID].ConnectionRight[rightWordID]++ + } +} + +// GetWordData method retrieves all words as a pointer. +func (rank *Rank) GetWordData() map[int]*Word { + return rank.Words +} diff --git a/vendor/github.com/DavidBelicza/TextRank/v2/rank/ranking.go b/vendor/github.com/DavidBelicza/TextRank/v2/rank/ranking.go new file mode 100644 index 0000000..5fd2dfa --- /dev/null +++ b/vendor/github.com/DavidBelicza/TextRank/v2/rank/ranking.go @@ -0,0 +1,66 @@ +package rank + +// Calculate function ranking words by the given algorithm implementation. 
+func Calculate(ranks *Rank, algorithm Algorithm) { + updateRanks(ranks, algorithm) +} + +func updateRanks(ranks *Rank, algorithm Algorithm) { + for _, word := range ranks.Words { + weight := algorithm.WeightingHits(word.ID, ranks) + word.Weight = weight + + if ranks.Max < word.Weight { + ranks.Max = word.Weight + } + + if ranks.Min > word.Weight || ranks.Min == 0 { + ranks.Min = word.Weight + } + } + + for _, word := range ranks.Words { + word.Weight = normalize(word.Weight, ranks.Min, ranks.Max) + } + + for x, xMap := range ranks.Relation.Node { + for y := range xMap { + sentenceIDs := ranks.Relation.Node[x][y].SentenceIDs + weight := algorithm.WeightingRelation(x, y, ranks) + + ranks.Relation.Node[x][y] = Score{ + ranks.Relation.Node[x][y].Qty, + weight, + sentenceIDs, + } + + if ranks.Relation.Max < weight { + ranks.Relation.Max = weight + } + + if ranks.Relation.Min > weight || ranks.Relation.Min == 0 { + ranks.Relation.Min = weight + } + } + } + + for x, xMap := range ranks.Relation.Node { + for y := range xMap { + weight := normalize( + ranks.Relation.Node[x][y].Weight, + ranks.Relation.Min, + ranks.Relation.Max, + ) + + ranks.Relation.Node[x][y] = Score{ + ranks.Relation.Node[x][y].Qty, + weight, + ranks.Relation.Node[x][y].SentenceIDs, + } + } + } +} + +func normalize(weight float32, min float32, max float32) float32 { + return (weight - min) / (max - min) +} diff --git a/vendor/github.com/DavidBelicza/TextRank/v2/rank/relation.go b/vendor/github.com/DavidBelicza/TextRank/v2/rank/relation.go new file mode 100644 index 0000000..cb8b97e --- /dev/null +++ b/vendor/github.com/DavidBelicza/TextRank/v2/rank/relation.go @@ -0,0 +1,77 @@ +package rank + +// Relation struct contains the phrase data. +// +// Max is the occurrence of the most used phrase. +// +// Min is the occurrence of the less used phrase. It is always greater then 0. +// +// Node is contains the Scores. 
Firs ID is the word 1, second ID is the word 2, +// and the value is the Score what contains the data about their relation. +type Relation struct { + Max float32 + Min float32 + Node map[int]map[int]Score +} + +// Score struct contains data about a relation of two words. +// +// Qty is the occurrence of the phrase. +// +// Weight is the weight of the phrase between 0.00 and 1.00. +// +// SentenceIDs contains all IDs of sentences what contain the phrase. +type Score struct { + Qty int + Weight float32 + SentenceIDs []int +} + +// AddRelation method adds a new relation to Relation object. +func (relation *Relation) AddRelation(wordID int, relatedWordID int, sentenceID int) { + if relatedWordID == -1 { + return + } + + if relation.updateRelation(relatedWordID, wordID, true, sentenceID) { + return + } + + if relation.extendRelation(wordID, relatedWordID, true, sentenceID) { + return + } + + relation.createRelation(wordID, relatedWordID, sentenceID) +} + +func (relation *Relation) updateRelation(x int, y int, r bool, sentenceID int) bool { + if _, ok := relation.Node[x][y]; ok { + count := relation.Node[x][y].Qty + 1 + weight := relation.Node[x][y].Weight + sentenceIDs := append(relation.Node[x][y].SentenceIDs, sentenceID) + relation.Node[x][y] = Score{count, weight, sentenceIDs} + + return true + } else if r { + return relation.updateRelation(y, x, false, sentenceID) + } + + return false +} + +func (relation *Relation) extendRelation(x int, y int, r bool, sentenceID int) bool { + if _, ok := relation.Node[x]; ok { + relation.Node[x][y] = Score{1, 0, []int{sentenceID}} + + return true + } else if r { + return relation.extendRelation(y, x, false, sentenceID) + } + + return false +} + +func (relation *Relation) createRelation(x int, y int, sentenceID int) { + relation.Node[x] = map[int]Score{} + relation.Node[x][y] = Score{1, 0, []int{sentenceID}} +} diff --git a/vendor/github.com/DavidBelicza/TextRank/v2/rank/sorting.go 
b/vendor/github.com/DavidBelicza/TextRank/v2/rank/sorting.go new file mode 100644 index 0000000..6d00a97 --- /dev/null +++ b/vendor/github.com/DavidBelicza/TextRank/v2/rank/sorting.go @@ -0,0 +1,202 @@ +package rank + +import ( + "sort" +) + +// Phrase struct contains a single phrase and its data. +// +// LeftID is the ID of the word 1. +// +// RightID is the ID of the word 2. +// +// Left is the token of the word 1. +// +// Right is the token of the word 2. +// +// Weight is between 0.00 and 1.00. +// +// Qty is the occurrence of the phrase. +type Phrase struct { + LeftID int + RightID int + Left string + Right string + Weight float32 + Qty int +} + +// FindPhrases function has wrapper textrank.FindPhrases. Use the wrapper +// instead. +func FindPhrases(ranks *Rank) []Phrase { + var phrases []Phrase + + for x, xMap := range ranks.Relation.Node { + for y := range xMap { + phrases = append(phrases, Phrase{ + ranks.Words[x].ID, + ranks.Words[y].ID, + ranks.Words[x].Token, + ranks.Words[y].Token, + ranks.Relation.Node[x][y].Weight, + ranks.Relation.Node[x][y].Qty, + }) + } + } + + sort.Slice(phrases, func(i, j int) bool { + return phrases[i].Weight > phrases[j].Weight + }) + + return phrases +} + +// SingleWord struct contains a single word and its data. +// +// ID of the word. +// +// Word itself, the token. +// +// Weight of the word between 0.00 and 1.00. +// +// Quantity of the word. +type SingleWord struct { + ID int + Word string + Weight float32 + Qty int +} + +// FindSingleWords function has wrapper textrank.FindSingleWords. Use the +// wrapper instead. 
+func FindSingleWords(ranks *Rank) []SingleWord { + var singleWords []SingleWord + + for _, word := range ranks.Words { + singleWords = append(singleWords, SingleWord{ + word.ID, + word.Token, + word.Weight, + word.Qty, + }) + } + + sort.Slice(singleWords, func(i, j int) bool { + return singleWords[i].Weight > singleWords[j].Weight + }) + + return singleWords +} + +// Sentence struct contains a single sentence and its data. +type Sentence struct { + ID int + Value string +} + +// ByQty filter by occurrence of word. +const ByQty = 0 + +// ByRelation filter by phrase weight. +const ByRelation = 1 + +// FindSentences function has wrappers textrank.FindSentencesByRelationWeight +// and textrank.FindSentencesByWordQtyWeight. Use the wrappers instead. +func FindSentences(ranks *Rank, kind int, limit int) []Sentence { + var sentences []Sentence + + cache := make(map[int]bool) + + collect := func(sentenceIDs []int) bool { + for _, id := range sentenceIDs { + if len(sentences) >= limit { + return true + } + + if !cache[id] { + sentences = append(sentences, Sentence{id, ranks.SentenceMap[id]}) + cache[id] = true + } + } + + return false + } + + if kind == ByQty { + singleWords := FindSingleWords(ranks) + + for _, singleWord := range singleWords { + sentenceIDs := ranks.Words[singleWord.ID].SentenceIDs + + if collect(sentenceIDs) { + return sentences + } + } + } else if kind == ByRelation { + phrases := FindPhrases(ranks) + + for _, phrase := range phrases { + sentenceIDs := ranks.Relation.Node[phrase.LeftID][phrase.RightID].SentenceIDs + + if collect(sentenceIDs) { + return sentences + } + } + } + + return sentences +} + +// FindSentencesByPhrases function has wrapper +// textrank.FindSentencesByPhraseChain. Use the wrapper instead. 
+func FindSentencesByPhrases(ranks *Rank, words []string) []Sentence { + var sentences []Sentence + + reqMatch := len(words) - 1 + sentenceIDs := make(map[int]int) + + for _, i := range words { + for _, j := range words { + x := ranks.WordValID[i] + y := ranks.WordValID[j] + + if _, ok := ranks.Relation.Node[x][y]; ok { + curSentenceIDs := ranks.Relation.Node[x][y].SentenceIDs + + for _, id := range curSentenceIDs { + if _, ok := sentenceIDs[id]; ok { + sentenceIDs[id]++ + } else { + sentenceIDs[id] = 1 + } + } + } + } + } + + for sentenceID, v := range sentenceIDs { + if v >= reqMatch { + sentences = append(sentences, Sentence{sentenceID, ranks.SentenceMap[sentenceID]}) + } + } + + sort.Slice(sentences, func(i, j int) bool { + return sentences[i].ID < sentences[j].ID + }) + + return sentences +} + +// FindSentencesFrom function has wrapper textrank.FindSentencesFrom. Use the +// wrapper instead. +func FindSentencesFrom(ranks *Rank, id int, limit int) []Sentence { + var sentences []Sentence + + limit = id + limit - 1 + + for i := id; i <= limit; i++ { + sentences = append(sentences, Sentence{i, ranks.SentenceMap[i]}) + } + + return sentences +} diff --git a/vendor/github.com/DavidBelicza/TextRank/v2/textrank.go b/vendor/github.com/DavidBelicza/TextRank/v2/textrank.go new file mode 100644 index 0000000..ed48ce3 --- /dev/null +++ b/vendor/github.com/DavidBelicza/TextRank/v2/textrank.go @@ -0,0 +1,194 @@ +package textrank + +import ( + "github.com/DavidBelicza/TextRank/v2/convert" + "github.com/DavidBelicza/TextRank/v2/parse" + "github.com/DavidBelicza/TextRank/v2/rank" +) + +// TextRank structure contains the Rank data object. This structure is a wrapper +// around the whole text ranking functionality. +type TextRank struct { + rank *rank.Rank +} + +// NewTextRank constructor retrieves a TextRank pointer. This is the 1th step to +// use TextRank. 
+func NewTextRank() *TextRank { + return &TextRank{ + rank.NewRank(), + } +} + +// NewDefaultRule function retrieves a default Rule object what works in the +// most cases in English or similar Latin languages like French or Spanish. The +// Rule defines raw text how should be split to sentences and words. Because +// Rule is an interface it's possible modify the ranking by inject different +// Rule implementation. This is the 2nd step to use TextRank. +func NewDefaultRule() *parse.RuleDefault { + return parse.NewRule() +} + +// NewDefaultLanguage function retrieves a default Language object. It defines +// what words are real and what words are just Stop Words or useless Junk Words. +// It uses the default English Stop Words, but it's possible to set different +// Stop Words in English or any other languages. Because Language is an +// interface it's possible to modify the ranking by inject different Language +// implementation. This is the 3rd step to use TextRank. +func NewDefaultLanguage() *convert.LanguageDefault { + return convert.NewLanguage() +} + +// NewDefaultAlgorithm function retrieves an Algorithm object. It defines how +// should work the text ranking algorithm, the weighting. This is the general +// text rank by weighting the connection between the words to find the strongest +// phrases. Because Algorithm is an interface it's possible to modify the +// ranking algorithm by inject different implementation. This is the 4th step to +// use TextRank. +func NewDefaultAlgorithm() *rank.AlgorithmDefault { + return rank.NewAlgorithmDefault() +} + +// NewChainAlgorithm function retrieves an Algorithm object. It defines how +// should work the text ranking algorithm, the weighting. This is an alternative +// way to ranking words by weighting the number of the words. Because Algorithm +// is an interface it's possible to modify the ranking algorithm by inject +// different implementation. This is the 4th step to use TextRank. 
+func NewChainAlgorithm() *rank.AlgorithmChain { + return rank.NewAlgorithmChain() +} + +// Populate method adds a raw text to the text-ranking graph. It parses, +// tokenize the raw text and prepares it to weighting and scoring. It's possible +// to append a new raw text to an existing one even if the previously text is +// already ranked. This is 5th step to use TextRank. +// +// text string must be a plain text from TXT or PDF or any document, it can +// contain new lines, break lines or any unnecessary text parts, but it should +// not contain HTML tags or codes. +// +// lang Language object can be loaded from NewDefaultLanguage function. +// +// rule Rule object can be loaded from NewDefaultRule function. +func (textRank *TextRank) Populate( + text string, + lang convert.Language, + rule parse.Rule, +) { + parsedText := parse.TokenizeText(text, rule) + + for _, sentence := range parsedText.GetSentences() { + convert.TextToRank(sentence, lang, textRank.rank) + } +} + +// Ranking method counts the words and connections between the words, then it +// weights the numbers then normalize them in type float32 between 0.00 and +// 1.00. This is the 6th step to use TextRank. +// +// algorithm Algorithm is the object of the weighting and scoring methods. +func (textRank *TextRank) Ranking(algorithm rank.Algorithm) { + rank.Calculate(textRank.rank, algorithm) +} + +// GetRankData method retrieves the Rank data to that case if the developer want +// access to the whole graph and sentences, words, weights and all of the data +// to analyze it or just implement a new search logic or finder method. +func (textRank *TextRank) GetRankData() *rank.Rank { + return textRank.rank +} + +// FindPhrases function retrieves a slice of Phrase structures by TextRank +// object. The return value contains the sorted phrases with IDs, words, weights +// and quantities by weight from 1 to 0. Weight is calculated from quantities of +// relation between two words. 
A single phrase is from two words - not less and +// more. (But it's possible to find chain of phrases by +// FindSentencesByPhraseChain function.) +func FindPhrases(textRank *TextRank) []rank.Phrase { + return rank.FindPhrases(textRank.rank) +} + +// FindSingleWords function retrieves a slice of SingleWord structures by +// TextRank object. The return value contains the sorted words with IDs, words, +// weights and quantities by weight from 1 to 0. Weight is calculated from +// quantities of word. +func FindSingleWords(textRank *TextRank) []rank.SingleWord { + return rank.FindSingleWords(textRank.rank) +} + +// FindSentencesByRelationWeight function retrieves a slice of Sentence +// structures by TextRank object. The return value contains the ID of the +// sentence and the sentence text itself. The slice is sorted by weight of +// phrases from 1 to 0. +func FindSentencesByRelationWeight( + textRank *TextRank, + limit int, +) []rank.Sentence { + + return rank.FindSentences(textRank.rank, rank.ByRelation, limit) +} + +// FindSentencesByWordQtyWeight function retrieves a slice of Sentence +// structures by TextRank object. The return value contains the ID of the +// sentence and the sentence text itself. The slice is sorted by weight of word +// quantities from 1 to 0. +func FindSentencesByWordQtyWeight( + textRank *TextRank, + limit int, +) []rank.Sentence { + + return rank.FindSentences(textRank.rank, rank.ByQty, limit) +} + +// FindSentencesByPhraseChain function retrieves a slice of Sentence structures +// by TextRank object and slice of phrases. The return value contains the ID of +// the sentence and the sentence text itself. The slice is sorted by weight of +// word quantities from 1 to 0. +// +// textRank TextRank is the object of the TextRank. +// +// phrases []string is a slice of phrases. A single phrase is from two words, so +// when the slice contains 3 words the inner method will search for two phrases. 
+// The search algorithm seeks for "len(phrases)!". In case of three item the +// possible combination is 3 factorial (3!) = 3 * 2 * 1. +// +// rawText := "Long raw text, lorem ipsum..." +// rule := NewDefaultRule() +// language := NewDefaultLanguage() +// algorithm := NewDefaultAlgorithm() +// +// Append(rawText, language, rule, 1) +// Ranking(1, algorithm) +// +// FindSentencesByPhraseChain(1, []string{ +// "captain", +// "james", +// "kirk", +// }) +// +// The above code searches for captain james kirk, captain kirk james, james +// kirk captain, james captain kirk, kirk james captain and james kirk captain +// combinations in the graph. The 3 of words have to be related to each other +// in the same sentence but the search algorithm ignores the stop words. So if +// there is a sentence "James Kirk is the Captain of the Enterprise." the +// sentence will be returned because the words "is" and "the" are stop words. +func FindSentencesByPhraseChain( + textRank *TextRank, + phrases []string, +) []rank.Sentence { + + return rank.FindSentencesByPhrases(textRank.rank, phrases) +} + +// FindSentencesFrom function retrieves a slice of Sentence structures by +// TextRank object and by ID of the sentence. The return value contains the +// sentence text itself. The returned slice contains sentences sorted by their +// IDs started from the given sentence ID in ascending sort. +func FindSentencesFrom( + textRank *TextRank, + sentenceID int, + limit int, +) []rank.Sentence { + + return rank.FindSentencesFrom(textRank.rank, sentenceID, limit) +} |
