diff options
Diffstat (limited to 'vendor/github.com/DavidBelicza/TextRank/v2/textrank.go')
| -rw-r--r-- | vendor/github.com/DavidBelicza/TextRank/v2/textrank.go | 194 |
1 files changed, 194 insertions, 0 deletions
diff --git a/vendor/github.com/DavidBelicza/TextRank/v2/textrank.go b/vendor/github.com/DavidBelicza/TextRank/v2/textrank.go new file mode 100644 index 0000000..ed48ce3 --- /dev/null +++ b/vendor/github.com/DavidBelicza/TextRank/v2/textrank.go | |||
| @@ -0,0 +1,194 @@ | |||
| 1 | package textrank | ||
| 2 | |||
| 3 | import ( | ||
| 4 | "github.com/DavidBelicza/TextRank/v2/convert" | ||
| 5 | "github.com/DavidBelicza/TextRank/v2/parse" | ||
| 6 | "github.com/DavidBelicza/TextRank/v2/rank" | ||
| 7 | ) | ||
| 8 | |||
| 9 | // TextRank structure contains the Rank data object. This structure is a wrapper | ||
| 10 | // around the whole text ranking functionality. | ||
| 11 | type TextRank struct { | ||
| 12 | rank *rank.Rank | ||
| 13 | } | ||
| 14 | |||
| 15 | // NewTextRank constructor retrieves a TextRank pointer. This is the 1th step to | ||
| 16 | // use TextRank. | ||
| 17 | func NewTextRank() *TextRank { | ||
| 18 | return &TextRank{ | ||
| 19 | rank.NewRank(), | ||
| 20 | } | ||
| 21 | } | ||
| 22 | |||
| 23 | // NewDefaultRule function retrieves a default Rule object what works in the | ||
| 24 | // most cases in English or similar Latin languages like French or Spanish. The | ||
| 25 | // Rule defines raw text how should be split to sentences and words. Because | ||
| 26 | // Rule is an interface it's possible modify the ranking by inject different | ||
| 27 | // Rule implementation. This is the 2nd step to use TextRank. | ||
| 28 | func NewDefaultRule() *parse.RuleDefault { | ||
| 29 | return parse.NewRule() | ||
| 30 | } | ||
| 31 | |||
| 32 | // NewDefaultLanguage function retrieves a default Language object. It defines | ||
| 33 | // what words are real and what words are just Stop Words or useless Junk Words. | ||
| 34 | // It uses the default English Stop Words, but it's possible to set different | ||
| 35 | // Stop Words in English or any other languages. Because Language is an | ||
| 36 | // interface it's possible to modify the ranking by inject different Language | ||
| 37 | // implementation. This is the 3rd step to use TextRank. | ||
| 38 | func NewDefaultLanguage() *convert.LanguageDefault { | ||
| 39 | return convert.NewLanguage() | ||
| 40 | } | ||
| 41 | |||
| 42 | // NewDefaultAlgorithm function retrieves an Algorithm object. It defines how | ||
| 43 | // should work the text ranking algorithm, the weighting. This is the general | ||
| 44 | // text rank by weighting the connection between the words to find the strongest | ||
| 45 | // phrases. Because Algorithm is an interface it's possible to modify the | ||
| 46 | // ranking algorithm by inject different implementation. This is the 4th step to | ||
| 47 | // use TextRank. | ||
| 48 | func NewDefaultAlgorithm() *rank.AlgorithmDefault { | ||
| 49 | return rank.NewAlgorithmDefault() | ||
| 50 | } | ||
| 51 | |||
| 52 | // NewChainAlgorithm function retrieves an Algorithm object. It defines how | ||
| 53 | // should work the text ranking algorithm, the weighting. This is an alternative | ||
| 54 | // way to ranking words by weighting the number of the words. Because Algorithm | ||
| 55 | // is an interface it's possible to modify the ranking algorithm by inject | ||
| 56 | // different implementation. This is the 4th step to use TextRank. | ||
| 57 | func NewChainAlgorithm() *rank.AlgorithmChain { | ||
| 58 | return rank.NewAlgorithmChain() | ||
| 59 | } | ||
| 60 | |||
| 61 | // Populate method adds a raw text to the text-ranking graph. It parses, | ||
| 62 | // tokenize the raw text and prepares it to weighting and scoring. It's possible | ||
| 63 | // to append a new raw text to an existing one even if the previously text is | ||
| 64 | // already ranked. This is 5th step to use TextRank. | ||
| 65 | // | ||
| 66 | // text string must be a plain text from TXT or PDF or any document, it can | ||
| 67 | // contain new lines, break lines or any unnecessary text parts, but it should | ||
| 68 | // not contain HTML tags or codes. | ||
| 69 | // | ||
| 70 | // lang Language object can be loaded from NewDefaultLanguage function. | ||
| 71 | // | ||
| 72 | // rule Rule object can be loaded from NewDefaultRule function. | ||
| 73 | func (textRank *TextRank) Populate( | ||
| 74 | text string, | ||
| 75 | lang convert.Language, | ||
| 76 | rule parse.Rule, | ||
| 77 | ) { | ||
| 78 | parsedText := parse.TokenizeText(text, rule) | ||
| 79 | |||
| 80 | for _, sentence := range parsedText.GetSentences() { | ||
| 81 | convert.TextToRank(sentence, lang, textRank.rank) | ||
| 82 | } | ||
| 83 | } | ||
| 84 | |||
| 85 | // Ranking method counts the words and connections between the words, then it | ||
| 86 | // weights the numbers then normalize them in type float32 between 0.00 and | ||
| 87 | // 1.00. This is the 6th step to use TextRank. | ||
| 88 | // | ||
| 89 | // algorithm Algorithm is the object of the weighting and scoring methods. | ||
| 90 | func (textRank *TextRank) Ranking(algorithm rank.Algorithm) { | ||
| 91 | rank.Calculate(textRank.rank, algorithm) | ||
| 92 | } | ||
| 93 | |||
| 94 | // GetRankData method retrieves the Rank data to that case if the developer want | ||
| 95 | // access to the whole graph and sentences, words, weights and all of the data | ||
| 96 | // to analyze it or just implement a new search logic or finder method. | ||
| 97 | func (textRank *TextRank) GetRankData() *rank.Rank { | ||
| 98 | return textRank.rank | ||
| 99 | } | ||
| 100 | |||
| 101 | // FindPhrases function retrieves a slice of Phrase structures by TextRank | ||
| 102 | // object. The return value contains the sorted phrases with IDs, words, weights | ||
| 103 | // and quantities by weight from 1 to 0. Weight is calculated from quantities of | ||
| 104 | // relation between two words. A single phrase is from two words - not less and | ||
| 105 | // more. (But it's possible to find chain of phrases by | ||
| 106 | // FindSentencesByPhraseChain function.) | ||
| 107 | func FindPhrases(textRank *TextRank) []rank.Phrase { | ||
| 108 | return rank.FindPhrases(textRank.rank) | ||
| 109 | } | ||
| 110 | |||
| 111 | // FindSingleWords function retrieves a slice of SingleWord structures by | ||
| 112 | // TextRank object. The return value contains the sorted words with IDs, words, | ||
| 113 | // weights and quantities by weight from 1 to 0. Weight is calculated from | ||
| 114 | // quantities of word. | ||
| 115 | func FindSingleWords(textRank *TextRank) []rank.SingleWord { | ||
| 116 | return rank.FindSingleWords(textRank.rank) | ||
| 117 | } | ||
| 118 | |||
| 119 | // FindSentencesByRelationWeight function retrieves a slice of Sentence | ||
| 120 | // structures by TextRank object. The return value contains the ID of the | ||
| 121 | // sentence and the sentence text itself. The slice is sorted by weight of | ||
| 122 | // phrases from 1 to 0. | ||
| 123 | func FindSentencesByRelationWeight( | ||
| 124 | textRank *TextRank, | ||
| 125 | limit int, | ||
| 126 | ) []rank.Sentence { | ||
| 127 | |||
| 128 | return rank.FindSentences(textRank.rank, rank.ByRelation, limit) | ||
| 129 | } | ||
| 130 | |||
| 131 | // FindSentencesByWordQtyWeight function retrieves a slice of Sentence | ||
| 132 | // structures by TextRank object. The return value contains the ID of the | ||
| 133 | // sentence and the sentence text itself. The slice is sorted by weight of word | ||
| 134 | // quantities from 1 to 0. | ||
| 135 | func FindSentencesByWordQtyWeight( | ||
| 136 | textRank *TextRank, | ||
| 137 | limit int, | ||
| 138 | ) []rank.Sentence { | ||
| 139 | |||
| 140 | return rank.FindSentences(textRank.rank, rank.ByQty, limit) | ||
| 141 | } | ||
| 142 | |||
| 143 | // FindSentencesByPhraseChain function retrieves a slice of Sentence structures | ||
| 144 | // by TextRank object and slice of phrases. The return value contains the ID of | ||
| 145 | // the sentence and the sentence text itself. The slice is sorted by weight of | ||
| 146 | // word quantities from 1 to 0. | ||
| 147 | // | ||
| 148 | // textRank TextRank is the object of the TextRank. | ||
| 149 | // | ||
| 150 | // phrases []string is a slice of phrases. A single phrase is from two words, so | ||
| 151 | // when the slice contains 3 words the inner method will search for two phrases. | ||
| 152 | // The search algorithm seeks for "len(phrases)!". In case of three item the | ||
| 153 | // possible combination is 3 factorial (3!) = 3 * 2 * 1. | ||
| 154 | // | ||
| 155 | // rawText := "Long raw text, lorem ipsum..." | ||
| 156 | // rule := NewDefaultRule() | ||
| 157 | // language := NewDefaultLanguage() | ||
| 158 | // algorithm := NewDefaultAlgorithm() | ||
| 159 | // | ||
| 160 | // Append(rawText, language, rule, 1) | ||
| 161 | // Ranking(1, algorithm) | ||
| 162 | // | ||
| 163 | // FindSentencesByPhraseChain(1, []string{ | ||
| 164 | // "captain", | ||
| 165 | // "james", | ||
| 166 | // "kirk", | ||
| 167 | // }) | ||
| 168 | // | ||
| 169 | // The above code searches for captain james kirk, captain kirk james, james | ||
| 170 | // kirk captain, james captain kirk, kirk james captain and james kirk captain | ||
| 171 | // combinations in the graph. The 3 of words have to be related to each other | ||
| 172 | // in the same sentence but the search algorithm ignores the stop words. So if | ||
| 173 | // there is a sentence "James Kirk is the Captain of the Enterprise." the | ||
| 174 | // sentence will be returned because the words "is" and "the" are stop words. | ||
| 175 | func FindSentencesByPhraseChain( | ||
| 176 | textRank *TextRank, | ||
| 177 | phrases []string, | ||
| 178 | ) []rank.Sentence { | ||
| 179 | |||
| 180 | return rank.FindSentencesByPhrases(textRank.rank, phrases) | ||
| 181 | } | ||
| 182 | |||
| 183 | // FindSentencesFrom function retrieves a slice of Sentence structures by | ||
| 184 | // TextRank object and by ID of the sentence. The return value contains the | ||
| 185 | // sentence text itself. The returned slice contains sentences sorted by their | ||
| 186 | // IDs started from the given sentence ID in ascending sort. | ||
| 187 | func FindSentencesFrom( | ||
| 188 | textRank *TextRank, | ||
| 189 | sentenceID int, | ||
| 190 | limit int, | ||
| 191 | ) []rank.Sentence { | ||
| 192 | |||
| 193 | return rank.FindSentencesFrom(textRank.rank, sentenceID, limit) | ||
| 194 | } | ||
