diff options
Diffstat (limited to 'vendor/github.com/DavidBelicza/TextRank/v2/rank')
5 files changed, 591 insertions, 0 deletions
diff --git a/vendor/github.com/DavidBelicza/TextRank/v2/rank/algorithm.go b/vendor/github.com/DavidBelicza/TextRank/v2/rank/algorithm.go new file mode 100644 index 0000000..8f9345f --- /dev/null +++ b/vendor/github.com/DavidBelicza/TextRank/v2/rank/algorithm.go @@ -0,0 +1,99 @@ +package rank + +import ( + "math" +) + +// Algorithm interface and its methods make possible the polimorf usage of +// weighting process. +type Algorithm interface { + WeightingRelation( + word1ID int, + word2ID int, + rank *Rank, + ) float32 + + WeightingHits( + wordID int, + rank *Rank, + ) float32 +} + +// AlgorithmDefault struct is the basic implementation of Algorithm. It can +// weight a word or phrase by comparing them. +type AlgorithmDefault struct{} + +// NewAlgorithmDefault constructor retrieves an AlgorithmDefault pointer. +func NewAlgorithmDefault() *AlgorithmDefault { + return &AlgorithmDefault{} +} + +// WeightingRelation method is the traditional algorithm of text rank to +// weighting a phrase. +func (a *AlgorithmDefault) WeightingRelation( + word1ID int, + word2ID int, + rank *Rank, +) float32 { + relationQty := rank.Relation.Node[word1ID][word2ID].Qty + + return float32(relationQty) +} + +// WeightingHits method ranks the words by their occurrence. +func (a *AlgorithmDefault) WeightingHits( + wordID int, + rank *Rank, +) float32 { + weight := rank.Words[wordID].Qty + + return float32(weight) +} + +// AlgorithmChain struct is the combined implementation of Algorithm. It is a +// good example how weighting can be changed by a different implementations. It +// can weight a word or phrase by comparing them. +type AlgorithmChain struct{} + +// NewAlgorithmChain constructor retrieves an AlgorithmChain pointer. +func NewAlgorithmChain() *AlgorithmChain { + return &AlgorithmChain{} +} + +// WeightingRelation method is a combined algorithm of text rank and word +// occurrence, it weights a phrase. +func (a *AlgorithmChain) WeightingRelation( + word1ID int, + word2ID int, + rank *Rank, +) float32 { + relationQty := rank.Relation.Node[word1ID][word2ID].Qty + word1Qty := rank.Words[word1ID].Qty + word2Qty := rank.Words[word2ID].Qty + + qDiff := float32(math.Abs(float64(word1Qty)-float64(word2Qty))) / 100 + weight := float32(relationQty) + qDiff + + return weight +} + +// WeightingHits method ranks the words by their occurrence. +func (a *AlgorithmChain) WeightingHits( + wordID int, + rank *Rank, +) float32 { + word := rank.Words[wordID] + qty := 0 + + for leftWordID, leftWordQty := range word.ConnectionLeft { + qty += rank.Words[leftWordID].Qty * leftWordQty + } + + for rightWordID, rightWordQty := range word.ConnectionRight { + qty += rank.Words[rightWordID].Qty * rightWordQty + } + + weight := float32(word.Qty) + (float32(qty)) + + return float32(weight) +} diff --git a/vendor/github.com/DavidBelicza/TextRank/v2/rank/rank.go b/vendor/github.com/DavidBelicza/TextRank/v2/rank/rank.go new file mode 100644 index 0000000..3bcef7c --- /dev/null +++ b/vendor/github.com/DavidBelicza/TextRank/v2/rank/rank.go @@ -0,0 +1,147 @@ +package rank + +// Rank struct contains every original raw sentences, words, tokens, phrases, +// indexes, word hits, phrase hits and minimum-maximum values. +// +// Max is the occurrence of the most used word. +// +// Min is the occurrence of the less used word. It is always greater then 0. +// +// Relation is the Relation object, contains phrases. +// +// SentenceMap contains raw sentences. Index is the sentence ID, value is the +// sentence itself. +// +// Words contains Word objects. Index is the word ID, value is the word/token +// itself. +// +// WordValID contains words. Index is the word/token, value is the ID. +type Rank struct { + Max float32 + Min float32 + Relation Relation + SentenceMap map[int]string + Words map[int]*Word + WordValID map[string]int +} + +// Word struct contains all data about the words. +// +// If a word is multiple times in the text then the multiple words point to the +// same ID. So Word is unique. +// +// SentenceIDs contains all IDs of sentences what contain the word. +// +// ConnectionLeft contains all words what are connected to this word on the left +// side. The map index is the ID of the related word and its value is the +// occurrence. +// +// ConnectionRight contains all words what are connected to this word on the +// right side. The map index is the ID of the related word and its value is the +// occurrence. +// +// Token is the word itself, but not the original, it is tokenized. +// +// Qty is the number of occurrence of the word. +// +// Weight is the weight of the word between 0.00 and 1.00. +type Word struct { + ID int + SentenceIDs []int + ConnectionLeft map[int]int + ConnectionRight map[int]int + Token string + Qty int + Weight float32 +} + +// NewRank constructor retrieves a Rank pointer. +func NewRank() *Rank { + return &Rank{ + 0, + 0, + Relation{ + 0, + 0, + make(map[int]map[int]Score), + }, + make(map[int]string), + make(map[int]*Word), + make(map[string]int), + } +} + +// IsWordExist method retrieves true when the given word is already in the rank. +func (rank *Rank) IsWordExist(word string) bool { + _, find := rank.WordValID[word] + + return find +} + +// AddNewWord method adds a new word to the rank object and it defines its ID. +func (rank *Rank) AddNewWord(word string, prevWordIdx int, sentenceID int) (wordID int) { + wordID = len(rank.Words) + connectionLeft := make(map[int]int) + + if prevWordIdx >= 0 { + connectionLeft[prevWordIdx] = 1 + } + + newWord := &Word{ + ID: wordID, + SentenceIDs: []int{sentenceID}, + ConnectionLeft: connectionLeft, + ConnectionRight: make(map[int]int), + Token: word, + Qty: 1, + Weight: 0, + } + + rank.Words[wordID] = newWord + rank.WordValID[word] = wordID + + return +} + +// UpdateWord method update a word what already exists in the rank object. It +// retrieves its ID. +func (rank *Rank) UpdateWord(word string, prevWordIdx int, sentenceID int) (wordID int) { + wordID = rank.WordValID[word] + + found := false + + for _, oldSentenceID := range rank.Words[wordID].SentenceIDs { + if sentenceID == oldSentenceID { + found = true + break + } + } + + if !found { + rank.Words[wordID].SentenceIDs = append( + rank.Words[wordID].SentenceIDs, + sentenceID, + ) + } + + rank.Words[wordID].Qty++ + + if prevWordIdx >= 0 { + rank.Words[wordID].ConnectionLeft[prevWordIdx]++ + } + + return +} + +// UpdateRightConnection method adds the right connection to the word. It always +// can be used after a word has added and the next word is known. +func (rank *Rank) UpdateRightConnection(wordID int, rightWordID int) { + if wordID >= 0 { + rank.Words[wordID].ConnectionRight[rightWordID]++ + } +} + +// GetWordData method retrieves all words as a pointer. +func (rank *Rank) GetWordData() map[int]*Word { + return rank.Words +} diff --git a/vendor/github.com/DavidBelicza/TextRank/v2/rank/ranking.go b/vendor/github.com/DavidBelicza/TextRank/v2/rank/ranking.go new file mode 100644 index 0000000..5fd2dfa --- /dev/null +++ b/vendor/github.com/DavidBelicza/TextRank/v2/rank/ranking.go @@ -0,0 +1,66 @@ +package rank + +// Calculate function ranking words by the given algorithm implementation. +func Calculate(ranks *Rank, algorithm Algorithm) { + updateRanks(ranks, algorithm) +} + +func updateRanks(ranks *Rank, algorithm Algorithm) { + for _, word := range ranks.Words { + weight := algorithm.WeightingHits(word.ID, ranks) + word.Weight = weight + + if ranks.Max < word.Weight { + ranks.Max = word.Weight + } + + if ranks.Min > word.Weight || ranks.Min == 0 { + ranks.Min = word.Weight + } + } + + for _, word := range ranks.Words { + word.Weight = normalize(word.Weight, ranks.Min, ranks.Max) + } + + for x, xMap := range ranks.Relation.Node { + for y := range xMap { + sentenceIDs := ranks.Relation.Node[x][y].SentenceIDs + weight := algorithm.WeightingRelation(x, y, ranks) + + ranks.Relation.Node[x][y] = Score{ + ranks.Relation.Node[x][y].Qty, + weight, + sentenceIDs, + } + + if ranks.Relation.Max < weight { + ranks.Relation.Max = weight + } + + if ranks.Relation.Min > weight || ranks.Relation.Min == 0 { + ranks.Relation.Min = weight + } + } + } + + for x, xMap := range ranks.Relation.Node { + for y := range xMap { + weight := normalize( + ranks.Relation.Node[x][y].Weight, + ranks.Relation.Min, + ranks.Relation.Max, + ) + + ranks.Relation.Node[x][y] = Score{ + ranks.Relation.Node[x][y].Qty, + weight, + ranks.Relation.Node[x][y].SentenceIDs, + } + } + } +} + +func normalize(weight float32, min float32, max float32) float32 { + return (weight - min) / (max - min) +} diff --git a/vendor/github.com/DavidBelicza/TextRank/v2/rank/relation.go b/vendor/github.com/DavidBelicza/TextRank/v2/rank/relation.go new file mode 100644 index 0000000..cb8b97e --- /dev/null +++ b/vendor/github.com/DavidBelicza/TextRank/v2/rank/relation.go @@ -0,0 +1,77 @@ +package rank + +// Relation struct contains the phrase data. +// +// Max is the occurrence of the most used phrase. +// +// Min is the occurrence of the less used phrase. It is always greater then 0. +// +// Node is contains the Scores. Firs ID is the word 1, second ID is the word 2, +// and the value is the Score what contains the data about their relation. +type Relation struct { + Max float32 + Min float32 + Node map[int]map[int]Score +} + +// Score struct contains data about a relation of two words. +// +// Qty is the occurrence of the phrase. +// +// Weight is the weight of the phrase between 0.00 and 1.00. +// +// SentenceIDs contains all IDs of sentences what contain the phrase. +type Score struct { + Qty int + Weight float32 + SentenceIDs []int +} + +// AddRelation method adds a new relation to Relation object. +func (relation *Relation) AddRelation(wordID int, relatedWordID int, sentenceID int) { + if relatedWordID == -1 { + return + } + + if relation.updateRelation(relatedWordID, wordID, true, sentenceID) { + return + } + + if relation.extendRelation(wordID, relatedWordID, true, sentenceID) { + return + } + + relation.createRelation(wordID, relatedWordID, sentenceID) +} + +func (relation *Relation) updateRelation(x int, y int, r bool, sentenceID int) bool { + if _, ok := relation.Node[x][y]; ok { + count := relation.Node[x][y].Qty + 1 + weight := relation.Node[x][y].Weight + sentenceIDs := append(relation.Node[x][y].SentenceIDs, sentenceID) + relation.Node[x][y] = Score{count, weight, sentenceIDs} + + return true + } else if r { + return relation.updateRelation(y, x, false, sentenceID) + } + + return false +} + +func (relation *Relation) extendRelation(x int, y int, r bool, sentenceID int) bool { + if _, ok := relation.Node[x]; ok { + relation.Node[x][y] = Score{1, 0, []int{sentenceID}} + + return true + } else if r { + return relation.extendRelation(y, x, false, sentenceID) + } + + return false +} + +func (relation *Relation) createRelation(x int, y int, sentenceID int) { + relation.Node[x] = map[int]Score{} + relation.Node[x][y] = Score{1, 0, []int{sentenceID}} +} diff --git a/vendor/github.com/DavidBelicza/TextRank/v2/rank/sorting.go b/vendor/github.com/DavidBelicza/TextRank/v2/rank/sorting.go new file mode 100644 index 0000000..6d00a97 --- /dev/null +++ b/vendor/github.com/DavidBelicza/TextRank/v2/rank/sorting.go @@ -0,0 +1,202 @@ +package rank + +import ( + "sort" +) + +// Phrase struct contains a single phrase and its data. +// +// LeftID is the ID of the word 1. +// +// RightID is the ID of the word 2. +// +// Left is the token of the word 1. +// +// Right is the token of the word 2. +// +// Weight is between 0.00 and 1.00. +// +// Qty is the occurrence of the phrase. +type Phrase struct { + LeftID int + RightID int + Left string + Right string + Weight float32 + Qty int +} + +// FindPhrases function has wrapper textrank.FindPhrases. Use the wrapper +// instead. +func FindPhrases(ranks *Rank) []Phrase { + var phrases []Phrase + + for x, xMap := range ranks.Relation.Node { + for y := range xMap { + phrases = append(phrases, Phrase{ + ranks.Words[x].ID, + ranks.Words[y].ID, + ranks.Words[x].Token, + ranks.Words[y].Token, + ranks.Relation.Node[x][y].Weight, + ranks.Relation.Node[x][y].Qty, + }) + } + } + + sort.Slice(phrases, func(i, j int) bool { + return phrases[i].Weight > phrases[j].Weight + }) + + return phrases +} + +// SingleWord struct contains a single word and its data. +// +// ID of the word. +// +// Word itself, the token. +// +// Weight of the word between 0.00 and 1.00. +// +// Quantity of the word. +type SingleWord struct { + ID int + Word string + Weight float32 + Qty int +} + +// FindSingleWords function has wrapper textrank.FindSingleWords. Use the +// wrapper instead. +func FindSingleWords(ranks *Rank) []SingleWord { + var singleWords []SingleWord + + for _, word := range ranks.Words { + singleWords = append(singleWords, SingleWord{ + word.ID, + word.Token, + word.Weight, + word.Qty, + }) + } + + sort.Slice(singleWords, func(i, j int) bool { + return singleWords[i].Weight > singleWords[j].Weight + }) + + return singleWords +} + +// Sentence struct contains a single sentence and its data. +type Sentence struct { + ID int + Value string +} + +// ByQty filter by occurrence of word. +const ByQty = 0 + +// ByRelation filter by phrase weight. +const ByRelation = 1 + +// FindSentences function has wrappers textrank.FindSentencesByRelationWeight +// and textrank.FindSentencesByWordQtyWeight. Use the wrappers instead. +func FindSentences(ranks *Rank, kind int, limit int) []Sentence { + var sentences []Sentence + + cache := make(map[int]bool) + + collect := func(sentenceIDs []int) bool { + for _, id := range sentenceIDs { + if len(sentences) >= limit { + return true + } + + if !cache[id] { + sentences = append(sentences, Sentence{id, ranks.SentenceMap[id]}) + cache[id] = true + } + } + + return false + } + + if kind == ByQty { + singleWords := FindSingleWords(ranks) + + for _, singleWord := range singleWords { + sentenceIDs := ranks.Words[singleWord.ID].SentenceIDs + + if collect(sentenceIDs) { + return sentences + } + } + } else if kind == ByRelation { + phrases := FindPhrases(ranks) + + for _, phrase := range phrases { + sentenceIDs := ranks.Relation.Node[phrase.LeftID][phrase.RightID].SentenceIDs + + if collect(sentenceIDs) { + return sentences + } + } + } + + return sentences +} + +// FindSentencesByPhrases function has wrapper +// textrank.FindSentencesByPhraseChain. Use the wrapper instead. +func FindSentencesByPhrases(ranks *Rank, words []string) []Sentence { + var sentences []Sentence + + reqMatch := len(words) - 1 + sentenceIDs := make(map[int]int) + + for _, i := range words { + for _, j := range words { + x := ranks.WordValID[i] + y := ranks.WordValID[j] + + if _, ok := ranks.Relation.Node[x][y]; ok { + curSentenceIDs := ranks.Relation.Node[x][y].SentenceIDs + + for _, id := range curSentenceIDs { + if _, ok := sentenceIDs[id]; ok { + sentenceIDs[id]++ + } else { + sentenceIDs[id] = 1 + } + } + } + } + } + + for sentenceID, v := range sentenceIDs { + if v >= reqMatch { + sentences = append(sentences, Sentence{sentenceID, ranks.SentenceMap[sentenceID]}) + } + } + + sort.Slice(sentences, func(i, j int) bool { + return sentences[i].ID < sentences[j].ID + }) + + return sentences +} + +// FindSentencesFrom function has wrapper textrank.FindSentencesFrom. Use the +// wrapper instead. +func FindSentencesFrom(ranks *Rank, id int, limit int) []Sentence { + var sentences []Sentence + + limit = id + limit - 1 + + for i := id; i <= limit; i++ { + sentences = append(sentences, Sentence{i, ranks.SentenceMap[i]}) + } + + return sentences +} |
