1package rank
2
// Rank holds the raw sentences, the unique words/tokens, the word relations
// (phrases) and the minimum/maximum word occurrence values.
//
// Max is the occurrence of the most used word.
//
// Min is the occurrence of the least used word. It is meant to always be
// greater than 0.
// NOTE(review): NewRank initializes both Max and Min to 0, so they presumably
// only hold meaningful values after a later ranking step — confirm.
//
// Relation is the Relation object; it contains the phrases.
//
// SentenceMap contains the raw sentences. The key is the sentence ID, the
// value is the sentence itself.
//
// Words contains the Word objects. The key is the word ID, the value is a
// pointer to the Word.
//
// WordValID is the reverse lookup of Words. The key is the word/token, the
// value is the word ID.
type Rank struct {
	Max         float32
	Min         float32
	Relation    Relation
	SentenceMap map[int]string
	Words       map[int]*Word
	WordValID   map[string]int
}
27
// Word holds all data collected about a single word.
//
// A word that occurs multiple times in the text is stored only once: every
// occurrence maps to the same ID, so each Word is unique.
//
// ID is the identifier of the word; it is also the key under which the word
// is stored in Rank.Words.
//
// SentenceIDs contains the IDs of all sentences that contain the word.
//
// ConnectionLeft contains all words that are connected to this word on the
// left side. The key is the ID of the related word, the value is the number
// of occurrences of that connection.
//
// ConnectionRight contains all words that are connected to this word on the
// right side. The key is the ID of the related word, the value is the number
// of occurrences of that connection.
//
// Token is the word itself, not in its original form but tokenized.
//
// Qty is the number of occurrences of the word.
//
// Weight is the weight of the word, between 0.00 and 1.00.
type Word struct {
	ID              int
	SentenceIDs     []int
	ConnectionLeft  map[int]int
	ConnectionRight map[int]int
	Token           string
	Qty             int
	Weight          float32
}
57
58// NewRank constructor retrieves a Rank pointer.
59func NewRank() *Rank {
60 return &Rank{
61 0,
62 0,
63 Relation{
64 0,
65 0,
66 make(map[int]map[int]Score),
67 },
68 make(map[int]string),
69 make(map[int]*Word),
70 make(map[string]int),
71 }
72}
73
74// IsWordExist method retrieves true when the given word is already in the rank.
75func (rank *Rank) IsWordExist(word string) bool {
76 _, find := rank.WordValID[word]
77
78 return find
79}
80
81// AddNewWord method adds a new word to the rank object and it defines its ID.
82func (rank *Rank) AddNewWord(word string, prevWordIdx int, sentenceID int) (wordID int) {
83 wordID = len(rank.Words)
84 connectionLeft := make(map[int]int)
85
86 if prevWordIdx >= 0 {
87 connectionLeft[prevWordIdx] = 1
88 }
89
90 newWord := &Word{
91 ID: wordID,
92 SentenceIDs: []int{sentenceID},
93 ConnectionLeft: connectionLeft,
94 ConnectionRight: make(map[int]int),
95 Token: word,
96 Qty: 1,
97 Weight: 0,
98 }
99
100 rank.Words[wordID] = newWord
101 rank.WordValID[word] = wordID
102
103 return
104}
105
106// UpdateWord method update a word what already exists in the rank object. It
107// retrieves its ID.
108func (rank *Rank) UpdateWord(word string, prevWordIdx int, sentenceID int) (wordID int) {
109 wordID = rank.WordValID[word]
110
111 found := false
112
113 for _, oldSentenceID := range rank.Words[wordID].SentenceIDs {
114 if sentenceID == oldSentenceID {
115 found = true
116 break
117 }
118 }
119
120 if !found {
121 rank.Words[wordID].SentenceIDs = append(
122 rank.Words[wordID].SentenceIDs,
123 sentenceID,
124 )
125 }
126
127 rank.Words[wordID].Qty++
128
129 if prevWordIdx >= 0 {
130 rank.Words[wordID].ConnectionLeft[prevWordIdx]++
131 }
132
133 return
134}
135
136// UpdateRightConnection method adds the right connection to the word. It always
137// can be used after a word has added and the next word is known.
138func (rank *Rank) UpdateRightConnection(wordID int, rightWordID int) {
139 if wordID >= 0 {
140 rank.Words[wordID].ConnectionRight[rightWordID]++
141 }
142}
143
144// GetWordData method retrieves all words as a pointer.
145func (rank *Rank) GetWordData() map[int]*Word {
146 return rank.Words
147}