1package parse
2
3import (
4 "strings"
5)
6
7// TokenizeText function use the given raw text and parses by a Rule object and
8// retrieves the parsed text in a Text struct object.
9func TokenizeText(rawText string, rule Rule) Text {
10 return findSentences(rawText, rule)
11}
12
13func findSentences(rawText string, rule Rule) Text {
14 text := Text{}
15
16 var sentence string
17 var i int
18 slen := len(rawText)
19
20 for j, chr := range rawText {
21 j += len(string(chr))
22 //when separator or the last
23 if rule.IsSentenceSeparator(chr) || j == slen {
24 sentence = rawText[i:j]
25 if len(sentence) > 0 {
26 text.Append(sentence, findWords(sentence, rule))
27 }
28
29 sentence = ""
30 i = j
31 }
32 }
33
34 return text
35}
36
37func findWords(rawSentence string, rule Rule) (words []string) {
38 words = []string{}
39
40 var word string
41 var i int
42 slen := len(rawSentence)
43
44 for j, chr := range rawSentence {
45 chrlen := len(string(chr))
46 j += chrlen
47 //when separator or the last
48 if sep := rule.IsWordSeparator(chr); sep || j == slen {
49 if sep {
50 word = rawSentence[i : j-chrlen]
51 } else {
52 word = rawSentence[i:j]
53 }
54 if len(word) > 0 {
55 words = append(words, strings.ToLower(word))
56 }
57 word = ""
58 i = j
59 }
60 }
61
62 return
63}