1package parse
 2
 3import (
 4	"strings"
 5)
 6
 7// TokenizeText function use the given raw text and parses by a Rule object and
 8// retrieves the parsed text in a Text struct object.
 9func TokenizeText(rawText string, rule Rule) Text {
10	return findSentences(rawText, rule)
11}
12
13func findSentences(rawText string, rule Rule) Text {
14	text := Text{}
15
16	var sentence string
17	var i int
18	slen := len(rawText)
19
20	for j, chr := range rawText {
21		j += len(string(chr))
22		//when separator or the last
23		if rule.IsSentenceSeparator(chr) || j == slen {
24			sentence = rawText[i:j]
25			if len(sentence) > 0 {
26				text.Append(sentence, findWords(sentence, rule))
27			}
28
29			sentence = ""
30			i = j
31		}
32	}
33
34	return text
35}
36
37func findWords(rawSentence string, rule Rule) (words []string) {
38	words = []string{}
39
40	var word string
41	var i int
42	slen := len(rawSentence)
43
44	for j, chr := range rawSentence {
45		chrlen := len(string(chr))
46		j += chrlen
47		//when separator or the last
48		if sep := rule.IsWordSeparator(chr); sep || j == slen {
49			if sep {
50				word = rawSentence[i : j-chrlen]
51			} else {
52				word = rawSentence[i:j]
53			}
54			if len(word) > 0 {
55				words = append(words, strings.ToLower(word))
56			}
57			word = ""
58			i = j
59		}
60	}
61
62	return
63}