1package html
2
3import (
4 "github.com/tdewolff/parse/v2"
5 "github.com/tdewolff/parse/v2/html"
6)
7
8// Token is a single token unit with an attribute value (if given) and hash of the data.
9type Token struct {
10 html.TokenType
11 Hash Hash
12 Data []byte
13 Text []byte
14 AttrVal []byte
15 Traits traits
16 Offset int
17}
18
// TokenBuffer is a buffer that allows for token look-ahead.
// Tokens are pulled from the lexer on demand by Peek and Shift and stored in a
// reusable slice, so pointers handed out earlier may be overwritten by later calls.
type TokenBuffer struct {
	r *parse.Input // input whose Offset is sampled before each token is read
	l *html.Lexer  // lexer supplying the tokens

	buf []Token // buffered tokens; entries from pos onward have not been shifted yet
	pos int     // index into buf of the next token Shift will return

	attrBuffer []*Token // scratch slice reused for the result of Attributes
}
29
30// NewTokenBuffer returns a new TokenBuffer.
31func NewTokenBuffer(r *parse.Input, l *html.Lexer) *TokenBuffer {
32 return &TokenBuffer{
33 r: r,
34 l: l,
35 buf: make([]Token, 0, 8),
36 }
37}
38
39func (z *TokenBuffer) read(t *Token) {
40 t.Offset = z.r.Offset()
41 t.TokenType, t.Data = z.l.Next()
42 t.Text = z.l.Text()
43 if t.TokenType == html.AttributeToken {
44 t.Offset += 1 + len(t.Text) + 1
45 t.AttrVal = z.l.AttrVal()
46 if len(t.AttrVal) > 1 && (t.AttrVal[0] == '"' || t.AttrVal[0] == '\'') {
47 t.Offset++
48 t.AttrVal = t.AttrVal[1 : len(t.AttrVal)-1] // quotes will be readded in attribute loop if necessary
49 }
50 t.Hash = ToHash(t.Text)
51 t.Traits = attrMap[t.Hash]
52 } else if t.TokenType == html.StartTagToken || t.TokenType == html.EndTagToken {
53 t.AttrVal = nil
54 t.Hash = ToHash(t.Text)
55 t.Traits = tagMap[t.Hash] // zero if not exist
56 } else {
57 t.AttrVal = nil
58 t.Hash = 0
59 t.Traits = 0
60 }
61}
62
63// Peek returns the ith element and possibly does an allocation.
64// Peeking past an error will panic.
65func (z *TokenBuffer) Peek(pos int) *Token {
66 pos += z.pos
67 if pos >= len(z.buf) {
68 if len(z.buf) > 0 && z.buf[len(z.buf)-1].TokenType == html.ErrorToken {
69 return &z.buf[len(z.buf)-1]
70 }
71
72 c := cap(z.buf)
73 d := len(z.buf) - z.pos
74 p := pos - z.pos + 1 // required peek length
75 var buf []Token
76 if 2*p > c {
77 buf = make([]Token, 0, 2*c+p)
78 } else {
79 buf = z.buf
80 }
81 copy(buf[:d], z.buf[z.pos:])
82
83 buf = buf[:p]
84 pos -= z.pos
85 for i := d; i < p; i++ {
86 z.read(&buf[i])
87 if buf[i].TokenType == html.ErrorToken {
88 buf = buf[:i+1]
89 pos = i
90 break
91 }
92 }
93 z.pos, z.buf = 0, buf
94 }
95 return &z.buf[pos]
96}
97
98// Shift returns the first element and advances position.
99func (z *TokenBuffer) Shift() *Token {
100 if z.pos >= len(z.buf) {
101 t := &z.buf[:1][0]
102 z.read(t)
103 return t
104 }
105 t := &z.buf[z.pos]
106 z.pos++
107 return t
108}
109
110// Attributes extracts the gives attribute hashes from a tag.
111// It returns in the same order pointers to the requested token data or nil.
112func (z *TokenBuffer) Attributes(hashes ...Hash) []*Token {
113 n := 0
114 for {
115 if t := z.Peek(n); t.TokenType != html.AttributeToken {
116 break
117 }
118 n++
119 }
120 if len(hashes) > cap(z.attrBuffer) {
121 z.attrBuffer = make([]*Token, len(hashes))
122 } else {
123 z.attrBuffer = z.attrBuffer[:len(hashes)]
124 for i := range z.attrBuffer {
125 z.attrBuffer[i] = nil
126 }
127 }
128 for i := z.pos; i < z.pos+n; i++ {
129 attr := &z.buf[i]
130 for j, hash := range hashes {
131 if hash == attr.Hash {
132 z.attrBuffer[j] = attr
133 }
134 }
135 }
136 return z.attrBuffer
137}