1package html
  2
  3import (
  4	"github.com/tdewolff/parse/v2"
  5	"github.com/tdewolff/parse/v2/html"
  6)
  7
// Token is a single token unit with an attribute value (if given) and hash of the data.
// The embedded html.TokenType is the type returned by the lexer's Next call.
// All fields are (re)assigned by TokenBuffer.read each time a token is read.
type Token struct {
	html.TokenType
	Hash    Hash   // ToHash(Text) for attributes and start/end tags, 0 for other tokens
	Data    []byte // data returned by the lexer's Next call
	Text    []byte // result of the lexer's Text call for this token
	AttrVal []byte // attribute value with surrounding quotes removed; nil for non-attributes
	Traits  traits // attrMap/tagMap entry for Hash; zero for other tokens or unknown names
	Offset  int    // input offset taken before the token was read, advanced for attributes (see read)
}
 18
// TokenBuffer is a buffer that allows for token look-ahead.
// Tokens are read on demand from the lexer; Peek stores them in buf and Shift
// hands them out one at a time.
type TokenBuffer struct {
	r *parse.Input // input whose Offset is recorded in every token that is read
	l *html.Lexer  // lexer queried through Next, Text and AttrVal

	buf []Token // buffered tokens; entries from pos onward have not been shifted yet
	pos int     // index in buf of the next token returned by Shift

	attrBuffer []*Token // result slice reused between calls to Attributes
}
 29
 30// NewTokenBuffer returns a new TokenBuffer.
 31func NewTokenBuffer(r *parse.Input, l *html.Lexer) *TokenBuffer {
 32	return &TokenBuffer{
 33		r:   r,
 34		l:   l,
 35		buf: make([]Token, 0, 8),
 36	}
 37}
 38
 39func (z *TokenBuffer) read(t *Token) {
 40	t.Offset = z.r.Offset()
 41	t.TokenType, t.Data = z.l.Next()
 42	t.Text = z.l.Text()
 43	if t.TokenType == html.AttributeToken {
 44		t.Offset += 1 + len(t.Text) + 1
 45		t.AttrVal = z.l.AttrVal()
 46		if len(t.AttrVal) > 1 && (t.AttrVal[0] == '"' || t.AttrVal[0] == '\'') {
 47			t.Offset++
 48			t.AttrVal = t.AttrVal[1 : len(t.AttrVal)-1] // quotes will be readded in attribute loop if necessary
 49		}
 50		t.Hash = ToHash(t.Text)
 51		t.Traits = attrMap[t.Hash]
 52	} else if t.TokenType == html.StartTagToken || t.TokenType == html.EndTagToken {
 53		t.AttrVal = nil
 54		t.Hash = ToHash(t.Text)
 55		t.Traits = tagMap[t.Hash] // zero if not exist
 56	} else {
 57		t.AttrVal = nil
 58		t.Hash = 0
 59		t.Traits = 0
 60	}
 61}
 62
 63// Peek returns the ith element and possibly does an allocation.
 64// Peeking past an error will panic.
 65func (z *TokenBuffer) Peek(pos int) *Token {
 66	pos += z.pos
 67	if pos >= len(z.buf) {
 68		if len(z.buf) > 0 && z.buf[len(z.buf)-1].TokenType == html.ErrorToken {
 69			return &z.buf[len(z.buf)-1]
 70		}
 71
 72		c := cap(z.buf)
 73		d := len(z.buf) - z.pos
 74		p := pos - z.pos + 1 // required peek length
 75		var buf []Token
 76		if 2*p > c {
 77			buf = make([]Token, 0, 2*c+p)
 78		} else {
 79			buf = z.buf
 80		}
 81		copy(buf[:d], z.buf[z.pos:])
 82
 83		buf = buf[:p]
 84		pos -= z.pos
 85		for i := d; i < p; i++ {
 86			z.read(&buf[i])
 87			if buf[i].TokenType == html.ErrorToken {
 88				buf = buf[:i+1]
 89				pos = i
 90				break
 91			}
 92		}
 93		z.pos, z.buf = 0, buf
 94	}
 95	return &z.buf[pos]
 96}
 97
 98// Shift returns the first element and advances position.
 99func (z *TokenBuffer) Shift() *Token {
100	if z.pos >= len(z.buf) {
101		t := &z.buf[:1][0]
102		z.read(t)
103		return t
104	}
105	t := &z.buf[z.pos]
106	z.pos++
107	return t
108}
109
110// Attributes extracts the gives attribute hashes from a tag.
111// It returns in the same order pointers to the requested token data or nil.
112func (z *TokenBuffer) Attributes(hashes ...Hash) []*Token {
113	n := 0
114	for {
115		if t := z.Peek(n); t.TokenType != html.AttributeToken {
116			break
117		}
118		n++
119	}
120	if len(hashes) > cap(z.attrBuffer) {
121		z.attrBuffer = make([]*Token, len(hashes))
122	} else {
123		z.attrBuffer = z.attrBuffer[:len(hashes)]
124		for i := range z.attrBuffer {
125			z.attrBuffer[i] = nil
126		}
127	}
128	for i := z.pos; i < z.pos+n; i++ {
129		attr := &z.buf[i]
130		for j, hash := range hashes {
131			if hash == attr.Hash {
132				z.attrBuffer[j] = attr
133			}
134		}
135	}
136	return z.attrBuffer
137}