1// Package html is an HTML5 lexer following the specifications at http://www.w3.org/TR/html5/syntax.html.
  2package html
  3
  4import (
  5	"strconv"
  6
  7	"github.com/tdewolff/parse/v2"
  8)
  9
 10// TokenType determines the type of token, eg. a number or a semicolon.
 11type TokenType uint32
 12
 13// TokenType values.
 14const (
 15	ErrorToken TokenType = iota // extra token when errors occur
 16	CommentToken
 17	DoctypeToken
 18	StartTagToken
 19	StartTagCloseToken
 20	StartTagVoidToken
 21	EndTagToken
 22	AttributeToken
 23	TextToken
 24	SvgToken
 25	MathToken
 26)
 27
 28// String returns the string representation of a TokenType.
 29func (tt TokenType) String() string {
 30	switch tt {
 31	case ErrorToken:
 32		return "Error"
 33	case CommentToken:
 34		return "Comment"
 35	case DoctypeToken:
 36		return "Doctype"
 37	case StartTagToken:
 38		return "StartTag"
 39	case StartTagCloseToken:
 40		return "StartTagClose"
 41	case StartTagVoidToken:
 42		return "StartTagVoid"
 43	case EndTagToken:
 44		return "EndTag"
 45	case AttributeToken:
 46		return "Attribute"
 47	case TextToken:
 48		return "Text"
 49	case SvgToken:
 50		return "Svg"
 51	case MathToken:
 52		return "Math"
 53	}
 54	return "Invalid(" + strconv.Itoa(int(tt)) + ")"
 55}
 56
 57////////////////////////////////////////////////////////////////
 58
// Lexer is the state for the lexer.
type Lexer struct {
	// r is the input that is tokenized.
	r   *parse.Input
	// err is an error raised by the lexer itself (see shiftXML); Err reports it in preference to the input's error.
	err error

	// rawTag is set by shiftStartTag to the hash of a raw text tag (textarea, title, style, xmp, iframe, script, plaintext)
	// and is reset to zero by Next once the raw text has been consumed.
	rawTag Hash
	// inTag is true after a start tag name was returned, while Next yields attributes until the tag is closed.
	inTag  bool

	// text is the value returned by Text; Next clears it at the start of every call.
	text    []byte
	// attrVal is the value returned by AttrVal; Next clears it on every call made while inside a tag.
	attrVal []byte
}
 70
 71// NewLexer returns a new Lexer for a given io.Reader.
 72func NewLexer(r *parse.Input) *Lexer {
 73	return &Lexer{
 74		r: r,
 75	}
 76}
 77
 78// Err returns the error encountered during lexing, this is often io.EOF but also other errors can be returned.
 79func (l *Lexer) Err() error {
 80	if l.err != nil {
 81		return l.err
 82	}
 83	return l.r.Err()
 84}
 85
// Text returns the textual representation of the token last returned by Next. This excludes delimiters and additional leading/trailing characters.
// Next clears the text on every call, so it is nil for tokens that do not set one (eg. StartTagCloseToken).
func (l *Lexer) Text() []byte {
	return l.text
}
 90
// AttrVal returns the attribute value when an AttributeToken was returned from Next.
// The value includes its surrounding quotes when it is quoted, and is nil when the attribute has no value.
func (l *Lexer) AttrVal() []byte {
	return l.attrVal
}
 95
 96// Next returns the next Token. It returns ErrorToken when an error was encountered. Using Err() one can retrieve the error message.
 97func (l *Lexer) Next() (TokenType, []byte) {
 98	l.text = nil
 99	var c byte
100	if l.inTag {
101		l.attrVal = nil
102		for { // before attribute name state
103			if c = l.r.Peek(0); c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f' {
104				l.r.Move(1)
105				continue
106			}
107			break
108		}
109		if c == 0 && l.r.Err() != nil {
110			return ErrorToken, nil
111		} else if c != '>' && (c != '/' || l.r.Peek(1) != '>') {
112			return AttributeToken, l.shiftAttribute()
113		}
114		l.r.Skip()
115		l.inTag = false
116		if c == '/' {
117			l.r.Move(2)
118			return StartTagVoidToken, l.r.Shift()
119		}
120		l.r.Move(1)
121		return StartTagCloseToken, l.r.Shift()
122	}
123
124	if l.rawTag != 0 {
125		if rawText := l.shiftRawText(); len(rawText) > 0 {
126			l.text = rawText
127			l.rawTag = 0
128			return TextToken, rawText
129		}
130		l.rawTag = 0
131	}
132
133	for {
134		c = l.r.Peek(0)
135		if c == '<' {
136			c = l.r.Peek(1)
137			isEndTag := c == '/' && l.r.Peek(2) != '>' && (l.r.Peek(2) != 0 || l.r.PeekErr(2) == nil)
138			if l.r.Pos() > 0 {
139				if isEndTag || 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || c == '!' || c == '?' {
140					// return currently buffered texttoken so that we can return tag next iteration
141					l.text = l.r.Shift()
142					return TextToken, l.text
143				}
144			} else if isEndTag {
145				l.r.Move(2)
146				// only endtags that are not followed by > or EOF arrive here
147				if c = l.r.Peek(0); !('a' <= c && c <= 'z' || 'A' <= c && c <= 'Z') {
148					return CommentToken, l.shiftBogusComment()
149				}
150				return EndTagToken, l.shiftEndTag()
151			} else if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' {
152				l.r.Move(1)
153				l.inTag = true
154				return l.shiftStartTag()
155			} else if c == '!' {
156				l.r.Move(2)
157				return l.readMarkup()
158			} else if c == '?' {
159				l.r.Move(1)
160				return CommentToken, l.shiftBogusComment()
161			}
162		} else if c == 0 && l.r.Err() != nil {
163			if l.r.Pos() > 0 {
164				l.text = l.r.Shift()
165				return TextToken, l.text
166			}
167			return ErrorToken, nil
168		}
169		l.r.Move(1)
170	}
171}
172
173////////////////////////////////////////////////////////////////
174
175// The following functions follow the specifications at https://html.spec.whatwg.org/multipage/parsing.html
176
// shiftRawText consumes the content that follows a raw text start tag (l.rawTag) and returns it.
// For plaintext everything up to the end of the input is consumed. For the other raw tags the
// content runs up to, but not including, the first `</` followed by a run of ASCII letters that
// hashes (case-insensitively) to l.rawTag, or up to the end of the input when there is no such end tag.
// Inside a script, a `<!--` section is scanned separately: a `</script` within it only ends the raw
// text when it is not preceded by a `<script` within that same section.
func (l *Lexer) shiftRawText() []byte {
	if l.rawTag == Plaintext {
		for {
			if l.r.Peek(0) == 0 && l.r.Err() != nil {
				return l.r.Shift()
			}
			l.r.Move(1)
		}
	} else { // RCDATA, RAWTEXT and SCRIPT
		for {
			c := l.r.Peek(0)
			if c == '<' {
				if l.r.Peek(1) == '/' {
					// mark is the position of the <, so the raw text can end right before it
					mark := l.r.Pos()
					l.r.Move(2)
					// scan the tag name: a run of ASCII letters
					for {
						if c = l.r.Peek(0); !('a' <= c && c <= 'z' || 'A' <= c && c <= 'Z') {
							break
						}
						l.r.Move(1)
					}
					if h := ToHash(parse.ToLower(parse.Copy(l.r.Lexeme()[mark+2:]))); h == l.rawTag { // copy so that ToLower doesn't change the case of the underlying slice
						// matching end tag: rewind so that the end tag itself is left for the next token
						l.r.Rewind(mark)
						return l.r.Shift()
					}
					// not the matching end tag: the scanned bytes stay part of the raw text
				} else if l.rawTag == Script && l.r.Peek(1) == '!' && l.r.Peek(2) == '-' && l.r.Peek(3) == '-' {
					l.r.Move(4)
					// inScript is true after a <script was seen inside this <!-- section
					inScript := false
					for {
						c := l.r.Peek(0)
						if c == '-' && l.r.Peek(1) == '-' && l.r.Peek(2) == '>' {
							l.r.Move(3)
							break
						} else if c == '<' {
							isEnd := l.r.Peek(1) == '/'
							if isEnd {
								l.r.Move(2)
							} else {
								l.r.Move(1)
							}
							// mark is the position of the first byte of the tag name
							mark := l.r.Pos()
							for {
								if c = l.r.Peek(0); !('a' <= c && c <= 'z' || 'A' <= c && c <= 'Z') {
									break
								}
								l.r.Move(1)
							}
							if h := ToHash(parse.ToLower(parse.Copy(l.r.Lexeme()[mark:]))); h == Script { // copy so that ToLower doesn't change the case of the underlying slice
								if !isEnd {
									inScript = true
								} else {
									if !inScript {
										// mark - 2 is the position of the < of </script
										l.r.Rewind(mark - 2)
										return l.r.Shift()
									}
									inScript = false
								}
							}
						} else if c == 0 && l.r.Err() != nil {
							return l.r.Shift()
						} else {
							l.r.Move(1)
						}
					}
				} else {
					l.r.Move(1)
				}
			} else if c == 0 && l.r.Err() != nil {
				return l.r.Shift()
			} else {
				l.r.Move(1)
			}
		}
	}
}
252
253func (l *Lexer) readMarkup() (TokenType, []byte) {
254	if l.at('-', '-') {
255		l.r.Move(2)
256		for {
257			if l.r.Peek(0) == 0 && l.r.Err() != nil {
258				l.text = l.r.Lexeme()[4:]
259				return CommentToken, l.r.Shift()
260			} else if l.at('-', '-', '>') {
261				l.text = l.r.Lexeme()[4:]
262				l.r.Move(3)
263				return CommentToken, l.r.Shift()
264			} else if l.at('-', '-', '!', '>') {
265				l.text = l.r.Lexeme()[4:]
266				l.r.Move(4)
267				return CommentToken, l.r.Shift()
268			}
269			l.r.Move(1)
270		}
271	} else if l.at('[', 'C', 'D', 'A', 'T', 'A', '[') {
272		l.r.Move(7)
273		for {
274			if l.r.Peek(0) == 0 && l.r.Err() != nil {
275				l.text = l.r.Lexeme()[9:]
276				return TextToken, l.r.Shift()
277			} else if l.at(']', ']', '>') {
278				l.text = l.r.Lexeme()[9:]
279				l.r.Move(3)
280				return TextToken, l.r.Shift()
281			}
282			l.r.Move(1)
283		}
284	} else {
285		if l.atCaseInsensitive('d', 'o', 'c', 't', 'y', 'p', 'e') {
286			l.r.Move(7)
287			if l.r.Peek(0) == ' ' {
288				l.r.Move(1)
289			}
290			for {
291				if c := l.r.Peek(0); c == '>' || c == 0 && l.r.Err() != nil {
292					l.text = l.r.Lexeme()[9:]
293					if c == '>' {
294						l.r.Move(1)
295					}
296					return DoctypeToken, l.r.Shift()
297				}
298				l.r.Move(1)
299			}
300		}
301	}
302	return CommentToken, l.shiftBogusComment()
303}
304
305func (l *Lexer) shiftBogusComment() []byte {
306	for {
307		c := l.r.Peek(0)
308		if c == '>' {
309			l.text = l.r.Lexeme()[2:]
310			l.r.Move(1)
311			return l.r.Shift()
312		} else if c == 0 && l.r.Err() != nil {
313			l.text = l.r.Lexeme()[2:]
314			return l.r.Shift()
315		}
316		l.r.Move(1)
317	}
318}
319
320func (l *Lexer) shiftStartTag() (TokenType, []byte) {
321	for {
322		if c := l.r.Peek(0); c == ' ' || c == '>' || c == '/' && l.r.Peek(1) == '>' || c == '\t' || c == '\n' || c == '\r' || c == '\f' || c == 0 && l.r.Err() != nil {
323			break
324		}
325		l.r.Move(1)
326	}
327	l.text = parse.ToLower(l.r.Lexeme()[1:])
328	if h := ToHash(l.text); h == Textarea || h == Title || h == Style || h == Xmp || h == Iframe || h == Script || h == Plaintext || h == Svg || h == Math {
329		if h == Svg || h == Math {
330			data := l.shiftXML(h)
331			if l.err != nil {
332				return ErrorToken, nil
333			}
334
335			l.inTag = false
336			if h == Svg {
337				return SvgToken, data
338			}
339			return MathToken, data
340		}
341		l.rawTag = h
342	}
343	return StartTagToken, l.r.Shift()
344}
345
346func (l *Lexer) shiftAttribute() []byte {
347	nameStart := l.r.Pos()
348	var c byte
349	for { // attribute name state
350		if c = l.r.Peek(0); c == ' ' || c == '=' || c == '>' || c == '/' && l.r.Peek(1) == '>' || c == '\t' || c == '\n' || c == '\r' || c == '\f' || c == 0 && l.r.Err() != nil {
351			break
352		}
353		l.r.Move(1)
354	}
355	nameEnd := l.r.Pos()
356	for { // after attribute name state
357		if c = l.r.Peek(0); c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f' {
358			l.r.Move(1)
359			continue
360		}
361		break
362	}
363	if c == '=' {
364		l.r.Move(1)
365		for { // before attribute value state
366			if c = l.r.Peek(0); c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f' {
367				l.r.Move(1)
368				continue
369			}
370			break
371		}
372		attrPos := l.r.Pos()
373		delim := c
374		if delim == '"' || delim == '\'' { // attribute value single- and double-quoted state
375			l.r.Move(1)
376			for {
377				c := l.r.Peek(0)
378				if c == delim {
379					l.r.Move(1)
380					break
381				} else if c == 0 && l.r.Err() != nil {
382					break
383				}
384				l.r.Move(1)
385			}
386		} else { // attribute value unquoted state
387			for {
388				if c := l.r.Peek(0); c == ' ' || c == '>' || c == '\t' || c == '\n' || c == '\r' || c == '\f' || c == 0 && l.r.Err() != nil {
389					break
390				}
391				l.r.Move(1)
392			}
393		}
394		l.attrVal = l.r.Lexeme()[attrPos:]
395	} else {
396		l.r.Rewind(nameEnd)
397		l.attrVal = nil
398	}
399	l.text = parse.ToLower(l.r.Lexeme()[nameStart:nameEnd])
400	return l.r.Shift()
401}
402
403func (l *Lexer) shiftEndTag() []byte {
404	for {
405		c := l.r.Peek(0)
406		if c == '>' {
407			l.text = l.r.Lexeme()[2:]
408			l.r.Move(1)
409			break
410		} else if c == 0 && l.r.Err() != nil {
411			l.text = l.r.Lexeme()[2:]
412			break
413		}
414		l.r.Move(1)
415	}
416
417	end := len(l.text)
418	for end > 0 {
419		if c := l.text[end-1]; c == ' ' || c == '\t' || c == '\n' || c == '\r' {
420			end--
421			continue
422		}
423		break
424	}
425	l.text = l.text[:end]
426	return parse.ToLower(l.r.Shift())
427}
428
429// shiftXML parses the content of a svg or math tag according to the XML 1.1 specifications, including the tag itself.
430// So far we have already parsed `<svg` or `<math`.
431func (l *Lexer) shiftXML(rawTag Hash) []byte {
432	inQuote := false
433	for {
434		c := l.r.Peek(0)
435		if c == '"' {
436			inQuote = !inQuote
437			l.r.Move(1)
438		} else if c == '<' && !inQuote && l.r.Peek(1) == '/' {
439			mark := l.r.Pos()
440			l.r.Move(2)
441			for {
442				if c = l.r.Peek(0); !('a' <= c && c <= 'z' || 'A' <= c && c <= 'Z') {
443					break
444				}
445				l.r.Move(1)
446			}
447			if h := ToHash(parse.ToLower(parse.Copy(l.r.Lexeme()[mark+2:]))); h == rawTag { // copy so that ToLower doesn't change the case of the underlying slice
448				break
449			}
450		} else if c == 0 {
451			if l.r.Err() == nil {
452				l.err = parse.NewErrorLexer(l.r, "HTML parse error: unexpected NULL character")
453			}
454			return l.r.Shift()
455		} else {
456			l.r.Move(1)
457		}
458	}
459
460	for {
461		c := l.r.Peek(0)
462		if c == '>' {
463			l.r.Move(1)
464			break
465		} else if c == 0 {
466			if l.r.Err() == nil {
467				l.err = parse.NewErrorLexer(l.r, "HTML parse error: unexpected NULL character")
468			}
469			return l.r.Shift()
470		}
471		l.r.Move(1)
472	}
473	return l.r.Shift()
474}
475
476////////////////////////////////////////////////////////////////
477
478func (l *Lexer) at(b ...byte) bool {
479	for i, c := range b {
480		if l.r.Peek(i) != c {
481			return false
482		}
483	}
484	return true
485}
486
487func (l *Lexer) atCaseInsensitive(b ...byte) bool {
488	for i, c := range b {
489		if l.r.Peek(i) != c && (l.r.Peek(i)+('a'-'A')) != c {
490			return false
491		}
492	}
493	return true
494}