aboutsummaryrefslogtreecommitdiff
path: root/vendor/github.com/tdewolff/parse/v2/html/lex.go
diff options
context:
space:
mode:
Diffstat (limited to 'vendor/github.com/tdewolff/parse/v2/html/lex.go')
-rw-r--r--vendor/github.com/tdewolff/parse/v2/html/lex.go494
1 files changed, 494 insertions, 0 deletions
diff --git a/vendor/github.com/tdewolff/parse/v2/html/lex.go b/vendor/github.com/tdewolff/parse/v2/html/lex.go
new file mode 100644
index 0000000..4325024
--- /dev/null
+++ b/vendor/github.com/tdewolff/parse/v2/html/lex.go
@@ -0,0 +1,494 @@
1// Package html is an HTML5 lexer following the specifications at http://www.w3.org/TR/html5/syntax.html.
2package html
3
4import (
5 "strconv"
6
7 "github.com/tdewolff/parse/v2"
8)
9
// TokenType determines the type of token, e.g. a comment, a start tag or an attribute.
type TokenType uint32

// TokenType values.
const (
	ErrorToken         TokenType = iota // extra token when errors occur
	CommentToken                        // comments and bogus comments
	DoctypeToken                        // <!doctype ...>
	StartTagToken                       // opening part of a start tag: `<` followed by the tag name
	StartTagCloseToken                  // the `>` that ends a start tag
	StartTagVoidToken                   // the `/>` that ends a start tag
	EndTagToken                         // </name>
	AttributeToken                      // a single attribute inside a start tag
	TextToken                           // text between tags, raw text content and CDATA sections
	SvgToken                            // a whole <svg> element, lexed as one token
	MathToken                           // a whole <math> element, lexed as one token
)

// String returns the string representation of a TokenType.
// Values outside the declared constants are rendered as "Invalid(n)".
func (tt TokenType) String() string {
	switch tt {
	case ErrorToken:
		return "Error"
	case CommentToken:
		return "Comment"
	case DoctypeToken:
		return "Doctype"
	case StartTagToken:
		return "StartTag"
	case StartTagCloseToken:
		return "StartTagClose"
	case StartTagVoidToken:
		return "StartTagVoid"
	case EndTagToken:
		return "EndTag"
	case AttributeToken:
		return "Attribute"
	case TextToken:
		return "Text"
	case SvgToken:
		return "Svg"
	case MathToken:
		return "Math"
	}
	return "Invalid(" + strconv.Itoa(int(tt)) + ")"
}
56
57////////////////////////////////////////////////////////////////
58
59// Lexer is the state for the lexer.
60type Lexer struct {
61 r *parse.Input
62 err error
63
64 rawTag Hash
65 inTag bool
66
67 text []byte
68 attrVal []byte
69}
70
71// NewLexer returns a new Lexer for a given io.Reader.
72func NewLexer(r *parse.Input) *Lexer {
73 return &Lexer{
74 r: r,
75 }
76}
77
78// Err returns the error encountered during lexing, this is often io.EOF but also other errors can be returned.
79func (l *Lexer) Err() error {
80 if l.err != nil {
81 return l.err
82 }
83 return l.r.Err()
84}
85
86// Text returns the textual representation of a token. This excludes delimiters and additional leading/trailing characters.
87func (l *Lexer) Text() []byte {
88 return l.text
89}
90
91// AttrVal returns the attribute value when an AttributeToken was returned from Next.
92func (l *Lexer) AttrVal() []byte {
93 return l.attrVal
94}
95
96// Next returns the next Token. It returns ErrorToken when an error was encountered. Using Err() one can retrieve the error message.
97func (l *Lexer) Next() (TokenType, []byte) {
98 l.text = nil
99 var c byte
100 if l.inTag {
101 l.attrVal = nil
102 for { // before attribute name state
103 if c = l.r.Peek(0); c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f' {
104 l.r.Move(1)
105 continue
106 }
107 break
108 }
109 if c == 0 && l.r.Err() != nil {
110 return ErrorToken, nil
111 } else if c != '>' && (c != '/' || l.r.Peek(1) != '>') {
112 return AttributeToken, l.shiftAttribute()
113 }
114 l.r.Skip()
115 l.inTag = false
116 if c == '/' {
117 l.r.Move(2)
118 return StartTagVoidToken, l.r.Shift()
119 }
120 l.r.Move(1)
121 return StartTagCloseToken, l.r.Shift()
122 }
123
124 if l.rawTag != 0 {
125 if rawText := l.shiftRawText(); len(rawText) > 0 {
126 l.text = rawText
127 l.rawTag = 0
128 return TextToken, rawText
129 }
130 l.rawTag = 0
131 }
132
133 for {
134 c = l.r.Peek(0)
135 if c == '<' {
136 c = l.r.Peek(1)
137 isEndTag := c == '/' && l.r.Peek(2) != '>' && (l.r.Peek(2) != 0 || l.r.PeekErr(2) == nil)
138 if l.r.Pos() > 0 {
139 if isEndTag || 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || c == '!' || c == '?' {
140 // return currently buffered texttoken so that we can return tag next iteration
141 l.text = l.r.Shift()
142 return TextToken, l.text
143 }
144 } else if isEndTag {
145 l.r.Move(2)
146 // only endtags that are not followed by > or EOF arrive here
147 if c = l.r.Peek(0); !('a' <= c && c <= 'z' || 'A' <= c && c <= 'Z') {
148 return CommentToken, l.shiftBogusComment()
149 }
150 return EndTagToken, l.shiftEndTag()
151 } else if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' {
152 l.r.Move(1)
153 l.inTag = true
154 return l.shiftStartTag()
155 } else if c == '!' {
156 l.r.Move(2)
157 return l.readMarkup()
158 } else if c == '?' {
159 l.r.Move(1)
160 return CommentToken, l.shiftBogusComment()
161 }
162 } else if c == 0 && l.r.Err() != nil {
163 if l.r.Pos() > 0 {
164 l.text = l.r.Shift()
165 return TextToken, l.text
166 }
167 return ErrorToken, nil
168 }
169 l.r.Move(1)
170 }
171}
172
173////////////////////////////////////////////////////////////////
174
175// The following functions follow the specifications at https://html.spec.whatwg.org/multipage/parsing.html
176
177func (l *Lexer) shiftRawText() []byte {
178 if l.rawTag == Plaintext {
179 for {
180 if l.r.Peek(0) == 0 && l.r.Err() != nil {
181 return l.r.Shift()
182 }
183 l.r.Move(1)
184 }
185 } else { // RCDATA, RAWTEXT and SCRIPT
186 for {
187 c := l.r.Peek(0)
188 if c == '<' {
189 if l.r.Peek(1) == '/' {
190 mark := l.r.Pos()
191 l.r.Move(2)
192 for {
193 if c = l.r.Peek(0); !('a' <= c && c <= 'z' || 'A' <= c && c <= 'Z') {
194 break
195 }
196 l.r.Move(1)
197 }
198 if h := ToHash(parse.ToLower(parse.Copy(l.r.Lexeme()[mark+2:]))); h == l.rawTag { // copy so that ToLower doesn't change the case of the underlying slice
199 l.r.Rewind(mark)
200 return l.r.Shift()
201 }
202 } else if l.rawTag == Script && l.r.Peek(1) == '!' && l.r.Peek(2) == '-' && l.r.Peek(3) == '-' {
203 l.r.Move(4)
204 inScript := false
205 for {
206 c := l.r.Peek(0)
207 if c == '-' && l.r.Peek(1) == '-' && l.r.Peek(2) == '>' {
208 l.r.Move(3)
209 break
210 } else if c == '<' {
211 isEnd := l.r.Peek(1) == '/'
212 if isEnd {
213 l.r.Move(2)
214 } else {
215 l.r.Move(1)
216 }
217 mark := l.r.Pos()
218 for {
219 if c = l.r.Peek(0); !('a' <= c && c <= 'z' || 'A' <= c && c <= 'Z') {
220 break
221 }
222 l.r.Move(1)
223 }
224 if h := ToHash(parse.ToLower(parse.Copy(l.r.Lexeme()[mark:]))); h == Script { // copy so that ToLower doesn't change the case of the underlying slice
225 if !isEnd {
226 inScript = true
227 } else {
228 if !inScript {
229 l.r.Rewind(mark - 2)
230 return l.r.Shift()
231 }
232 inScript = false
233 }
234 }
235 } else if c == 0 && l.r.Err() != nil {
236 return l.r.Shift()
237 } else {
238 l.r.Move(1)
239 }
240 }
241 } else {
242 l.r.Move(1)
243 }
244 } else if c == 0 && l.r.Err() != nil {
245 return l.r.Shift()
246 } else {
247 l.r.Move(1)
248 }
249 }
250 }
251}
252
253func (l *Lexer) readMarkup() (TokenType, []byte) {
254 if l.at('-', '-') {
255 l.r.Move(2)
256 for {
257 if l.r.Peek(0) == 0 && l.r.Err() != nil {
258 l.text = l.r.Lexeme()[4:]
259 return CommentToken, l.r.Shift()
260 } else if l.at('-', '-', '>') {
261 l.text = l.r.Lexeme()[4:]
262 l.r.Move(3)
263 return CommentToken, l.r.Shift()
264 } else if l.at('-', '-', '!', '>') {
265 l.text = l.r.Lexeme()[4:]
266 l.r.Move(4)
267 return CommentToken, l.r.Shift()
268 }
269 l.r.Move(1)
270 }
271 } else if l.at('[', 'C', 'D', 'A', 'T', 'A', '[') {
272 l.r.Move(7)
273 for {
274 if l.r.Peek(0) == 0 && l.r.Err() != nil {
275 l.text = l.r.Lexeme()[9:]
276 return TextToken, l.r.Shift()
277 } else if l.at(']', ']', '>') {
278 l.text = l.r.Lexeme()[9:]
279 l.r.Move(3)
280 return TextToken, l.r.Shift()
281 }
282 l.r.Move(1)
283 }
284 } else {
285 if l.atCaseInsensitive('d', 'o', 'c', 't', 'y', 'p', 'e') {
286 l.r.Move(7)
287 if l.r.Peek(0) == ' ' {
288 l.r.Move(1)
289 }
290 for {
291 if c := l.r.Peek(0); c == '>' || c == 0 && l.r.Err() != nil {
292 l.text = l.r.Lexeme()[9:]
293 if c == '>' {
294 l.r.Move(1)
295 }
296 return DoctypeToken, l.r.Shift()
297 }
298 l.r.Move(1)
299 }
300 }
301 }
302 return CommentToken, l.shiftBogusComment()
303}
304
305func (l *Lexer) shiftBogusComment() []byte {
306 for {
307 c := l.r.Peek(0)
308 if c == '>' {
309 l.text = l.r.Lexeme()[2:]
310 l.r.Move(1)
311 return l.r.Shift()
312 } else if c == 0 && l.r.Err() != nil {
313 l.text = l.r.Lexeme()[2:]
314 return l.r.Shift()
315 }
316 l.r.Move(1)
317 }
318}
319
320func (l *Lexer) shiftStartTag() (TokenType, []byte) {
321 for {
322 if c := l.r.Peek(0); c == ' ' || c == '>' || c == '/' && l.r.Peek(1) == '>' || c == '\t' || c == '\n' || c == '\r' || c == '\f' || c == 0 && l.r.Err() != nil {
323 break
324 }
325 l.r.Move(1)
326 }
327 l.text = parse.ToLower(l.r.Lexeme()[1:])
328 if h := ToHash(l.text); h == Textarea || h == Title || h == Style || h == Xmp || h == Iframe || h == Script || h == Plaintext || h == Svg || h == Math {
329 if h == Svg || h == Math {
330 data := l.shiftXML(h)
331 if l.err != nil {
332 return ErrorToken, nil
333 }
334
335 l.inTag = false
336 if h == Svg {
337 return SvgToken, data
338 }
339 return MathToken, data
340 }
341 l.rawTag = h
342 }
343 return StartTagToken, l.r.Shift()
344}
345
346func (l *Lexer) shiftAttribute() []byte {
347 nameStart := l.r.Pos()
348 var c byte
349 for { // attribute name state
350 if c = l.r.Peek(0); c == ' ' || c == '=' || c == '>' || c == '/' && l.r.Peek(1) == '>' || c == '\t' || c == '\n' || c == '\r' || c == '\f' || c == 0 && l.r.Err() != nil {
351 break
352 }
353 l.r.Move(1)
354 }
355 nameEnd := l.r.Pos()
356 for { // after attribute name state
357 if c = l.r.Peek(0); c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f' {
358 l.r.Move(1)
359 continue
360 }
361 break
362 }
363 if c == '=' {
364 l.r.Move(1)
365 for { // before attribute value state
366 if c = l.r.Peek(0); c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f' {
367 l.r.Move(1)
368 continue
369 }
370 break
371 }
372 attrPos := l.r.Pos()
373 delim := c
374 if delim == '"' || delim == '\'' { // attribute value single- and double-quoted state
375 l.r.Move(1)
376 for {
377 c := l.r.Peek(0)
378 if c == delim {
379 l.r.Move(1)
380 break
381 } else if c == 0 && l.r.Err() != nil {
382 break
383 }
384 l.r.Move(1)
385 }
386 } else { // attribute value unquoted state
387 for {
388 if c := l.r.Peek(0); c == ' ' || c == '>' || c == '\t' || c == '\n' || c == '\r' || c == '\f' || c == 0 && l.r.Err() != nil {
389 break
390 }
391 l.r.Move(1)
392 }
393 }
394 l.attrVal = l.r.Lexeme()[attrPos:]
395 } else {
396 l.r.Rewind(nameEnd)
397 l.attrVal = nil
398 }
399 l.text = parse.ToLower(l.r.Lexeme()[nameStart:nameEnd])
400 return l.r.Shift()
401}
402
403func (l *Lexer) shiftEndTag() []byte {
404 for {
405 c := l.r.Peek(0)
406 if c == '>' {
407 l.text = l.r.Lexeme()[2:]
408 l.r.Move(1)
409 break
410 } else if c == 0 && l.r.Err() != nil {
411 l.text = l.r.Lexeme()[2:]
412 break
413 }
414 l.r.Move(1)
415 }
416
417 end := len(l.text)
418 for end > 0 {
419 if c := l.text[end-1]; c == ' ' || c == '\t' || c == '\n' || c == '\r' {
420 end--
421 continue
422 }
423 break
424 }
425 l.text = l.text[:end]
426 return parse.ToLower(l.r.Shift())
427}
428
429// shiftXML parses the content of a svg or math tag according to the XML 1.1 specifications, including the tag itself.
430// So far we have already parsed `<svg` or `<math`.
431func (l *Lexer) shiftXML(rawTag Hash) []byte {
432 inQuote := false
433 for {
434 c := l.r.Peek(0)
435 if c == '"' {
436 inQuote = !inQuote
437 l.r.Move(1)
438 } else if c == '<' && !inQuote && l.r.Peek(1) == '/' {
439 mark := l.r.Pos()
440 l.r.Move(2)
441 for {
442 if c = l.r.Peek(0); !('a' <= c && c <= 'z' || 'A' <= c && c <= 'Z') {
443 break
444 }
445 l.r.Move(1)
446 }
447 if h := ToHash(parse.ToLower(parse.Copy(l.r.Lexeme()[mark+2:]))); h == rawTag { // copy so that ToLower doesn't change the case of the underlying slice
448 break
449 }
450 } else if c == 0 {
451 if l.r.Err() == nil {
452 l.err = parse.NewErrorLexer(l.r, "HTML parse error: unexpected NULL character")
453 }
454 return l.r.Shift()
455 } else {
456 l.r.Move(1)
457 }
458 }
459
460 for {
461 c := l.r.Peek(0)
462 if c == '>' {
463 l.r.Move(1)
464 break
465 } else if c == 0 {
466 if l.r.Err() == nil {
467 l.err = parse.NewErrorLexer(l.r, "HTML parse error: unexpected NULL character")
468 }
469 return l.r.Shift()
470 }
471 l.r.Move(1)
472 }
473 return l.r.Shift()
474}
475
476////////////////////////////////////////////////////////////////
477
478func (l *Lexer) at(b ...byte) bool {
479 for i, c := range b {
480 if l.r.Peek(i) != c {
481 return false
482 }
483 }
484 return true
485}
486
487func (l *Lexer) atCaseInsensitive(b ...byte) bool {
488 for i, c := range b {
489 if l.r.Peek(i) != c && (l.r.Peek(i)+('a'-'A')) != c {
490 return false
491 }
492 }
493 return true
494}