diff options
Diffstat (limited to 'vendor/github.com/tdewolff/parse/v2/html/lex.go')
| -rw-r--r-- | vendor/github.com/tdewolff/parse/v2/html/lex.go | 494 |
1 files changed, 494 insertions, 0 deletions
diff --git a/vendor/github.com/tdewolff/parse/v2/html/lex.go b/vendor/github.com/tdewolff/parse/v2/html/lex.go new file mode 100644 index 0000000..4325024 --- /dev/null +++ b/vendor/github.com/tdewolff/parse/v2/html/lex.go | |||
| @@ -0,0 +1,494 @@ | |||
| 1 | // Package html is an HTML5 lexer following the specifications at http://www.w3.org/TR/html5/syntax.html. | ||
| 2 | package html | ||
| 3 | |||
| 4 | import ( | ||
| 5 | "strconv" | ||
| 6 | |||
| 7 | "github.com/tdewolff/parse/v2" | ||
| 8 | ) | ||
| 9 | |||
// TokenType identifies the kind of token produced by the Lexer, e.g. a start tag or a comment.
type TokenType uint32

// The token types that Lexer.Next can return.
const (
	ErrorToken TokenType = iota // extra token when errors occur
	CommentToken
	DoctypeToken
	StartTagToken
	StartTagCloseToken
	StartTagVoidToken
	EndTagToken
	AttributeToken
	TextToken
	SvgToken
	MathToken
)

// String returns the name of the token type without the "Token" suffix, or
// "Invalid(n)" when tt is not one of the declared values.
func (tt TokenType) String() string {
	// Indexed by token type; the keys keep the table in sync with the const block.
	names := [...]string{
		ErrorToken:         "Error",
		CommentToken:       "Comment",
		DoctypeToken:       "Doctype",
		StartTagToken:      "StartTag",
		StartTagCloseToken: "StartTagClose",
		StartTagVoidToken:  "StartTagVoid",
		EndTagToken:        "EndTag",
		AttributeToken:     "Attribute",
		TextToken:          "Text",
		SvgToken:           "Svg",
		MathToken:          "Math",
	}
	if tt < TokenType(len(names)) {
		return names[tt]
	}
	return "Invalid(" + strconv.Itoa(int(tt)) + ")"
}
| 56 | |||
| 57 | //////////////////////////////////////////////////////////////// | ||
| 58 | |||
// Lexer is the state for the lexer.
type Lexer struct {
	r   *parse.Input // input being tokenized
	err error        // error raised by the lexer itself (set in shiftXML); Err reports it before the input's own error

	rawTag Hash // non-zero after a start tag whose content is raw text (e.g. script, style); Next then reads raw text first
	inTag  bool // true while inside a start tag, so Next yields attributes and then the tag close

	text    []byte // value returned by Text for the current token
	attrVal []byte // value returned by AttrVal for the current attribute
}
| 70 | |||
| 71 | // NewLexer returns a new Lexer for a given io.Reader. | ||
| 72 | func NewLexer(r *parse.Input) *Lexer { | ||
| 73 | return &Lexer{ | ||
| 74 | r: r, | ||
| 75 | } | ||
| 76 | } | ||
| 77 | |||
| 78 | // Err returns the error encountered during lexing, this is often io.EOF but also other errors can be returned. | ||
| 79 | func (l *Lexer) Err() error { | ||
| 80 | if l.err != nil { | ||
| 81 | return l.err | ||
| 82 | } | ||
| 83 | return l.r.Err() | ||
| 84 | } | ||
| 85 | |||
| 86 | // Text returns the textual representation of a token. This excludes delimiters and additional leading/trailing characters. | ||
| 87 | func (l *Lexer) Text() []byte { | ||
| 88 | return l.text | ||
| 89 | } | ||
| 90 | |||
| 91 | // AttrVal returns the attribute value when an AttributeToken was returned from Next. | ||
| 92 | func (l *Lexer) AttrVal() []byte { | ||
| 93 | return l.attrVal | ||
| 94 | } | ||
| 95 | |||
// Next returns the next Token. It returns ErrorToken when an error was encountered. Using Err() one can retrieve the error message.
//
// The returned bytes are the raw lexeme of the token; Text (and AttrVal for
// attributes) give the processed parts. Next works in three modes: inside a
// start tag it yields attributes and the tag close, after a raw-text start tag
// it yields the raw content, and otherwise it scans for text and markup.
func (l *Lexer) Next() (TokenType, []byte) {
	l.text = nil
	var c byte
	if l.inTag {
		// Inside a start tag: the next token is an attribute or the end of the tag.
		l.attrVal = nil
		for { // before attribute name state
			if c = l.r.Peek(0); c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f' {
				l.r.Move(1)
				continue
			}
			break
		}
		if c == 0 && l.r.Err() != nil {
			return ErrorToken, nil
		} else if c != '>' && (c != '/' || l.r.Peek(1) != '>') {
			// anything other than `>` or `/>` starts an attribute
			return AttributeToken, l.shiftAttribute()
		}
		// NOTE(review): Skip presumably drops the whitespace consumed above so the
		// returned lexeme is only `>` or `/>` — confirm against parse.Input.
		l.r.Skip()
		l.inTag = false
		if c == '/' {
			l.r.Move(2)
			return StartTagVoidToken, l.r.Shift()
		}
		l.r.Move(1)
		return StartTagCloseToken, l.r.Shift()
	}

	if l.rawTag != 0 {
		// The previous start tag has raw-text content; emit it as one text token if non-empty.
		if rawText := l.shiftRawText(); len(rawText) > 0 {
			l.text = rawText
			l.rawTag = 0
			return TextToken, rawText
		}
		l.rawTag = 0
	}

	for {
		c = l.r.Peek(0)
		if c == '<' {
			c = l.r.Peek(1)
			// `</` counts as an end tag only when not directly followed by `>` or the end of input
			isEndTag := c == '/' && l.r.Peek(2) != '>' && (l.r.Peek(2) != 0 || l.r.PeekErr(2) == nil)
			if l.r.Pos() > 0 {
				if isEndTag || 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || c == '!' || c == '?' {
					// return currently buffered texttoken so that we can return tag next iteration
					l.text = l.r.Shift()
					return TextToken, l.text
				}
			} else if isEndTag {
				l.r.Move(2)
				// only endtags that are not followed by > or EOF arrive here
				if c = l.r.Peek(0); !('a' <= c && c <= 'z' || 'A' <= c && c <= 'Z') {
					// `</` followed by a non-letter is treated as a bogus comment
					return CommentToken, l.shiftBogusComment()
				}
				return EndTagToken, l.shiftEndTag()
			} else if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' {
				l.r.Move(1)
				l.inTag = true // following calls to Next return this tag's attributes
				return l.shiftStartTag()
			} else if c == '!' {
				l.r.Move(2)
				return l.readMarkup()
			} else if c == '?' {
				l.r.Move(1)
				return CommentToken, l.shiftBogusComment()
			}
			// any other `<` is plain text and is consumed below
		} else if c == 0 && l.r.Err() != nil {
			// end of input: flush pending text first, report the error on the next call
			if l.r.Pos() > 0 {
				l.text = l.r.Shift()
				return TextToken, l.text
			}
			return ErrorToken, nil
		}
		l.r.Move(1)
	}
}
| 172 | |||
| 173 | //////////////////////////////////////////////////////////////// | ||
| 174 | |||
| 175 | // The following functions follow the specifications at https://html.spec.whatwg.org/multipage/parsing.html | ||
| 176 | |||
// shiftRawText consumes and returns the content that follows a raw-text start
// tag (l.rawTag). For plaintext everything up to the end of input is returned.
// For the other tags the content ends right before the matching end tag
// (compared case-insensitively) or at the end of input. Inside a script, a
// `</script` that appears within `<!-- ... -->` after a nested `<script` does
// not end the content.
func (l *Lexer) shiftRawText() []byte {
	if l.rawTag == Plaintext {
		for {
			if l.r.Peek(0) == 0 && l.r.Err() != nil {
				return l.r.Shift()
			}
			l.r.Move(1)
		}
	} else { // RCDATA, RAWTEXT and SCRIPT
		for {
			c := l.r.Peek(0)
			if c == '<' {
				if l.r.Peek(1) == '/' {
					// possible end tag: read its name and compare with the tag that opened the raw text
					mark := l.r.Pos()
					l.r.Move(2)
					for {
						if c = l.r.Peek(0); !('a' <= c && c <= 'z' || 'A' <= c && c <= 'Z') {
							break
						}
						l.r.Move(1)
					}
					if h := ToHash(parse.ToLower(parse.Copy(l.r.Lexeme()[mark+2:]))); h == l.rawTag { // copy so that ToLower doesn't change the case of the underlying slice
						// go back to the `<` so the end tag is lexed by the next call to Next
						l.r.Rewind(mark)
						return l.r.Shift()
					}
				} else if l.rawTag == Script && l.r.Peek(1) == '!' && l.r.Peek(2) == '-' && l.r.Peek(3) == '-' {
					// `<!--` inside a script: scan until `-->` while tracking nested script tags
					l.r.Move(4)
					inScript := false
					for {
						c := l.r.Peek(0)
						if c == '-' && l.r.Peek(1) == '-' && l.r.Peek(2) == '>' {
							l.r.Move(3)
							break
						} else if c == '<' {
							isEnd := l.r.Peek(1) == '/'
							if isEnd {
								l.r.Move(2)
							} else {
								l.r.Move(1)
							}
							mark := l.r.Pos()
							for {
								if c = l.r.Peek(0); !('a' <= c && c <= 'z' || 'A' <= c && c <= 'Z') {
									break
								}
								l.r.Move(1)
							}
							if h := ToHash(parse.ToLower(parse.Copy(l.r.Lexeme()[mark:]))); h == Script { // copy so that ToLower doesn't change the case of the underlying slice
								if !isEnd {
									inScript = true
								} else {
									if !inScript {
										// `</script` without a nested `<script`: the raw text ends before its `</`
										l.r.Rewind(mark - 2)
										return l.r.Shift()
									}
									inScript = false
								}
							}
						} else if c == 0 && l.r.Err() != nil {
							return l.r.Shift()
						} else {
							l.r.Move(1)
						}
					}
				} else {
					l.r.Move(1)
				}
			} else if c == 0 && l.r.Err() != nil {
				return l.r.Shift()
			} else {
				l.r.Move(1)
			}
		}
	}
}
| 252 | |||
| 253 | func (l *Lexer) readMarkup() (TokenType, []byte) { | ||
| 254 | if l.at('-', '-') { | ||
| 255 | l.r.Move(2) | ||
| 256 | for { | ||
| 257 | if l.r.Peek(0) == 0 && l.r.Err() != nil { | ||
| 258 | l.text = l.r.Lexeme()[4:] | ||
| 259 | return CommentToken, l.r.Shift() | ||
| 260 | } else if l.at('-', '-', '>') { | ||
| 261 | l.text = l.r.Lexeme()[4:] | ||
| 262 | l.r.Move(3) | ||
| 263 | return CommentToken, l.r.Shift() | ||
| 264 | } else if l.at('-', '-', '!', '>') { | ||
| 265 | l.text = l.r.Lexeme()[4:] | ||
| 266 | l.r.Move(4) | ||
| 267 | return CommentToken, l.r.Shift() | ||
| 268 | } | ||
| 269 | l.r.Move(1) | ||
| 270 | } | ||
| 271 | } else if l.at('[', 'C', 'D', 'A', 'T', 'A', '[') { | ||
| 272 | l.r.Move(7) | ||
| 273 | for { | ||
| 274 | if l.r.Peek(0) == 0 && l.r.Err() != nil { | ||
| 275 | l.text = l.r.Lexeme()[9:] | ||
| 276 | return TextToken, l.r.Shift() | ||
| 277 | } else if l.at(']', ']', '>') { | ||
| 278 | l.text = l.r.Lexeme()[9:] | ||
| 279 | l.r.Move(3) | ||
| 280 | return TextToken, l.r.Shift() | ||
| 281 | } | ||
| 282 | l.r.Move(1) | ||
| 283 | } | ||
| 284 | } else { | ||
| 285 | if l.atCaseInsensitive('d', 'o', 'c', 't', 'y', 'p', 'e') { | ||
| 286 | l.r.Move(7) | ||
| 287 | if l.r.Peek(0) == ' ' { | ||
| 288 | l.r.Move(1) | ||
| 289 | } | ||
| 290 | for { | ||
| 291 | if c := l.r.Peek(0); c == '>' || c == 0 && l.r.Err() != nil { | ||
| 292 | l.text = l.r.Lexeme()[9:] | ||
| 293 | if c == '>' { | ||
| 294 | l.r.Move(1) | ||
| 295 | } | ||
| 296 | return DoctypeToken, l.r.Shift() | ||
| 297 | } | ||
| 298 | l.r.Move(1) | ||
| 299 | } | ||
| 300 | } | ||
| 301 | } | ||
| 302 | return CommentToken, l.shiftBogusComment() | ||
| 303 | } | ||
| 304 | |||
| 305 | func (l *Lexer) shiftBogusComment() []byte { | ||
| 306 | for { | ||
| 307 | c := l.r.Peek(0) | ||
| 308 | if c == '>' { | ||
| 309 | l.text = l.r.Lexeme()[2:] | ||
| 310 | l.r.Move(1) | ||
| 311 | return l.r.Shift() | ||
| 312 | } else if c == 0 && l.r.Err() != nil { | ||
| 313 | l.text = l.r.Lexeme()[2:] | ||
| 314 | return l.r.Shift() | ||
| 315 | } | ||
| 316 | l.r.Move(1) | ||
| 317 | } | ||
| 318 | } | ||
| 319 | |||
| 320 | func (l *Lexer) shiftStartTag() (TokenType, []byte) { | ||
| 321 | for { | ||
| 322 | if c := l.r.Peek(0); c == ' ' || c == '>' || c == '/' && l.r.Peek(1) == '>' || c == '\t' || c == '\n' || c == '\r' || c == '\f' || c == 0 && l.r.Err() != nil { | ||
| 323 | break | ||
| 324 | } | ||
| 325 | l.r.Move(1) | ||
| 326 | } | ||
| 327 | l.text = parse.ToLower(l.r.Lexeme()[1:]) | ||
| 328 | if h := ToHash(l.text); h == Textarea || h == Title || h == Style || h == Xmp || h == Iframe || h == Script || h == Plaintext || h == Svg || h == Math { | ||
| 329 | if h == Svg || h == Math { | ||
| 330 | data := l.shiftXML(h) | ||
| 331 | if l.err != nil { | ||
| 332 | return ErrorToken, nil | ||
| 333 | } | ||
| 334 | |||
| 335 | l.inTag = false | ||
| 336 | if h == Svg { | ||
| 337 | return SvgToken, data | ||
| 338 | } | ||
| 339 | return MathToken, data | ||
| 340 | } | ||
| 341 | l.rawTag = h | ||
| 342 | } | ||
| 343 | return StartTagToken, l.r.Shift() | ||
| 344 | } | ||
| 345 | |||
| 346 | func (l *Lexer) shiftAttribute() []byte { | ||
| 347 | nameStart := l.r.Pos() | ||
| 348 | var c byte | ||
| 349 | for { // attribute name state | ||
| 350 | if c = l.r.Peek(0); c == ' ' || c == '=' || c == '>' || c == '/' && l.r.Peek(1) == '>' || c == '\t' || c == '\n' || c == '\r' || c == '\f' || c == 0 && l.r.Err() != nil { | ||
| 351 | break | ||
| 352 | } | ||
| 353 | l.r.Move(1) | ||
| 354 | } | ||
| 355 | nameEnd := l.r.Pos() | ||
| 356 | for { // after attribute name state | ||
| 357 | if c = l.r.Peek(0); c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f' { | ||
| 358 | l.r.Move(1) | ||
| 359 | continue | ||
| 360 | } | ||
| 361 | break | ||
| 362 | } | ||
| 363 | if c == '=' { | ||
| 364 | l.r.Move(1) | ||
| 365 | for { // before attribute value state | ||
| 366 | if c = l.r.Peek(0); c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f' { | ||
| 367 | l.r.Move(1) | ||
| 368 | continue | ||
| 369 | } | ||
| 370 | break | ||
| 371 | } | ||
| 372 | attrPos := l.r.Pos() | ||
| 373 | delim := c | ||
| 374 | if delim == '"' || delim == '\'' { // attribute value single- and double-quoted state | ||
| 375 | l.r.Move(1) | ||
| 376 | for { | ||
| 377 | c := l.r.Peek(0) | ||
| 378 | if c == delim { | ||
| 379 | l.r.Move(1) | ||
| 380 | break | ||
| 381 | } else if c == 0 && l.r.Err() != nil { | ||
| 382 | break | ||
| 383 | } | ||
| 384 | l.r.Move(1) | ||
| 385 | } | ||
| 386 | } else { // attribute value unquoted state | ||
| 387 | for { | ||
| 388 | if c := l.r.Peek(0); c == ' ' || c == '>' || c == '\t' || c == '\n' || c == '\r' || c == '\f' || c == 0 && l.r.Err() != nil { | ||
| 389 | break | ||
| 390 | } | ||
| 391 | l.r.Move(1) | ||
| 392 | } | ||
| 393 | } | ||
| 394 | l.attrVal = l.r.Lexeme()[attrPos:] | ||
| 395 | } else { | ||
| 396 | l.r.Rewind(nameEnd) | ||
| 397 | l.attrVal = nil | ||
| 398 | } | ||
| 399 | l.text = parse.ToLower(l.r.Lexeme()[nameStart:nameEnd]) | ||
| 400 | return l.r.Shift() | ||
| 401 | } | ||
| 402 | |||
| 403 | func (l *Lexer) shiftEndTag() []byte { | ||
| 404 | for { | ||
| 405 | c := l.r.Peek(0) | ||
| 406 | if c == '>' { | ||
| 407 | l.text = l.r.Lexeme()[2:] | ||
| 408 | l.r.Move(1) | ||
| 409 | break | ||
| 410 | } else if c == 0 && l.r.Err() != nil { | ||
| 411 | l.text = l.r.Lexeme()[2:] | ||
| 412 | break | ||
| 413 | } | ||
| 414 | l.r.Move(1) | ||
| 415 | } | ||
| 416 | |||
| 417 | end := len(l.text) | ||
| 418 | for end > 0 { | ||
| 419 | if c := l.text[end-1]; c == ' ' || c == '\t' || c == '\n' || c == '\r' { | ||
| 420 | end-- | ||
| 421 | continue | ||
| 422 | } | ||
| 423 | break | ||
| 424 | } | ||
| 425 | l.text = l.text[:end] | ||
| 426 | return parse.ToLower(l.r.Shift()) | ||
| 427 | } | ||
| 428 | |||
| 429 | // shiftXML parses the content of a svg or math tag according to the XML 1.1 specifications, including the tag itself. | ||
| 430 | // So far we have already parsed `<svg` or `<math`. | ||
| 431 | func (l *Lexer) shiftXML(rawTag Hash) []byte { | ||
| 432 | inQuote := false | ||
| 433 | for { | ||
| 434 | c := l.r.Peek(0) | ||
| 435 | if c == '"' { | ||
| 436 | inQuote = !inQuote | ||
| 437 | l.r.Move(1) | ||
| 438 | } else if c == '<' && !inQuote && l.r.Peek(1) == '/' { | ||
| 439 | mark := l.r.Pos() | ||
| 440 | l.r.Move(2) | ||
| 441 | for { | ||
| 442 | if c = l.r.Peek(0); !('a' <= c && c <= 'z' || 'A' <= c && c <= 'Z') { | ||
| 443 | break | ||
| 444 | } | ||
| 445 | l.r.Move(1) | ||
| 446 | } | ||
| 447 | if h := ToHash(parse.ToLower(parse.Copy(l.r.Lexeme()[mark+2:]))); h == rawTag { // copy so that ToLower doesn't change the case of the underlying slice | ||
| 448 | break | ||
| 449 | } | ||
| 450 | } else if c == 0 { | ||
| 451 | if l.r.Err() == nil { | ||
| 452 | l.err = parse.NewErrorLexer(l.r, "HTML parse error: unexpected NULL character") | ||
| 453 | } | ||
| 454 | return l.r.Shift() | ||
| 455 | } else { | ||
| 456 | l.r.Move(1) | ||
| 457 | } | ||
| 458 | } | ||
| 459 | |||
| 460 | for { | ||
| 461 | c := l.r.Peek(0) | ||
| 462 | if c == '>' { | ||
| 463 | l.r.Move(1) | ||
| 464 | break | ||
| 465 | } else if c == 0 { | ||
| 466 | if l.r.Err() == nil { | ||
| 467 | l.err = parse.NewErrorLexer(l.r, "HTML parse error: unexpected NULL character") | ||
| 468 | } | ||
| 469 | return l.r.Shift() | ||
| 470 | } | ||
| 471 | l.r.Move(1) | ||
| 472 | } | ||
| 473 | return l.r.Shift() | ||
| 474 | } | ||
| 475 | |||
| 476 | //////////////////////////////////////////////////////////////// | ||
| 477 | |||
| 478 | func (l *Lexer) at(b ...byte) bool { | ||
| 479 | for i, c := range b { | ||
| 480 | if l.r.Peek(i) != c { | ||
| 481 | return false | ||
| 482 | } | ||
| 483 | } | ||
| 484 | return true | ||
| 485 | } | ||
| 486 | |||
| 487 | func (l *Lexer) atCaseInsensitive(b ...byte) bool { | ||
| 488 | for i, c := range b { | ||
| 489 | if l.r.Peek(i) != c && (l.r.Peek(i)+('a'-'A')) != c { | ||
| 490 | return false | ||
| 491 | } | ||
| 492 | } | ||
| 493 | return true | ||
| 494 | } | ||
