1// Package css is a CSS3 lexer and parser following the specifications at http://www.w3.org/TR/css-syntax-3/.
2package css
3
4// TODO: \uFFFD replacement character for NULL bytes in strings for example, or atleast don't end the string early
5
6import (
7 "bytes"
8 "io"
9 "strconv"
10
11 "github.com/tdewolff/parse/v2"
12)
13
// TokenType determines the type of token, eg. a number or a semicolon.
type TokenType uint32

// TokenType values.
const (
	ErrorToken TokenType = iota // extra token when errors occur
	IdentToken // abc
	FunctionToken // rgb( rgba( ...
	AtKeywordToken // @abc
	HashToken // #abc
	StringToken // "abc" or 'abc'
	BadStringToken // string terminated early by an unescaped newline
	URLToken // url(...)
	BadURLToken // url( with malformed contents
	DelimToken // any unmatched character
	NumberToken // 5
	PercentageToken // 5%
	DimensionToken // 5em
	UnicodeRangeToken // U+554A
	IncludeMatchToken // ~=
	DashMatchToken // |=
	PrefixMatchToken // ^=
	SuffixMatchToken // $=
	SubstringMatchToken // *=
	ColumnToken // ||
	WhitespaceToken // space \t \r \n \f
	CDOToken // <!--
	CDCToken // -->
	ColonToken // :
	SemicolonToken // ;
	CommaToken // ,
	LeftBracketToken // [
	RightBracketToken // ]
	LeftParenthesisToken // (
	RightParenthesisToken // )
	LeftBraceToken // {
	RightBraceToken // }
	CommentToken // extra token for comments
	EmptyToken // not emitted by Next; presumably used by a parser layer — confirm against callers
	CustomPropertyNameToken // --abc
	CustomPropertyValueToken // not emitted by Next; presumably the value of a custom property declaration — confirm against callers
)
56
57// String returns the string representation of a TokenType.
58func (tt TokenType) String() string {
59 switch tt {
60 case ErrorToken:
61 return "Error"
62 case IdentToken:
63 return "Ident"
64 case FunctionToken:
65 return "Function"
66 case AtKeywordToken:
67 return "AtKeyword"
68 case HashToken:
69 return "Hash"
70 case StringToken:
71 return "String"
72 case BadStringToken:
73 return "BadString"
74 case URLToken:
75 return "URL"
76 case BadURLToken:
77 return "BadURL"
78 case DelimToken:
79 return "Delim"
80 case NumberToken:
81 return "Number"
82 case PercentageToken:
83 return "Percentage"
84 case DimensionToken:
85 return "Dimension"
86 case UnicodeRangeToken:
87 return "UnicodeRange"
88 case IncludeMatchToken:
89 return "IncludeMatch"
90 case DashMatchToken:
91 return "DashMatch"
92 case PrefixMatchToken:
93 return "PrefixMatch"
94 case SuffixMatchToken:
95 return "SuffixMatch"
96 case SubstringMatchToken:
97 return "SubstringMatch"
98 case ColumnToken:
99 return "Column"
100 case WhitespaceToken:
101 return "Whitespace"
102 case CDOToken:
103 return "CDO"
104 case CDCToken:
105 return "CDC"
106 case ColonToken:
107 return "Colon"
108 case SemicolonToken:
109 return "Semicolon"
110 case CommaToken:
111 return "Comma"
112 case LeftBracketToken:
113 return "LeftBracket"
114 case RightBracketToken:
115 return "RightBracket"
116 case LeftParenthesisToken:
117 return "LeftParenthesis"
118 case RightParenthesisToken:
119 return "RightParenthesis"
120 case LeftBraceToken:
121 return "LeftBrace"
122 case RightBraceToken:
123 return "RightBrace"
124 case CommentToken:
125 return "Comment"
126 case EmptyToken:
127 return "Empty"
128 case CustomPropertyNameToken:
129 return "CustomPropertyName"
130 case CustomPropertyValueToken:
131 return "CustomPropertyValue"
132 }
133 return "Invalid(" + strconv.Itoa(int(tt)) + ")"
134}
135
136////////////////////////////////////////////////////////////////
137
// Lexer is the state for the lexer.
type Lexer struct {
	r *parse.Input // buffered input the tokens are read from
}
142
143// NewLexer returns a new Lexer for a given io.Reader.
144func NewLexer(r *parse.Input) *Lexer {
145 return &Lexer{
146 r: r,
147 }
148}
149
// Err returns the error encountered during lexing; this is often io.EOF, but other errors can be returned as well.
func (l *Lexer) Err() error {
	return l.r.Err()
}
154
155// Next returns the next Token. It returns ErrorToken when an error was encountered. Using Err() one can retrieve the error message.
156func (l *Lexer) Next() (TokenType, []byte) {
157 switch l.r.Peek(0) {
158 case ' ', '\t', '\n', '\r', '\f':
159 l.r.Move(1)
160 for l.consumeWhitespace() {
161 }
162 return WhitespaceToken, l.r.Shift()
163 case ':':
164 l.r.Move(1)
165 return ColonToken, l.r.Shift()
166 case ';':
167 l.r.Move(1)
168 return SemicolonToken, l.r.Shift()
169 case ',':
170 l.r.Move(1)
171 return CommaToken, l.r.Shift()
172 case '(', ')', '[', ']', '{', '}':
173 if t := l.consumeBracket(); t != ErrorToken {
174 return t, l.r.Shift()
175 }
176 case '#':
177 if l.consumeHashToken() {
178 return HashToken, l.r.Shift()
179 }
180 case '"', '\'':
181 if t := l.consumeString(); t != ErrorToken {
182 return t, l.r.Shift()
183 }
184 case '.', '+':
185 if t := l.consumeNumeric(); t != ErrorToken {
186 return t, l.r.Shift()
187 }
188 case '-':
189 if t := l.consumeNumeric(); t != ErrorToken {
190 return t, l.r.Shift()
191 } else if t := l.consumeIdentlike(); t != ErrorToken {
192 return t, l.r.Shift()
193 } else if l.consumeCDCToken() {
194 return CDCToken, l.r.Shift()
195 } else if l.consumeCustomVariableToken() {
196 return CustomPropertyNameToken, l.r.Shift()
197 }
198 case '@':
199 if l.consumeAtKeywordToken() {
200 return AtKeywordToken, l.r.Shift()
201 }
202 case '$', '*', '^', '~':
203 if t := l.consumeMatch(); t != ErrorToken {
204 return t, l.r.Shift()
205 }
206 case '/':
207 if l.consumeComment() {
208 return CommentToken, l.r.Shift()
209 }
210 case '<':
211 if l.consumeCDOToken() {
212 return CDOToken, l.r.Shift()
213 }
214 case '\\':
215 if t := l.consumeIdentlike(); t != ErrorToken {
216 return t, l.r.Shift()
217 }
218 case 'u', 'U':
219 if l.consumeUnicodeRangeToken() {
220 return UnicodeRangeToken, l.r.Shift()
221 } else if t := l.consumeIdentlike(); t != ErrorToken {
222 return t, l.r.Shift()
223 }
224 case '|':
225 if t := l.consumeMatch(); t != ErrorToken {
226 return t, l.r.Shift()
227 } else if l.consumeColumnToken() {
228 return ColumnToken, l.r.Shift()
229 }
230 case 0:
231 if l.r.Err() != nil {
232 return ErrorToken, nil
233 }
234 default:
235 if t := l.consumeNumeric(); t != ErrorToken {
236 return t, l.r.Shift()
237 } else if t := l.consumeIdentlike(); t != ErrorToken {
238 return t, l.r.Shift()
239 }
240 }
241 // can't be rune because consumeIdentlike consumes that as an identifier
242 l.r.Move(1)
243 return DelimToken, l.r.Shift()
244}
245
246////////////////////////////////////////////////////////////////
247
248/*
249The following functions follow the railroad diagrams in http://www.w3.org/TR/css3-syntax/
250*/
251
252func (l *Lexer) consumeByte(c byte) bool {
253 if l.r.Peek(0) == c {
254 l.r.Move(1)
255 return true
256 }
257 return false
258}
259
260func (l *Lexer) consumeComment() bool {
261 if l.r.Peek(0) != '/' || l.r.Peek(1) != '*' {
262 return false
263 }
264 l.r.Move(2)
265 for {
266 c := l.r.Peek(0)
267 if c == 0 && l.r.Err() != nil {
268 break
269 } else if c == '*' && l.r.Peek(1) == '/' {
270 l.r.Move(2)
271 return true
272 }
273 l.r.Move(1)
274 }
275 return true
276}
277
278func (l *Lexer) consumeNewline() bool {
279 c := l.r.Peek(0)
280 if c == '\n' || c == '\f' {
281 l.r.Move(1)
282 return true
283 } else if c == '\r' {
284 if l.r.Peek(1) == '\n' {
285 l.r.Move(2)
286 } else {
287 l.r.Move(1)
288 }
289 return true
290 }
291 return false
292}
293
294func (l *Lexer) consumeWhitespace() bool {
295 c := l.r.Peek(0)
296 if c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f' {
297 l.r.Move(1)
298 return true
299 }
300 return false
301}
302
303func (l *Lexer) consumeDigit() bool {
304 c := l.r.Peek(0)
305 if c >= '0' && c <= '9' {
306 l.r.Move(1)
307 return true
308 }
309 return false
310}
311
312func (l *Lexer) consumeHexDigit() bool {
313 c := l.r.Peek(0)
314 if (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F') {
315 l.r.Move(1)
316 return true
317 }
318 return false
319}
320
// consumeEscape consumes a backslash escape: '\' followed by either 1-6 hex
// digits (plus one optional trailing whitespace byte), a multi-byte UTF-8
// rune, or any other single byte. It rewinds and returns false when the
// backslash precedes a newline or EOF, since those do not form an escape here.
func (l *Lexer) consumeEscape() bool {
	if l.r.Peek(0) != '\\' {
		return false
	}
	mark := l.r.Pos()
	l.r.Move(1)
	if l.consumeNewline() {
		// backslash-newline is not an escape (strings handle it as a line continuation)
		l.r.Rewind(mark)
		return false
	} else if l.consumeHexDigit() {
		// at most 6 hex digits in total
		for k := 1; k < 6; k++ {
			if !l.consumeHexDigit() {
				break
			}
		}
		// a single whitespace byte after the hex digits belongs to the escape
		l.consumeWhitespace()
		return true
	} else {
		c := l.r.Peek(0)
		if c >= 0xC0 {
			// start byte of a multi-byte UTF-8 sequence: consume the whole rune
			_, n := l.r.PeekRune(0)
			l.r.Move(n)
			return true
		} else if c == 0 && l.r.Err() != nil {
			// lone backslash at EOF is not a valid escape
			l.r.Rewind(mark)
			return false
		}
	}
	l.r.Move(1)
	return true
}
352
// consumeIdentToken consumes an identifier: an optional '-' followed by a
// name-start character (letter, '_', non-ASCII byte, or escape) and then any
// number of name characters. It rewinds and returns false when no identifier
// starts at the current position.
func (l *Lexer) consumeIdentToken() bool {
	mark := l.r.Pos()
	if l.r.Peek(0) == '-' {
		l.r.Move(1)
	}
	c := l.r.Peek(0)
	if !((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_' || c >= 0x80) {
		// not a plain name-start character; only a valid escape may begin the name
		if c != '\\' || !l.consumeEscape() {
			l.r.Rewind(mark)
			return false
		}
	} else {
		l.r.Move(1)
	}
	// consume the remaining name characters (letters, digits, '_', '-', non-ASCII, escapes)
	for {
		c := l.r.Peek(0)
		if !((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '_' || c == '-' || c >= 0x80) {
			if c != '\\' || !l.consumeEscape() {
				break
			}
		} else {
			l.r.Move(1)
		}
	}
	return true
}
379
380// support custom variables, https://www.w3.org/TR/css-variables-1/
381func (l *Lexer) consumeCustomVariableToken() bool {
382 // expect to be on a '-'
383 l.r.Move(1)
384 if l.r.Peek(0) != '-' {
385 l.r.Move(-1)
386 return false
387 }
388 if !l.consumeIdentToken() {
389 l.r.Move(-1)
390 return false
391 }
392 return true
393}
394
395func (l *Lexer) consumeAtKeywordToken() bool {
396 // expect to be on an '@'
397 l.r.Move(1)
398 if !l.consumeIdentToken() {
399 l.r.Move(-1)
400 return false
401 }
402 return true
403}
404
// consumeHashToken consumes a '#' followed by at least one name character
// (letter, digit, '_', '-', non-ASCII byte, or escape). It rewinds and returns
// false for a lone '#', which is then lexed as a delimiter.
func (l *Lexer) consumeHashToken() bool {
	// expect to be on a '#'
	mark := l.r.Pos()
	l.r.Move(1)
	c := l.r.Peek(0)
	if !((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '_' || c == '-' || c >= 0x80) {
		// only a valid escape may start the name otherwise
		if c != '\\' || !l.consumeEscape() {
			l.r.Rewind(mark)
			return false
		}
	} else {
		l.r.Move(1)
	}
	// consume the remaining name characters
	for {
		c := l.r.Peek(0)
		if !((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '_' || c == '-' || c >= 0x80) {
			if c != '\\' || !l.consumeEscape() {
				break
			}
		} else {
			l.r.Move(1)
		}
	}
	return true
}
430
// consumeNumberToken consumes a number: an optional sign, integer digits
// and/or a fractional part, and an optional exponent. It rewinds and returns
// false when no digits are found. A trailing '.' or 'e' with no digits after
// it is left unconsumed so it can start the next token.
func (l *Lexer) consumeNumberToken() bool {
	mark := l.r.Pos()
	c := l.r.Peek(0)
	if c == '+' || c == '-' {
		l.r.Move(1)
	}
	firstDigit := l.consumeDigit()
	if firstDigit {
		for l.consumeDigit() {
		}
	}
	if l.r.Peek(0) == '.' {
		l.r.Move(1)
		if l.consumeDigit() {
			for l.consumeDigit() {
			}
		} else if firstDigit {
			// . could belong to the next token
			l.r.Move(-1)
			return true
		} else {
			// neither integer nor fractional digits: not a number
			l.r.Rewind(mark)
			return false
		}
	} else if !firstDigit {
		l.r.Rewind(mark)
		return false
	}
	// optional exponent; mark is reused so a dangling 'e' can be given back
	mark = l.r.Pos()
	c = l.r.Peek(0)
	if c == 'e' || c == 'E' {
		l.r.Move(1)
		c = l.r.Peek(0)
		if c == '+' || c == '-' {
			l.r.Move(1)
		}
		if !l.consumeDigit() {
			// e could belong to next token
			l.r.Rewind(mark)
			return true
		}
		for l.consumeDigit() {
		}
	}
	return true
}
477
// consumeUnicodeRangeToken consumes a unicode-range: 'U+' (or 'u+') followed
// by 1-6 hex digits, optionally extended with '?' wildcards or a '-' and a
// second 1-6 hex digit value, e.g. U+554A, U+4??, U+0-7F. It rewinds and
// returns false when the digit counts are out of range.
func (l *Lexer) consumeUnicodeRangeToken() bool {
	c := l.r.Peek(0)
	if (c != 'u' && c != 'U') || l.r.Peek(1) != '+' {
		return false
	}
	mark := l.r.Pos()
	l.r.Move(2)

	// consume up to 6 hexDigits
	k := 0
	for l.consumeHexDigit() {
		k++
	}

	// either a minus or a question mark or the end is expected
	if l.consumeByte('-') {
		// the first value must itself be 1-6 hex digits
		if k == 0 || 6 < k {
			l.r.Rewind(mark)
			return false
		}

		// consume another up to 6 hexDigits
		if l.consumeHexDigit() {
			k = 1
			for l.consumeHexDigit() {
				k++
			}
		} else {
			// '-' without a second value is not a range
			l.r.Rewind(mark)
			return false
		}
	} else if l.consumeByte('?') {
		// could be filled up to 6 characters with question marks or else regular hexDigits
		k++
		for l.consumeByte('?') {
			k++
		}
	}
	// k now holds the length of the last (or only) value; it must be 1-6
	if k == 0 || 6 < k {
		l.r.Rewind(mark)
		return false
	}
	return true
}
522
523func (l *Lexer) consumeColumnToken() bool {
524 if l.r.Peek(0) == '|' && l.r.Peek(1) == '|' {
525 l.r.Move(2)
526 return true
527 }
528 return false
529}
530
531func (l *Lexer) consumeCDOToken() bool {
532 if l.r.Peek(0) == '<' && l.r.Peek(1) == '!' && l.r.Peek(2) == '-' && l.r.Peek(3) == '-' {
533 l.r.Move(4)
534 return true
535 }
536 return false
537}
538
539func (l *Lexer) consumeCDCToken() bool {
540 if l.r.Peek(0) == '-' && l.r.Peek(1) == '-' && l.r.Peek(2) == '>' {
541 l.r.Move(3)
542 return true
543 }
544 return false
545}
546
547////////////////////////////////////////////////////////////////
548
549// consumeMatch consumes any MatchToken.
550func (l *Lexer) consumeMatch() TokenType {
551 if l.r.Peek(1) == '=' {
552 switch l.r.Peek(0) {
553 case '~':
554 l.r.Move(2)
555 return IncludeMatchToken
556 case '|':
557 l.r.Move(2)
558 return DashMatchToken
559 case '^':
560 l.r.Move(2)
561 return PrefixMatchToken
562 case '$':
563 l.r.Move(2)
564 return SuffixMatchToken
565 case '*':
566 l.r.Move(2)
567 return SubstringMatchToken
568 }
569 }
570 return ErrorToken
571}
572
573// consumeBracket consumes any bracket token.
574func (l *Lexer) consumeBracket() TokenType {
575 switch l.r.Peek(0) {
576 case '(':
577 l.r.Move(1)
578 return LeftParenthesisToken
579 case ')':
580 l.r.Move(1)
581 return RightParenthesisToken
582 case '[':
583 l.r.Move(1)
584 return LeftBracketToken
585 case ']':
586 l.r.Move(1)
587 return RightBracketToken
588 case '{':
589 l.r.Move(1)
590 return LeftBraceToken
591 case '}':
592 l.r.Move(1)
593 return RightBraceToken
594 }
595 return ErrorToken
596}
597
598// consumeNumeric consumes NumberToken, PercentageToken or DimensionToken.
599func (l *Lexer) consumeNumeric() TokenType {
600 if l.consumeNumberToken() {
601 if l.consumeByte('%') {
602 return PercentageToken
603 } else if l.consumeIdentToken() {
604 return DimensionToken
605 }
606 return NumberToken
607 }
608 return ErrorToken
609}
610
// consumeString consumes a string and may return BadStringToken when a newline is encountered.
// A string terminated by EOF instead of the closing quote is still a StringToken.
func (l *Lexer) consumeString() TokenType {
	// assume to be on " or '
	delim := l.r.Peek(0)
	l.r.Move(1)
	for {
		c := l.r.Peek(0)
		if c == 0 && l.r.Err() != nil {
			// EOF implicitly closes the string
			break
		} else if c == '\n' || c == '\r' || c == '\f' {
			// an unescaped newline terminates the string as a BadStringToken
			l.r.Move(1)
			return BadStringToken
		} else if c == delim {
			l.r.Move(1)
			break
		} else if c == '\\' {
			if !l.consumeEscape() {
				// either newline or EOF after backslash
				l.r.Move(1)
				// backslash-newline is a valid line continuation inside strings
				l.consumeNewline()
			}
		} else {
			l.r.Move(1)
		}
	}
	return StringToken
}
638
// consumeUnquotedURL consumes the body of an unquoted url(...) value up to,
// but not including, the closing ')' or EOF. It returns false when a
// disallowed byte (quote, '(', whitespace, or a control character) occurs
// outside of a valid escape sequence.
func (l *Lexer) consumeUnquotedURL() bool {
	for {
		c := l.r.Peek(0)
		if c == 0 && l.r.Err() != nil || c == ')' {
			break
		} else if c == '"' || c == '\'' || c == '(' || c == '\\' || c == ' ' || c <= 0x1F || c == 0x7F {
			// these bytes are only allowed as part of an escape
			if c != '\\' || !l.consumeEscape() {
				return false
			}
		} else {
			l.r.Move(1)
		}
	}
	return true
}
654
655// consumeRemnantsBadUrl consumes bytes of a BadUrlToken so that normal tokenization may continue.
656func (l *Lexer) consumeRemnantsBadURL() {
657 for {
658 if l.consumeByte(')') || l.r.Err() != nil {
659 break
660 } else if !l.consumeEscape() {
661 l.r.Move(1)
662 }
663 }
664}
665
// consumeIdentlike consumes IdentToken, FunctionToken or URLToken, and may
// return BadURLToken when a url( contains malformed bytes.
func (l *Lexer) consumeIdentlike() TokenType {
	if l.consumeIdentToken() {
		if l.r.Peek(0) != '(' {
			return IdentToken
		} else if !parse.EqualFold(bytes.Replace(l.r.Lexeme(), []byte{'\\'}, nil, -1), []byte{'u', 'r', 'l'}) {
			// any function other than url( — backslashes are stripped before the case-insensitive compare
			l.r.Move(1)
			return FunctionToken
		}
		l.r.Move(1)

		// consume url
		for l.consumeWhitespace() {
		}
		if c := l.r.Peek(0); c == '"' || c == '\'' {
			// quoted URL; a bad string poisons the whole url( token
			if l.consumeString() == BadStringToken {
				l.consumeRemnantsBadURL()
				return BadURLToken
			}
		} else if !l.consumeUnquotedURL() && !l.consumeWhitespace() { // if unquoted URL fails due to encountering whitespace, continue
			l.consumeRemnantsBadURL()
			return BadURLToken
		}
		for l.consumeWhitespace() {
		}
		// a missing ')' is tolerated only at EOF
		if !l.consumeByte(')') && l.r.Err() != io.EOF {
			l.consumeRemnantsBadURL()
			return BadURLToken
		}
		return URLToken
	}
	return ErrorToken
}