1package syntax
2
3import (
4 "fmt"
5 "math"
6 "os"
7 "sort"
8 "strconv"
9 "unicode"
10)
11
12type RegexOptions int32
13
14const (
15 IgnoreCase RegexOptions = 0x0001 // "i"
16 Multiline = 0x0002 // "m"
17 ExplicitCapture = 0x0004 // "n"
18 Compiled = 0x0008 // "c"
19 Singleline = 0x0010 // "s"
20 IgnorePatternWhitespace = 0x0020 // "x"
21 RightToLeft = 0x0040 // "r"
22 Debug = 0x0080 // "d"
23 ECMAScript = 0x0100 // "e"
24 RE2 = 0x0200 // RE2 compat mode
25 Unicode = 0x0400 // "u"
26)
27
28func optionFromCode(ch rune) RegexOptions {
29 // case-insensitive
30 switch ch {
31 case 'i', 'I':
32 return IgnoreCase
33 case 'r', 'R':
34 return RightToLeft
35 case 'm', 'M':
36 return Multiline
37 case 'n', 'N':
38 return ExplicitCapture
39 case 's', 'S':
40 return Singleline
41 case 'x', 'X':
42 return IgnorePatternWhitespace
43 case 'd', 'D':
44 return Debug
45 case 'e', 'E':
46 return ECMAScript
47 case 'u', 'U':
48 return Unicode
49 default:
50 return 0
51 }
52}
53
54// An Error describes a failure to parse a regular expression
55// and gives the offending expression.
56type Error struct {
57 Code ErrorCode
58 Expr string
59 Args []interface{}
60}
61
62func (e *Error) Error() string {
63 if len(e.Args) == 0 {
64 return "error parsing regexp: " + e.Code.String() + " in `" + e.Expr + "`"
65 }
66 return "error parsing regexp: " + fmt.Sprintf(e.Code.String(), e.Args...) + " in `" + e.Expr + "`"
67}
68
69// An ErrorCode describes a failure to parse a regular expression.
70type ErrorCode string
71
72const (
73 // internal issue
74 ErrInternalError ErrorCode = "regexp/syntax: internal error"
75 // Parser errors
76 ErrUnterminatedComment = "unterminated comment"
77 ErrInvalidCharRange = "invalid character class range"
78 ErrInvalidRepeatSize = "invalid repeat count"
79 ErrInvalidUTF8 = "invalid UTF-8"
80 ErrCaptureGroupOutOfRange = "capture group number out of range"
81 ErrUnexpectedParen = "unexpected )"
82 ErrMissingParen = "missing closing )"
83 ErrMissingBrace = "missing closing }"
84 ErrInvalidRepeatOp = "invalid nested repetition operator"
85 ErrMissingRepeatArgument = "missing argument to repetition operator"
86 ErrConditionalExpression = "illegal conditional (?(...)) expression"
87 ErrTooManyAlternates = "too many | in (?()|)"
88 ErrUnrecognizedGrouping = "unrecognized grouping construct: (%v"
89 ErrInvalidGroupName = "invalid group name: group names must begin with a word character and have a matching terminator"
90 ErrCapNumNotZero = "capture number cannot be zero"
91 ErrUndefinedBackRef = "reference to undefined group number %v"
92 ErrUndefinedNameRef = "reference to undefined group name %v"
93 ErrAlternationCantCapture = "alternation conditions do not capture and cannot be named"
94 ErrAlternationCantHaveComment = "alternation conditions cannot be comments"
95 ErrMalformedReference = "(?(%v) ) malformed"
96 ErrUndefinedReference = "(?(%v) ) reference to undefined group"
97 ErrIllegalEndEscape = "illegal \\ at end of pattern"
98 ErrMalformedSlashP = "malformed \\p{X} character escape"
99 ErrIncompleteSlashP = "incomplete \\p{X} character escape"
100 ErrUnknownSlashP = "unknown unicode category, script, or property '%v'"
101 ErrUnrecognizedEscape = "unrecognized escape sequence \\%v"
102 ErrMissingControl = "missing control character"
103 ErrUnrecognizedControl = "unrecognized control character"
104 ErrTooFewHex = "insufficient hexadecimal digits"
105 ErrInvalidHex = "hex values may not be larger than 0x10FFFF"
106 ErrMalformedNameRef = "malformed \\k<...> named back reference"
107 ErrBadClassInCharRange = "cannot include class \\%v in character range"
108 ErrUnterminatedBracket = "unterminated [] set"
109 ErrSubtractionMustBeLast = "a subtraction must be the last element in a character class"
110 ErrReversedCharRange = "[%c-%c] range in reverse order"
111)
112
113func (e ErrorCode) String() string {
114 return string(e)
115}
116
117type parser struct {
118 stack *regexNode
119 group *regexNode
120 alternation *regexNode
121 concatenation *regexNode
122 unit *regexNode
123
124 patternRaw string
125 pattern []rune
126
127 currentPos int
128 specialCase *unicode.SpecialCase
129
130 autocap int
131 capcount int
132 captop int
133 capsize int
134
135 caps map[int]int
136 capnames map[string]int
137
138 capnumlist []int
139 capnamelist []string
140
141 options RegexOptions
142 optionsStack []RegexOptions
143 ignoreNextParen bool
144}
145
146const (
147 maxValueDiv10 int = math.MaxInt32 / 10
148 maxValueMod10 = math.MaxInt32 % 10
149)
150
151// Parse converts a regex string into a parse tree
152func Parse(re string, op RegexOptions) (*RegexTree, error) {
153 p := parser{
154 options: op,
155 caps: make(map[int]int),
156 }
157 p.setPattern(re)
158
159 if err := p.countCaptures(); err != nil {
160 return nil, err
161 }
162
163 p.reset(op)
164 root, err := p.scanRegex()
165
166 if err != nil {
167 return nil, err
168 }
169 tree := &RegexTree{
170 root: root,
171 caps: p.caps,
172 capnumlist: p.capnumlist,
173 captop: p.captop,
174 Capnames: p.capnames,
175 Caplist: p.capnamelist,
176 options: op,
177 }
178
179 if tree.options&Debug > 0 {
180 os.Stdout.WriteString(tree.Dump())
181 }
182
183 return tree, nil
184}
185
186func (p *parser) setPattern(pattern string) {
187 p.patternRaw = pattern
188 p.pattern = make([]rune, 0, len(pattern))
189
190 //populate our rune array to handle utf8 encoding
191 for _, r := range pattern {
192 p.pattern = append(p.pattern, r)
193 }
194}
195func (p *parser) getErr(code ErrorCode, args ...interface{}) error {
196 return &Error{Code: code, Expr: p.patternRaw, Args: args}
197}
198
199func (p *parser) noteCaptureSlot(i, pos int) {
200 if _, ok := p.caps[i]; !ok {
201 // the rhs of the hashtable isn't used in the parser
202 p.caps[i] = pos
203 p.capcount++
204
205 if p.captop <= i {
206 if i == math.MaxInt32 {
207 p.captop = i
208 } else {
209 p.captop = i + 1
210 }
211 }
212 }
213}
214
215func (p *parser) noteCaptureName(name string, pos int) {
216 if p.capnames == nil {
217 p.capnames = make(map[string]int)
218 }
219
220 if _, ok := p.capnames[name]; !ok {
221 p.capnames[name] = pos
222 p.capnamelist = append(p.capnamelist, name)
223 }
224}
225
226func (p *parser) assignNameSlots() {
227 if p.capnames != nil {
228 for _, name := range p.capnamelist {
229 for p.isCaptureSlot(p.autocap) {
230 p.autocap++
231 }
232 pos := p.capnames[name]
233 p.capnames[name] = p.autocap
234 p.noteCaptureSlot(p.autocap, pos)
235
236 p.autocap++
237 }
238 }
239
240 // if the caps array has at least one gap, construct the list of used slots
241 if p.capcount < p.captop {
242 p.capnumlist = make([]int, p.capcount)
243 i := 0
244
245 for k := range p.caps {
246 p.capnumlist[i] = k
247 i++
248 }
249
250 sort.Ints(p.capnumlist)
251 }
252
253 // merge capsnumlist into capnamelist
254 if p.capnames != nil || p.capnumlist != nil {
255 var oldcapnamelist []string
256 var next int
257 var k int
258
259 if p.capnames == nil {
260 oldcapnamelist = nil
261 p.capnames = make(map[string]int)
262 p.capnamelist = []string{}
263 next = -1
264 } else {
265 oldcapnamelist = p.capnamelist
266 p.capnamelist = []string{}
267 next = p.capnames[oldcapnamelist[0]]
268 }
269
270 for i := 0; i < p.capcount; i++ {
271 j := i
272 if p.capnumlist != nil {
273 j = p.capnumlist[i]
274 }
275
276 if next == j {
277 p.capnamelist = append(p.capnamelist, oldcapnamelist[k])
278 k++
279
280 if k == len(oldcapnamelist) {
281 next = -1
282 } else {
283 next = p.capnames[oldcapnamelist[k]]
284 }
285
286 } else {
287 //feature: culture?
288 str := strconv.Itoa(j)
289 p.capnamelist = append(p.capnamelist, str)
290 p.capnames[str] = j
291 }
292 }
293 }
294}
295
296func (p *parser) consumeAutocap() int {
297 r := p.autocap
298 p.autocap++
299 return r
300}
301
302// CountCaptures is a prescanner for deducing the slots used for
303// captures by doing a partial tokenization of the pattern.
304func (p *parser) countCaptures() error {
305 var ch rune
306
307 p.noteCaptureSlot(0, 0)
308
309 p.autocap = 1
310
311 for p.charsRight() > 0 {
312 pos := p.textpos()
313 ch = p.moveRightGetChar()
314 switch ch {
315 case '\\':
316 if p.charsRight() > 0 {
317 p.scanBackslash(true)
318 }
319
320 case '#':
321 if p.useOptionX() {
322 p.moveLeft()
323 p.scanBlank()
324 }
325
326 case '[':
327 p.scanCharSet(false, true)
328
329 case ')':
330 if !p.emptyOptionsStack() {
331 p.popOptions()
332 }
333
334 case '(':
335 if p.charsRight() >= 2 && p.rightChar(1) == '#' && p.rightChar(0) == '?' {
336 p.moveLeft()
337 p.scanBlank()
338 } else {
339 p.pushOptions()
340 if p.charsRight() > 0 && p.rightChar(0) == '?' {
341 // we have (?...
342 p.moveRight(1)
343
344 if p.charsRight() > 1 && (p.rightChar(0) == '<' || p.rightChar(0) == '\'') {
345 // named group: (?<... or (?'...
346
347 p.moveRight(1)
348 ch = p.rightChar(0)
349
350 if ch != '0' && IsWordChar(ch) {
351 if ch >= '1' && ch <= '9' {
352 dec, err := p.scanDecimal()
353 if err != nil {
354 return err
355 }
356 p.noteCaptureSlot(dec, pos)
357 } else {
358 p.noteCaptureName(p.scanCapname(), pos)
359 }
360 }
361 } else if p.useRE2() && p.charsRight() > 2 && (p.rightChar(0) == 'P' && p.rightChar(1) == '<') {
362 // RE2-compat (?P<)
363 p.moveRight(2)
364 ch = p.rightChar(0)
365 if IsWordChar(ch) {
366 p.noteCaptureName(p.scanCapname(), pos)
367 }
368
369 } else {
370 // (?...
371
372 // get the options if it's an option construct (?cimsx-cimsx...)
373 p.scanOptions()
374
375 if p.charsRight() > 0 {
376 if p.rightChar(0) == ')' {
377 // (?cimsx-cimsx)
378 p.moveRight(1)
379 p.popKeepOptions()
380 } else if p.rightChar(0) == '(' {
381 // alternation construct: (?(foo)yes|no)
382 // ignore the next paren so we don't capture the condition
383 p.ignoreNextParen = true
384
385 // break from here so we don't reset ignoreNextParen
386 continue
387 }
388 }
389 }
390 } else {
391 if !p.useOptionN() && !p.ignoreNextParen {
392 p.noteCaptureSlot(p.consumeAutocap(), pos)
393 }
394 }
395 }
396
397 p.ignoreNextParen = false
398
399 }
400 }
401
402 p.assignNameSlots()
403 return nil
404}
405
406func (p *parser) reset(topopts RegexOptions) {
407 p.currentPos = 0
408 p.autocap = 1
409 p.ignoreNextParen = false
410
411 if len(p.optionsStack) > 0 {
412 p.optionsStack = p.optionsStack[:0]
413 }
414
415 p.options = topopts
416 p.stack = nil
417}
418
419func (p *parser) scanRegex() (*regexNode, error) {
420 ch := '@' // nonspecial ch, means at beginning
421 isQuant := false
422
423 p.startGroup(newRegexNodeMN(ntCapture, p.options, 0, -1))
424
425 for p.charsRight() > 0 {
426 wasPrevQuantifier := isQuant
427 isQuant = false
428
429 if err := p.scanBlank(); err != nil {
430 return nil, err
431 }
432
433 startpos := p.textpos()
434
435 // move past all of the normal characters. We'll stop when we hit some kind of control character,
436 // or if IgnorePatternWhiteSpace is on, we'll stop when we see some whitespace.
437 if p.useOptionX() {
438 for p.charsRight() > 0 {
439 ch = p.rightChar(0)
440 //UGLY: clean up, this is ugly
441 if !(!isStopperX(ch) || (ch == '{' && !p.isTrueQuantifier())) {
442 break
443 }
444 p.moveRight(1)
445 }
446 } else {
447 for p.charsRight() > 0 {
448 ch = p.rightChar(0)
449 if !(!isSpecial(ch) || ch == '{' && !p.isTrueQuantifier()) {
450 break
451 }
452 p.moveRight(1)
453 }
454 }
455
456 endpos := p.textpos()
457
458 p.scanBlank()
459
460 if p.charsRight() == 0 {
461 ch = '!' // nonspecial, means at end
462 } else if ch = p.rightChar(0); isSpecial(ch) {
463 isQuant = isQuantifier(ch)
464 p.moveRight(1)
465 } else {
466 ch = ' ' // nonspecial, means at ordinary char
467 }
468
469 if startpos < endpos {
470 cchUnquantified := endpos - startpos
471 if isQuant {
472 cchUnquantified--
473 }
474 wasPrevQuantifier = false
475
476 if cchUnquantified > 0 {
477 p.addToConcatenate(startpos, cchUnquantified, false)
478 }
479
480 if isQuant {
481 p.addUnitOne(p.charAt(endpos - 1))
482 }
483 }
484
485 switch ch {
486 case '!':
487 goto BreakOuterScan
488
489 case ' ':
490 goto ContinueOuterScan
491
492 case '[':
493 cc, err := p.scanCharSet(p.useOptionI(), false)
494 if err != nil {
495 return nil, err
496 }
497 p.addUnitSet(cc)
498
499 case '(':
500 p.pushOptions()
501
502 if grouper, err := p.scanGroupOpen(); err != nil {
503 return nil, err
504 } else if grouper == nil {
505 p.popKeepOptions()
506 } else {
507 p.pushGroup()
508 p.startGroup(grouper)
509 }
510
511 continue
512
513 case '|':
514 p.addAlternate()
515 goto ContinueOuterScan
516
517 case ')':
518 if p.emptyStack() {
519 return nil, p.getErr(ErrUnexpectedParen)
520 }
521
522 if err := p.addGroup(); err != nil {
523 return nil, err
524 }
525 if err := p.popGroup(); err != nil {
526 return nil, err
527 }
528 p.popOptions()
529
530 if p.unit == nil {
531 goto ContinueOuterScan
532 }
533
534 case '\\':
535 n, err := p.scanBackslash(false)
536 if err != nil {
537 return nil, err
538 }
539 p.addUnitNode(n)
540
541 case '^':
542 if p.useOptionM() {
543 p.addUnitType(ntBol)
544 } else {
545 p.addUnitType(ntBeginning)
546 }
547
548 case '$':
549 if p.useOptionM() {
550 p.addUnitType(ntEol)
551 } else {
552 p.addUnitType(ntEndZ)
553 }
554
555 case '.':
556 if p.useOptionE() {
557 p.addUnitSet(ECMAAnyClass())
558 } else if p.useOptionS() {
559 p.addUnitSet(AnyClass())
560 } else {
561 p.addUnitNotone('\n')
562 }
563
564 case '{', '*', '+', '?':
565 if p.unit == nil {
566 if wasPrevQuantifier {
567 return nil, p.getErr(ErrInvalidRepeatOp)
568 } else {
569 return nil, p.getErr(ErrMissingRepeatArgument)
570 }
571 }
572 p.moveLeft()
573
574 default:
575 return nil, p.getErr(ErrInternalError)
576 }
577
578 if err := p.scanBlank(); err != nil {
579 return nil, err
580 }
581
582 if p.charsRight() > 0 {
583 isQuant = p.isTrueQuantifier()
584 }
585 if p.charsRight() == 0 || !isQuant {
586 //maintain odd C# assignment order -- not sure if required, could clean up?
587 p.addConcatenate()
588 goto ContinueOuterScan
589 }
590
591 ch = p.moveRightGetChar()
592
593 // Handle quantifiers
594 for p.unit != nil {
595 var min, max int
596 var lazy bool
597
598 switch ch {
599 case '*':
600 min = 0
601 max = math.MaxInt32
602
603 case '?':
604 min = 0
605 max = 1
606
607 case '+':
608 min = 1
609 max = math.MaxInt32
610
611 case '{':
612 {
613 var err error
614 startpos = p.textpos()
615 if min, err = p.scanDecimal(); err != nil {
616 return nil, err
617 }
618 max = min
619 if startpos < p.textpos() {
620 if p.charsRight() > 0 && p.rightChar(0) == ',' {
621 p.moveRight(1)
622 if p.charsRight() == 0 || p.rightChar(0) == '}' {
623 max = math.MaxInt32
624 } else {
625 if max, err = p.scanDecimal(); err != nil {
626 return nil, err
627 }
628 }
629 }
630 }
631
632 if startpos == p.textpos() || p.charsRight() == 0 || p.moveRightGetChar() != '}' {
633 p.addConcatenate()
634 p.textto(startpos - 1)
635 goto ContinueOuterScan
636 }
637 }
638
639 default:
640 return nil, p.getErr(ErrInternalError)
641 }
642
643 if err := p.scanBlank(); err != nil {
644 return nil, err
645 }
646
647 if p.charsRight() == 0 || p.rightChar(0) != '?' {
648 lazy = false
649 } else {
650 p.moveRight(1)
651 lazy = true
652 }
653
654 if min > max {
655 return nil, p.getErr(ErrInvalidRepeatSize)
656 }
657
658 p.addConcatenate3(lazy, min, max)
659 }
660
661 ContinueOuterScan:
662 }
663
664BreakOuterScan:
665 ;
666
667 if !p.emptyStack() {
668 return nil, p.getErr(ErrMissingParen)
669 }
670
671 if err := p.addGroup(); err != nil {
672 return nil, err
673 }
674
675 return p.unit, nil
676
677}
678
679/*
680 * Simple parsing for replacement patterns
681 */
682func (p *parser) scanReplacement() (*regexNode, error) {
683 var c, startpos int
684
685 p.concatenation = newRegexNode(ntConcatenate, p.options)
686
687 for {
688 c = p.charsRight()
689 if c == 0 {
690 break
691 }
692
693 startpos = p.textpos()
694
695 for c > 0 && p.rightChar(0) != '$' {
696 p.moveRight(1)
697 c--
698 }
699
700 p.addToConcatenate(startpos, p.textpos()-startpos, true)
701
702 if c > 0 {
703 if p.moveRightGetChar() == '$' {
704 n, err := p.scanDollar()
705 if err != nil {
706 return nil, err
707 }
708 p.addUnitNode(n)
709 }
710 p.addConcatenate()
711 }
712 }
713
714 return p.concatenation, nil
715}
716
717/*
718 * Scans $ patterns recognized within replacement patterns
719 */
720func (p *parser) scanDollar() (*regexNode, error) {
721 if p.charsRight() == 0 {
722 return newRegexNodeCh(ntOne, p.options, '$'), nil
723 }
724
725 ch := p.rightChar(0)
726 angled := false
727 backpos := p.textpos()
728 lastEndPos := backpos
729
730 // Note angle
731
732 if ch == '{' && p.charsRight() > 1 {
733 angled = true
734 p.moveRight(1)
735 ch = p.rightChar(0)
736 }
737
738 // Try to parse backreference: \1 or \{1} or \{cap}
739
740 if ch >= '0' && ch <= '9' {
741 if !angled && p.useOptionE() {
742 capnum := -1
743 newcapnum := int(ch - '0')
744 p.moveRight(1)
745 if p.isCaptureSlot(newcapnum) {
746 capnum = newcapnum
747 lastEndPos = p.textpos()
748 }
749
750 for p.charsRight() > 0 {
751 ch = p.rightChar(0)
752 if ch < '0' || ch > '9' {
753 break
754 }
755 digit := int(ch - '0')
756 if newcapnum > maxValueDiv10 || (newcapnum == maxValueDiv10 && digit > maxValueMod10) {
757 return nil, p.getErr(ErrCaptureGroupOutOfRange)
758 }
759
760 newcapnum = newcapnum*10 + digit
761
762 p.moveRight(1)
763 if p.isCaptureSlot(newcapnum) {
764 capnum = newcapnum
765 lastEndPos = p.textpos()
766 }
767 }
768 p.textto(lastEndPos)
769 if capnum >= 0 {
770 return newRegexNodeM(ntRef, p.options, capnum), nil
771 }
772 } else {
773 capnum, err := p.scanDecimal()
774 if err != nil {
775 return nil, err
776 }
777 if !angled || p.charsRight() > 0 && p.moveRightGetChar() == '}' {
778 if p.isCaptureSlot(capnum) {
779 return newRegexNodeM(ntRef, p.options, capnum), nil
780 }
781 }
782 }
783 } else if angled && IsWordChar(ch) {
784 capname := p.scanCapname()
785
786 if p.charsRight() > 0 && p.moveRightGetChar() == '}' {
787 if p.isCaptureName(capname) {
788 return newRegexNodeM(ntRef, p.options, p.captureSlotFromName(capname)), nil
789 }
790 }
791 } else if !angled {
792 capnum := 1
793
794 switch ch {
795 case '$':
796 p.moveRight(1)
797 return newRegexNodeCh(ntOne, p.options, '$'), nil
798 case '&':
799 capnum = 0
800 case '`':
801 capnum = replaceLeftPortion
802 case '\'':
803 capnum = replaceRightPortion
804 case '+':
805 capnum = replaceLastGroup
806 case '_':
807 capnum = replaceWholeString
808 }
809
810 if capnum != 1 {
811 p.moveRight(1)
812 return newRegexNodeM(ntRef, p.options, capnum), nil
813 }
814 }
815
816 // unrecognized $: literalize
817
818 p.textto(backpos)
819 return newRegexNodeCh(ntOne, p.options, '$'), nil
820}
821
822// scanGroupOpen scans chars following a '(' (not counting the '('), and returns
823// a RegexNode for the type of group scanned, or nil if the group
824// simply changed options (?cimsx-cimsx) or was a comment (#...).
825func (p *parser) scanGroupOpen() (*regexNode, error) {
826 var ch rune
827 var nt nodeType
828 var err error
829 close := '>'
830 start := p.textpos()
831
832 // just return a RegexNode if we have:
833 // 1. "(" followed by nothing
834 // 2. "(x" where x != ?
835 // 3. "(?)"
836 if p.charsRight() == 0 || p.rightChar(0) != '?' || (p.rightChar(0) == '?' && (p.charsRight() > 1 && p.rightChar(1) == ')')) {
837 if p.useOptionN() || p.ignoreNextParen {
838 p.ignoreNextParen = false
839 return newRegexNode(ntGroup, p.options), nil
840 }
841 return newRegexNodeMN(ntCapture, p.options, p.consumeAutocap(), -1), nil
842 }
843
844 p.moveRight(1)
845
846 for {
847 if p.charsRight() == 0 {
848 break
849 }
850
851 switch ch = p.moveRightGetChar(); ch {
852 case ':':
853 nt = ntGroup
854
855 case '=':
856 p.options &= ^RightToLeft
857 nt = ntRequire
858
859 case '!':
860 p.options &= ^RightToLeft
861 nt = ntPrevent
862
863 case '>':
864 nt = ntGreedy
865
866 case '\'':
867 close = '\''
868 fallthrough
869
870 case '<':
871 if p.charsRight() == 0 {
872 goto BreakRecognize
873 }
874
875 switch ch = p.moveRightGetChar(); ch {
876 case '=':
877 if close == '\'' {
878 goto BreakRecognize
879 }
880
881 p.options |= RightToLeft
882 nt = ntRequire
883
884 case '!':
885 if close == '\'' {
886 goto BreakRecognize
887 }
888
889 p.options |= RightToLeft
890 nt = ntPrevent
891
892 default:
893 p.moveLeft()
894 capnum := -1
895 uncapnum := -1
896 proceed := false
897
898 // grab part before -
899
900 if ch >= '0' && ch <= '9' {
901 if capnum, err = p.scanDecimal(); err != nil {
902 return nil, err
903 }
904
905 if !p.isCaptureSlot(capnum) {
906 capnum = -1
907 }
908
909 // check if we have bogus characters after the number
910 if p.charsRight() > 0 && !(p.rightChar(0) == close || p.rightChar(0) == '-') {
911 return nil, p.getErr(ErrInvalidGroupName)
912 }
913 if capnum == 0 {
914 return nil, p.getErr(ErrCapNumNotZero)
915 }
916 } else if IsWordChar(ch) {
917 capname := p.scanCapname()
918
919 if p.isCaptureName(capname) {
920 capnum = p.captureSlotFromName(capname)
921 }
922
923 // check if we have bogus character after the name
924 if p.charsRight() > 0 && !(p.rightChar(0) == close || p.rightChar(0) == '-') {
925 return nil, p.getErr(ErrInvalidGroupName)
926 }
927 } else if ch == '-' {
928 proceed = true
929 } else {
930 // bad group name - starts with something other than a word character and isn't a number
931 return nil, p.getErr(ErrInvalidGroupName)
932 }
933
934 // grab part after - if any
935
936 if (capnum != -1 || proceed == true) && p.charsRight() > 0 && p.rightChar(0) == '-' {
937 p.moveRight(1)
938
939 //no more chars left, no closing char, etc
940 if p.charsRight() == 0 {
941 return nil, p.getErr(ErrInvalidGroupName)
942 }
943
944 ch = p.rightChar(0)
945 if ch >= '0' && ch <= '9' {
946 if uncapnum, err = p.scanDecimal(); err != nil {
947 return nil, err
948 }
949
950 if !p.isCaptureSlot(uncapnum) {
951 return nil, p.getErr(ErrUndefinedBackRef, uncapnum)
952 }
953
954 // check if we have bogus characters after the number
955 if p.charsRight() > 0 && p.rightChar(0) != close {
956 return nil, p.getErr(ErrInvalidGroupName)
957 }
958 } else if IsWordChar(ch) {
959 uncapname := p.scanCapname()
960
961 if !p.isCaptureName(uncapname) {
962 return nil, p.getErr(ErrUndefinedNameRef, uncapname)
963 }
964 uncapnum = p.captureSlotFromName(uncapname)
965
966 // check if we have bogus character after the name
967 if p.charsRight() > 0 && p.rightChar(0) != close {
968 return nil, p.getErr(ErrInvalidGroupName)
969 }
970 } else {
971 // bad group name - starts with something other than a word character and isn't a number
972 return nil, p.getErr(ErrInvalidGroupName)
973 }
974 }
975
976 // actually make the node
977
978 if (capnum != -1 || uncapnum != -1) && p.charsRight() > 0 && p.moveRightGetChar() == close {
979 return newRegexNodeMN(ntCapture, p.options, capnum, uncapnum), nil
980 }
981 goto BreakRecognize
982 }
983
984 case '(':
985 // alternation construct (?(...) | )
986
987 parenPos := p.textpos()
988 if p.charsRight() > 0 {
989 ch = p.rightChar(0)
990
991 // check if the alternation condition is a backref
992 if ch >= '0' && ch <= '9' {
993 var capnum int
994 if capnum, err = p.scanDecimal(); err != nil {
995 return nil, err
996 }
997 if p.charsRight() > 0 && p.moveRightGetChar() == ')' {
998 if p.isCaptureSlot(capnum) {
999 return newRegexNodeM(ntTestref, p.options, capnum), nil
1000 }
1001 return nil, p.getErr(ErrUndefinedReference, capnum)
1002 }
1003
1004 return nil, p.getErr(ErrMalformedReference, capnum)
1005
1006 } else if IsWordChar(ch) {
1007 capname := p.scanCapname()
1008
1009 if p.isCaptureName(capname) && p.charsRight() > 0 && p.moveRightGetChar() == ')' {
1010 return newRegexNodeM(ntTestref, p.options, p.captureSlotFromName(capname)), nil
1011 }
1012 }
1013 }
1014 // not a backref
1015 nt = ntTestgroup
1016 p.textto(parenPos - 1) // jump to the start of the parentheses
1017 p.ignoreNextParen = true // but make sure we don't try to capture the insides
1018
1019 charsRight := p.charsRight()
1020 if charsRight >= 3 && p.rightChar(1) == '?' {
1021 rightchar2 := p.rightChar(2)
1022 // disallow comments in the condition
1023 if rightchar2 == '#' {
1024 return nil, p.getErr(ErrAlternationCantHaveComment)
1025 }
1026
1027 // disallow named capture group (?<..>..) in the condition
1028 if rightchar2 == '\'' {
1029 return nil, p.getErr(ErrAlternationCantCapture)
1030 }
1031
1032 if charsRight >= 4 && (rightchar2 == '<' && p.rightChar(3) != '!' && p.rightChar(3) != '=') {
1033 return nil, p.getErr(ErrAlternationCantCapture)
1034 }
1035 }
1036
1037 case 'P':
1038 if p.useRE2() {
1039 // support for P<name> syntax
1040 if p.charsRight() < 3 {
1041 goto BreakRecognize
1042 }
1043
1044 ch = p.moveRightGetChar()
1045 if ch != '<' {
1046 goto BreakRecognize
1047 }
1048
1049 ch = p.moveRightGetChar()
1050 p.moveLeft()
1051
1052 if IsWordChar(ch) {
1053 capnum := -1
1054 capname := p.scanCapname()
1055
1056 if p.isCaptureName(capname) {
1057 capnum = p.captureSlotFromName(capname)
1058 }
1059
1060 // check if we have bogus character after the name
1061 if p.charsRight() > 0 && p.rightChar(0) != '>' {
1062 return nil, p.getErr(ErrInvalidGroupName)
1063 }
1064
1065 // actually make the node
1066
1067 if capnum != -1 && p.charsRight() > 0 && p.moveRightGetChar() == '>' {
1068 return newRegexNodeMN(ntCapture, p.options, capnum, -1), nil
1069 }
1070 goto BreakRecognize
1071
1072 } else {
1073 // bad group name - starts with something other than a word character and isn't a number
1074 return nil, p.getErr(ErrInvalidGroupName)
1075 }
1076 }
1077 // if we're not using RE2 compat mode then
1078 // we just behave like normal
1079 fallthrough
1080
1081 default:
1082 p.moveLeft()
1083
1084 nt = ntGroup
1085 // disallow options in the children of a testgroup node
1086 if p.group.t != ntTestgroup {
1087 p.scanOptions()
1088 }
1089 if p.charsRight() == 0 {
1090 goto BreakRecognize
1091 }
1092
1093 if ch = p.moveRightGetChar(); ch == ')' {
1094 return nil, nil
1095 }
1096
1097 if ch != ':' {
1098 goto BreakRecognize
1099 }
1100
1101 }
1102
1103 return newRegexNode(nt, p.options), nil
1104 }
1105
1106BreakRecognize:
1107
1108 // break Recognize comes here
1109
1110 return nil, p.getErr(ErrUnrecognizedGrouping, string(p.pattern[start:p.textpos()]))
1111}
1112
1113// scans backslash specials and basics
1114func (p *parser) scanBackslash(scanOnly bool) (*regexNode, error) {
1115
1116 if p.charsRight() == 0 {
1117 return nil, p.getErr(ErrIllegalEndEscape)
1118 }
1119
1120 switch ch := p.rightChar(0); ch {
1121 case 'b', 'B', 'A', 'G', 'Z', 'z':
1122 p.moveRight(1)
1123 return newRegexNode(p.typeFromCode(ch), p.options), nil
1124
1125 case 'w':
1126 p.moveRight(1)
1127 if p.useOptionE() || p.useRE2() {
1128 return newRegexNodeSet(ntSet, p.options, ECMAWordClass()), nil
1129 }
1130 return newRegexNodeSet(ntSet, p.options, WordClass()), nil
1131
1132 case 'W':
1133 p.moveRight(1)
1134 if p.useOptionE() || p.useRE2() {
1135 return newRegexNodeSet(ntSet, p.options, NotECMAWordClass()), nil
1136 }
1137 return newRegexNodeSet(ntSet, p.options, NotWordClass()), nil
1138
1139 case 's':
1140 p.moveRight(1)
1141 if p.useOptionE() {
1142 return newRegexNodeSet(ntSet, p.options, ECMASpaceClass()), nil
1143 } else if p.useRE2() {
1144 return newRegexNodeSet(ntSet, p.options, RE2SpaceClass()), nil
1145 }
1146 return newRegexNodeSet(ntSet, p.options, SpaceClass()), nil
1147
1148 case 'S':
1149 p.moveRight(1)
1150 if p.useOptionE() {
1151 return newRegexNodeSet(ntSet, p.options, NotECMASpaceClass()), nil
1152 } else if p.useRE2() {
1153 return newRegexNodeSet(ntSet, p.options, NotRE2SpaceClass()), nil
1154 }
1155 return newRegexNodeSet(ntSet, p.options, NotSpaceClass()), nil
1156
1157 case 'd':
1158 p.moveRight(1)
1159 if p.useOptionE() || p.useRE2() {
1160 return newRegexNodeSet(ntSet, p.options, ECMADigitClass()), nil
1161 }
1162 return newRegexNodeSet(ntSet, p.options, DigitClass()), nil
1163
1164 case 'D':
1165 p.moveRight(1)
1166 if p.useOptionE() || p.useRE2() {
1167 return newRegexNodeSet(ntSet, p.options, NotECMADigitClass()), nil
1168 }
1169 return newRegexNodeSet(ntSet, p.options, NotDigitClass()), nil
1170
1171 case 'p', 'P':
1172 p.moveRight(1)
1173 prop, err := p.parseProperty()
1174 if err != nil {
1175 return nil, err
1176 }
1177 cc := &CharSet{}
1178 cc.addCategory(prop, (ch != 'p'), p.useOptionI(), p.patternRaw)
1179 if p.useOptionI() {
1180 cc.addLowercase()
1181 }
1182
1183 return newRegexNodeSet(ntSet, p.options, cc), nil
1184
1185 default:
1186 return p.scanBasicBackslash(scanOnly)
1187 }
1188}
1189
1190// Scans \-style backreferences and character escapes
1191func (p *parser) scanBasicBackslash(scanOnly bool) (*regexNode, error) {
1192 if p.charsRight() == 0 {
1193 return nil, p.getErr(ErrIllegalEndEscape)
1194 }
1195 angled := false
1196 k := false
1197 close := '\x00'
1198
1199 backpos := p.textpos()
1200 ch := p.rightChar(0)
1201
1202 // Allow \k<foo> instead of \<foo>, which is now deprecated.
1203
1204 // According to ECMAScript specification, \k<name> is only parsed as a named group reference if
1205 // there is at least one group name in the regexp.
1206 // See https://www.ecma-international.org/ecma-262/#sec-isvalidregularexpressionliteral, step 7.
1207 // Note, during the first (scanOnly) run we may not have all group names scanned, but that's ok.
1208 if ch == 'k' && (!p.useOptionE() || len(p.capnames) > 0) {
1209 if p.charsRight() >= 2 {
1210 p.moveRight(1)
1211 ch = p.moveRightGetChar()
1212
1213 if ch == '<' || (!p.useOptionE() && ch == '\'') { // No support for \k'name' in ECMAScript
1214 angled = true
1215 if ch == '\'' {
1216 close = '\''
1217 } else {
1218 close = '>'
1219 }
1220 }
1221 }
1222
1223 if !angled || p.charsRight() <= 0 {
1224 return nil, p.getErr(ErrMalformedNameRef)
1225 }
1226
1227 ch = p.rightChar(0)
1228 k = true
1229
1230 } else if !p.useOptionE() && (ch == '<' || ch == '\'') && p.charsRight() > 1 { // Note angle without \g
1231 angled = true
1232 if ch == '\'' {
1233 close = '\''
1234 } else {
1235 close = '>'
1236 }
1237
1238 p.moveRight(1)
1239 ch = p.rightChar(0)
1240 }
1241
1242 // Try to parse backreference: \<1> or \<cap>
1243
1244 if angled && ch >= '0' && ch <= '9' {
1245 capnum, err := p.scanDecimal()
1246 if err != nil {
1247 return nil, err
1248 }
1249
1250 if p.charsRight() > 0 && p.moveRightGetChar() == close {
1251 if p.isCaptureSlot(capnum) {
1252 return newRegexNodeM(ntRef, p.options, capnum), nil
1253 }
1254 return nil, p.getErr(ErrUndefinedBackRef, capnum)
1255 }
1256 } else if !angled && ch >= '1' && ch <= '9' { // Try to parse backreference or octal: \1
1257 capnum, err := p.scanDecimal()
1258 if err != nil {
1259 return nil, err
1260 }
1261
1262 if scanOnly {
1263 return nil, nil
1264 }
1265
1266 if p.isCaptureSlot(capnum) {
1267 return newRegexNodeM(ntRef, p.options, capnum), nil
1268 }
1269 if capnum <= 9 && !p.useOptionE() {
1270 return nil, p.getErr(ErrUndefinedBackRef, capnum)
1271 }
1272
1273 } else if angled {
1274 capname := p.scanCapname()
1275
1276 if capname != "" && p.charsRight() > 0 && p.moveRightGetChar() == close {
1277
1278 if scanOnly {
1279 return nil, nil
1280 }
1281
1282 if p.isCaptureName(capname) {
1283 return newRegexNodeM(ntRef, p.options, p.captureSlotFromName(capname)), nil
1284 }
1285 return nil, p.getErr(ErrUndefinedNameRef, capname)
1286 } else {
1287 if k {
1288 return nil, p.getErr(ErrMalformedNameRef)
1289 }
1290 }
1291 }
1292
1293 // Not backreference: must be char code
1294
1295 p.textto(backpos)
1296 ch, err := p.scanCharEscape()
1297 if err != nil {
1298 return nil, err
1299 }
1300
1301 if scanOnly {
1302 return nil, nil
1303 }
1304
1305 if p.useOptionI() {
1306 ch = unicode.ToLower(ch)
1307 }
1308
1309 return newRegexNodeCh(ntOne, p.options, ch), nil
1310}
1311
1312// Scans X for \p{X} or \P{X}
1313func (p *parser) parseProperty() (string, error) {
1314 if p.charsRight() < 3 {
1315 return "", p.getErr(ErrIncompleteSlashP)
1316 }
1317 ch := p.moveRightGetChar()
1318 if ch != '{' {
1319 return "", p.getErr(ErrMalformedSlashP)
1320 }
1321
1322 startpos := p.textpos()
1323 for p.charsRight() > 0 {
1324 ch = p.moveRightGetChar()
1325 if !(IsWordChar(ch) || ch == '-') {
1326 p.moveLeft()
1327 break
1328 }
1329 }
1330 capname := string(p.pattern[startpos:p.textpos()])
1331
1332 if p.charsRight() == 0 || p.moveRightGetChar() != '}' {
1333 return "", p.getErr(ErrIncompleteSlashP)
1334 }
1335
1336 if !isValidUnicodeCat(capname) {
1337 return "", p.getErr(ErrUnknownSlashP, capname)
1338 }
1339
1340 return capname, nil
1341}
1342
1343// Returns ReNode type for zero-length assertions with a \ code.
1344func (p *parser) typeFromCode(ch rune) nodeType {
1345 switch ch {
1346 case 'b':
1347 if p.useOptionE() {
1348 return ntECMABoundary
1349 }
1350 return ntBoundary
1351 case 'B':
1352 if p.useOptionE() {
1353 return ntNonECMABoundary
1354 }
1355 return ntNonboundary
1356 case 'A':
1357 return ntBeginning
1358 case 'G':
1359 return ntStart
1360 case 'Z':
1361 return ntEndZ
1362 case 'z':
1363 return ntEnd
1364 default:
1365 return ntNothing
1366 }
1367}
1368
1369// Scans whitespace or x-mode comments.
1370func (p *parser) scanBlank() error {
1371 if p.useOptionX() {
1372 for {
1373 for p.charsRight() > 0 && isSpace(p.rightChar(0)) {
1374 p.moveRight(1)
1375 }
1376
1377 if p.charsRight() == 0 {
1378 break
1379 }
1380
1381 if p.rightChar(0) == '#' {
1382 for p.charsRight() > 0 && p.rightChar(0) != '\n' {
1383 p.moveRight(1)
1384 }
1385 } else if p.charsRight() >= 3 && p.rightChar(2) == '#' &&
1386 p.rightChar(1) == '?' && p.rightChar(0) == '(' {
1387 for p.charsRight() > 0 && p.rightChar(0) != ')' {
1388 p.moveRight(1)
1389 }
1390 if p.charsRight() == 0 {
1391 return p.getErr(ErrUnterminatedComment)
1392 }
1393 p.moveRight(1)
1394 } else {
1395 break
1396 }
1397 }
1398 } else {
1399 for {
1400 if p.charsRight() < 3 || p.rightChar(2) != '#' ||
1401 p.rightChar(1) != '?' || p.rightChar(0) != '(' {
1402 return nil
1403 }
1404
1405 for p.charsRight() > 0 && p.rightChar(0) != ')' {
1406 p.moveRight(1)
1407 }
1408 if p.charsRight() == 0 {
1409 return p.getErr(ErrUnterminatedComment)
1410 }
1411 p.moveRight(1)
1412 }
1413 }
1414 return nil
1415}
1416
1417func (p *parser) scanCapname() string {
1418 startpos := p.textpos()
1419
1420 for p.charsRight() > 0 {
1421 if !IsWordChar(p.moveRightGetChar()) {
1422 p.moveLeft()
1423 break
1424 }
1425 }
1426
1427 return string(p.pattern[startpos:p.textpos()])
1428}
1429
1430//Scans contents of [] (not including []'s), and converts to a set.
1431func (p *parser) scanCharSet(caseInsensitive, scanOnly bool) (*CharSet, error) {
1432 ch := '\x00'
1433 chPrev := '\x00'
1434 inRange := false
1435 firstChar := true
1436 closed := false
1437
1438 var cc *CharSet
1439 if !scanOnly {
1440 cc = &CharSet{}
1441 }
1442
1443 if p.charsRight() > 0 && p.rightChar(0) == '^' {
1444 p.moveRight(1)
1445 if !scanOnly {
1446 cc.negate = true
1447 }
1448 }
1449
1450 for ; p.charsRight() > 0; firstChar = false {
1451 fTranslatedChar := false
1452 ch = p.moveRightGetChar()
1453 if ch == ']' {
1454 if !firstChar {
1455 closed = true
1456 break
1457 } else if p.useOptionE() {
1458 if !scanOnly {
1459 cc.addRanges(NoneClass().ranges)
1460 }
1461 closed = true
1462 break
1463 }
1464
1465 } else if ch == '\\' && p.charsRight() > 0 {
1466 switch ch = p.moveRightGetChar(); ch {
1467 case 'D', 'd':
1468 if !scanOnly {
1469 if inRange {
1470 return nil, p.getErr(ErrBadClassInCharRange, ch)
1471 }
1472 cc.addDigit(p.useOptionE() || p.useRE2(), ch == 'D', p.patternRaw)
1473 }
1474 continue
1475
1476 case 'S', 's':
1477 if !scanOnly {
1478 if inRange {
1479 return nil, p.getErr(ErrBadClassInCharRange, ch)
1480 }
1481 cc.addSpace(p.useOptionE(), p.useRE2(), ch == 'S')
1482 }
1483 continue
1484
1485 case 'W', 'w':
1486 if !scanOnly {
1487 if inRange {
1488 return nil, p.getErr(ErrBadClassInCharRange, ch)
1489 }
1490
1491 cc.addWord(p.useOptionE() || p.useRE2(), ch == 'W')
1492 }
1493 continue
1494
1495 case 'p', 'P':
1496 if !scanOnly {
1497 if inRange {
1498 return nil, p.getErr(ErrBadClassInCharRange, ch)
1499 }
1500 prop, err := p.parseProperty()
1501 if err != nil {
1502 return nil, err
1503 }
1504 cc.addCategory(prop, (ch != 'p'), caseInsensitive, p.patternRaw)
1505 } else {
1506 p.parseProperty()
1507 }
1508
1509 continue
1510
1511 case '-':
1512 if !scanOnly {
1513 cc.addRange(ch, ch)
1514 }
1515 continue
1516
1517 default:
1518 p.moveLeft()
1519 var err error
1520 ch, err = p.scanCharEscape() // non-literal character
1521 if err != nil {
1522 return nil, err
1523 }
1524 fTranslatedChar = true
1525 break // this break will only break out of the switch
1526 }
1527 } else if ch == '[' {
1528 // This is code for Posix style properties - [:Ll:] or [:IsTibetan:].
1529 // It currently doesn't do anything other than skip the whole thing!
1530 if p.charsRight() > 0 && p.rightChar(0) == ':' && !inRange {
1531 savePos := p.textpos()
1532
1533 p.moveRight(1)
1534 negate := false
1535 if p.charsRight() > 1 && p.rightChar(0) == '^' {
1536 negate = true
1537 p.moveRight(1)
1538 }
1539
1540 nm := p.scanCapname() // snag the name
1541 if !scanOnly && p.useRE2() {
1542 // look up the name since these are valid for RE2
1543 // add the group based on the name
1544 if ok := cc.addNamedASCII(nm, negate); !ok {
1545 return nil, p.getErr(ErrInvalidCharRange)
1546 }
1547 }
1548 if p.charsRight() < 2 || p.moveRightGetChar() != ':' || p.moveRightGetChar() != ']' {
1549 p.textto(savePos)
1550 } else if p.useRE2() {
1551 // move on
1552 continue
1553 }
1554 }
1555 }
1556
1557 if inRange {
1558 inRange = false
1559 if !scanOnly {
1560 if ch == '[' && !fTranslatedChar && !firstChar {
1561 // We thought we were in a range, but we're actually starting a subtraction.
1562 // In that case, we'll add chPrev to our char class, skip the opening [, and
1563 // scan the new character class recursively.
1564 cc.addChar(chPrev)
1565 sub, err := p.scanCharSet(caseInsensitive, false)
1566 if err != nil {
1567 return nil, err
1568 }
1569 cc.addSubtraction(sub)
1570
1571 if p.charsRight() > 0 && p.rightChar(0) != ']' {
1572 return nil, p.getErr(ErrSubtractionMustBeLast)
1573 }
1574 } else {
1575 // a regular range, like a-z
1576 if chPrev > ch {
1577 return nil, p.getErr(ErrReversedCharRange, chPrev, ch)
1578 }
1579 cc.addRange(chPrev, ch)
1580 }
1581 }
1582 } else if p.charsRight() >= 2 && p.rightChar(0) == '-' && p.rightChar(1) != ']' {
1583 // this could be the start of a range
1584 chPrev = ch
1585 inRange = true
1586 p.moveRight(1)
1587 } else if p.charsRight() >= 1 && ch == '-' && !fTranslatedChar && p.rightChar(0) == '[' && !firstChar {
1588 // we aren't in a range, and now there is a subtraction. Usually this happens
1589 // only when a subtraction follows a range, like [a-z-[b]]
1590 if !scanOnly {
1591 p.moveRight(1)
1592 sub, err := p.scanCharSet(caseInsensitive, false)
1593 if err != nil {
1594 return nil, err
1595 }
1596 cc.addSubtraction(sub)
1597
1598 if p.charsRight() > 0 && p.rightChar(0) != ']' {
1599 return nil, p.getErr(ErrSubtractionMustBeLast)
1600 }
1601 } else {
1602 p.moveRight(1)
1603 p.scanCharSet(caseInsensitive, true)
1604 }
1605 } else {
1606 if !scanOnly {
1607 cc.addRange(ch, ch)
1608 }
1609 }
1610 }
1611
1612 if !closed {
1613 return nil, p.getErr(ErrUnterminatedBracket)
1614 }
1615
1616 if !scanOnly && caseInsensitive {
1617 cc.addLowercase()
1618 }
1619
1620 return cc, nil
1621}
1622
1623// Scans any number of decimal digits (pegs value at 2^31-1 if too large)
1624func (p *parser) scanDecimal() (int, error) {
1625 i := 0
1626 var d int
1627
1628 for p.charsRight() > 0 {
1629 d = int(p.rightChar(0) - '0')
1630 if d < 0 || d > 9 {
1631 break
1632 }
1633 p.moveRight(1)
1634
1635 if i > maxValueDiv10 || (i == maxValueDiv10 && d > maxValueMod10) {
1636 return 0, p.getErr(ErrCaptureGroupOutOfRange)
1637 }
1638
1639 i *= 10
1640 i += d
1641 }
1642
1643 return int(i), nil
1644}
1645
1646// Returns true for options allowed only at the top level
1647func isOnlyTopOption(option RegexOptions) bool {
1648 return option == RightToLeft || option == ECMAScript || option == RE2
1649}
1650
1651// Scans cimsx-cimsx option string, stops at the first unrecognized char.
1652func (p *parser) scanOptions() {
1653
1654 for off := false; p.charsRight() > 0; p.moveRight(1) {
1655 ch := p.rightChar(0)
1656
1657 if ch == '-' {
1658 off = true
1659 } else if ch == '+' {
1660 off = false
1661 } else {
1662 option := optionFromCode(ch)
1663 if option == 0 || isOnlyTopOption(option) {
1664 return
1665 }
1666
1667 if off {
1668 p.options &= ^option
1669 } else {
1670 p.options |= option
1671 }
1672 }
1673 }
1674}
1675
1676// Scans \ code for escape codes that map to single unicode chars.
1677func (p *parser) scanCharEscape() (r rune, err error) {
1678
1679 ch := p.moveRightGetChar()
1680
1681 if ch >= '0' && ch <= '7' {
1682 p.moveLeft()
1683 return p.scanOctal(), nil
1684 }
1685
1686 pos := p.textpos()
1687
1688 switch ch {
1689 case 'x':
1690 // support for \x{HEX} syntax from Perl and PCRE
1691 if p.charsRight() > 0 && p.rightChar(0) == '{' {
1692 if p.useOptionE() {
1693 return ch, nil
1694 }
1695 p.moveRight(1)
1696 return p.scanHexUntilBrace()
1697 } else {
1698 r, err = p.scanHex(2)
1699 }
1700 case 'u':
1701 // ECMAscript suppot \u{HEX} only if `u` is also set
1702 if p.useOptionE() && p.useOptionU() && p.charsRight() > 0 && p.rightChar(0) == '{' {
1703 p.moveRight(1)
1704 return p.scanHexUntilBrace()
1705 } else {
1706 r, err = p.scanHex(4)
1707 }
1708 case 'a':
1709 return '\u0007', nil
1710 case 'b':
1711 return '\b', nil
1712 case 'e':
1713 return '\u001B', nil
1714 case 'f':
1715 return '\f', nil
1716 case 'n':
1717 return '\n', nil
1718 case 'r':
1719 return '\r', nil
1720 case 't':
1721 return '\t', nil
1722 case 'v':
1723 return '\u000B', nil
1724 case 'c':
1725 r, err = p.scanControl()
1726 default:
1727 if !p.useOptionE() && !p.useRE2() && IsWordChar(ch) {
1728 return 0, p.getErr(ErrUnrecognizedEscape, string(ch))
1729 }
1730 return ch, nil
1731 }
1732 if err != nil && p.useOptionE() {
1733 p.textto(pos)
1734 return ch, nil
1735 }
1736 return
1737}
1738
1739// Grabs and converts an ascii control character
1740func (p *parser) scanControl() (rune, error) {
1741 if p.charsRight() <= 0 {
1742 return 0, p.getErr(ErrMissingControl)
1743 }
1744
1745 ch := p.moveRightGetChar()
1746
1747 // \ca interpreted as \cA
1748
1749 if ch >= 'a' && ch <= 'z' {
1750 ch = (ch - ('a' - 'A'))
1751 }
1752 ch = (ch - '@')
1753 if ch >= 0 && ch < ' ' {
1754 return ch, nil
1755 }
1756
1757 return 0, p.getErr(ErrUnrecognizedControl)
1758
1759}
1760
1761// Scan hex digits until we hit a closing brace.
1762// Non-hex digits, hex value too large for UTF-8, or running out of chars are errors
1763func (p *parser) scanHexUntilBrace() (rune, error) {
1764 // PCRE spec reads like unlimited hex digits are allowed, but unicode has a limit
1765 // so we can enforce that
1766 i := 0
1767 hasContent := false
1768
1769 for p.charsRight() > 0 {
1770 ch := p.moveRightGetChar()
1771 if ch == '}' {
1772 // hit our close brace, we're done here
1773 // prevent \x{}
1774 if !hasContent {
1775 return 0, p.getErr(ErrTooFewHex)
1776 }
1777 return rune(i), nil
1778 }
1779 hasContent = true
1780 // no brace needs to be hex digit
1781 d := hexDigit(ch)
1782 if d < 0 {
1783 return 0, p.getErr(ErrMissingBrace)
1784 }
1785
1786 i *= 0x10
1787 i += d
1788
1789 if i > unicode.MaxRune {
1790 return 0, p.getErr(ErrInvalidHex)
1791 }
1792 }
1793
1794 // we only make it here if we run out of digits without finding the brace
1795 return 0, p.getErr(ErrMissingBrace)
1796}
1797
1798// Scans exactly c hex digits (c=2 for \xFF, c=4 for \uFFFF)
1799func (p *parser) scanHex(c int) (rune, error) {
1800
1801 i := 0
1802
1803 if p.charsRight() >= c {
1804 for c > 0 {
1805 d := hexDigit(p.moveRightGetChar())
1806 if d < 0 {
1807 break
1808 }
1809 i *= 0x10
1810 i += d
1811 c--
1812 }
1813 }
1814
1815 if c > 0 {
1816 return 0, p.getErr(ErrTooFewHex)
1817 }
1818
1819 return rune(i), nil
1820}
1821
1822// Returns n <= 0xF for a hex digit.
1823func hexDigit(ch rune) int {
1824
1825 if d := uint(ch - '0'); d <= 9 {
1826 return int(d)
1827 }
1828
1829 if d := uint(ch - 'a'); d <= 5 {
1830 return int(d + 0xa)
1831 }
1832
1833 if d := uint(ch - 'A'); d <= 5 {
1834 return int(d + 0xa)
1835 }
1836
1837 return -1
1838}
1839
1840// Scans up to three octal digits (stops before exceeding 0377).
1841func (p *parser) scanOctal() rune {
1842 // Consume octal chars only up to 3 digits and value 0377
1843
1844 c := 3
1845
1846 if c > p.charsRight() {
1847 c = p.charsRight()
1848 }
1849
1850 //we know the first char is good because the caller had to check
1851 i := 0
1852 d := int(p.rightChar(0) - '0')
1853 for c > 0 && d <= 7 && d >= 0 {
1854 if i >= 0x20 && p.useOptionE() {
1855 break
1856 }
1857 i *= 8
1858 i += d
1859 c--
1860
1861 p.moveRight(1)
1862 if !p.rightMost() {
1863 d = int(p.rightChar(0) - '0')
1864 }
1865 }
1866
1867 // Octal codes only go up to 255. Any larger and the behavior that Perl follows
1868 // is simply to truncate the high bits.
1869 i &= 0xFF
1870
1871 return rune(i)
1872}
1873
1874// Returns the current parsing position.
1875func (p *parser) textpos() int {
1876 return p.currentPos
1877}
1878
1879// Zaps to a specific parsing position.
1880func (p *parser) textto(pos int) {
1881 p.currentPos = pos
1882}
1883
1884// Returns the char at the right of the current parsing position and advances to the right.
1885func (p *parser) moveRightGetChar() rune {
1886 ch := p.pattern[p.currentPos]
1887 p.currentPos++
1888 return ch
1889}
1890
1891// Moves the current position to the right.
1892func (p *parser) moveRight(i int) {
1893 // default would be 1
1894 p.currentPos += i
1895}
1896
1897// Moves the current parsing position one to the left.
1898func (p *parser) moveLeft() {
1899 p.currentPos--
1900}
1901
1902// Returns the char left of the current parsing position.
1903func (p *parser) charAt(i int) rune {
1904 return p.pattern[i]
1905}
1906
1907// Returns the char i chars right of the current parsing position.
1908func (p *parser) rightChar(i int) rune {
1909 // default would be 0
1910 return p.pattern[p.currentPos+i]
1911}
1912
1913// Number of characters to the right of the current parsing position.
1914func (p *parser) charsRight() int {
1915 return len(p.pattern) - p.currentPos
1916}
1917
1918func (p *parser) rightMost() bool {
1919 return p.currentPos == len(p.pattern)
1920}
1921
1922// Looks up the slot number for a given name
1923func (p *parser) captureSlotFromName(capname string) int {
1924 return p.capnames[capname]
1925}
1926
1927// True if the capture slot was noted
1928func (p *parser) isCaptureSlot(i int) bool {
1929 if p.caps != nil {
1930 _, ok := p.caps[i]
1931 return ok
1932 }
1933
1934 return (i >= 0 && i < p.capsize)
1935}
1936
1937// Looks up the slot number for a given name
1938func (p *parser) isCaptureName(capname string) bool {
1939 if p.capnames == nil {
1940 return false
1941 }
1942
1943 _, ok := p.capnames[capname]
1944 return ok
1945}
1946
1947// option shortcuts
1948
1949// True if N option disabling '(' autocapture is on.
1950func (p *parser) useOptionN() bool {
1951 return (p.options & ExplicitCapture) != 0
1952}
1953
1954// True if I option enabling case-insensitivity is on.
1955func (p *parser) useOptionI() bool {
1956 return (p.options & IgnoreCase) != 0
1957}
1958
1959// True if M option altering meaning of $ and ^ is on.
1960func (p *parser) useOptionM() bool {
1961 return (p.options & Multiline) != 0
1962}
1963
1964// True if S option altering meaning of . is on.
1965func (p *parser) useOptionS() bool {
1966 return (p.options & Singleline) != 0
1967}
1968
1969// True if X option enabling whitespace/comment mode is on.
1970func (p *parser) useOptionX() bool {
1971 return (p.options & IgnorePatternWhitespace) != 0
1972}
1973
1974// True if E option enabling ECMAScript behavior on.
1975func (p *parser) useOptionE() bool {
1976 return (p.options & ECMAScript) != 0
1977}
1978
1979// true to use RE2 compatibility parsing behavior.
1980func (p *parser) useRE2() bool {
1981 return (p.options & RE2) != 0
1982}
1983
1984// True if U option enabling ECMAScript's Unicode behavior on.
1985func (p *parser) useOptionU() bool {
1986 return (p.options & Unicode) != 0
1987}
1988
1989// True if options stack is empty.
1990func (p *parser) emptyOptionsStack() bool {
1991 return len(p.optionsStack) == 0
1992}
1993
1994// Finish the current quantifiable (when a quantifier is not found or is not possible)
1995func (p *parser) addConcatenate() {
1996 // The first (| inside a Testgroup group goes directly to the group
1997 p.concatenation.addChild(p.unit)
1998 p.unit = nil
1999}
2000
2001// Finish the current quantifiable (when a quantifier is found)
2002func (p *parser) addConcatenate3(lazy bool, min, max int) {
2003 p.concatenation.addChild(p.unit.makeQuantifier(lazy, min, max))
2004 p.unit = nil
2005}
2006
2007// Sets the current unit to a single char node
2008func (p *parser) addUnitOne(ch rune) {
2009 if p.useOptionI() {
2010 ch = unicode.ToLower(ch)
2011 }
2012
2013 p.unit = newRegexNodeCh(ntOne, p.options, ch)
2014}
2015
2016// Sets the current unit to a single inverse-char node
2017func (p *parser) addUnitNotone(ch rune) {
2018 if p.useOptionI() {
2019 ch = unicode.ToLower(ch)
2020 }
2021
2022 p.unit = newRegexNodeCh(ntNotone, p.options, ch)
2023}
2024
2025// Sets the current unit to a single set node
2026func (p *parser) addUnitSet(set *CharSet) {
2027 p.unit = newRegexNodeSet(ntSet, p.options, set)
2028}
2029
2030// Sets the current unit to a subtree
2031func (p *parser) addUnitNode(node *regexNode) {
2032 p.unit = node
2033}
2034
2035// Sets the current unit to an assertion of the specified type
2036func (p *parser) addUnitType(t nodeType) {
2037 p.unit = newRegexNode(t, p.options)
2038}
2039
2040// Finish the current group (in response to a ')' or end)
2041func (p *parser) addGroup() error {
2042 if p.group.t == ntTestgroup || p.group.t == ntTestref {
2043 p.group.addChild(p.concatenation.reverseLeft())
2044 if (p.group.t == ntTestref && len(p.group.children) > 2) || len(p.group.children) > 3 {
2045 return p.getErr(ErrTooManyAlternates)
2046 }
2047 } else {
2048 p.alternation.addChild(p.concatenation.reverseLeft())
2049 p.group.addChild(p.alternation)
2050 }
2051
2052 p.unit = p.group
2053 return nil
2054}
2055
2056// Pops the option stack, but keeps the current options unchanged.
2057func (p *parser) popKeepOptions() {
2058 lastIdx := len(p.optionsStack) - 1
2059 p.optionsStack = p.optionsStack[:lastIdx]
2060}
2061
2062// Recalls options from the stack.
2063func (p *parser) popOptions() {
2064 lastIdx := len(p.optionsStack) - 1
2065 // get the last item on the stack and then remove it by reslicing
2066 p.options = p.optionsStack[lastIdx]
2067 p.optionsStack = p.optionsStack[:lastIdx]
2068}
2069
2070// Saves options on a stack.
2071func (p *parser) pushOptions() {
2072 p.optionsStack = append(p.optionsStack, p.options)
2073}
2074
2075// Add a string to the last concatenate.
2076func (p *parser) addToConcatenate(pos, cch int, isReplacement bool) {
2077 var node *regexNode
2078
2079 if cch == 0 {
2080 return
2081 }
2082
2083 if cch > 1 {
2084 str := make([]rune, cch)
2085 copy(str, p.pattern[pos:pos+cch])
2086
2087 if p.useOptionI() && !isReplacement {
2088 // We do the ToLower character by character for consistency. With surrogate chars, doing
2089 // a ToLower on the entire string could actually change the surrogate pair. This is more correct
2090 // linguistically, but since Regex doesn't support surrogates, it's more important to be
2091 // consistent.
2092 for i := 0; i < len(str); i++ {
2093 str[i] = unicode.ToLower(str[i])
2094 }
2095 }
2096
2097 node = newRegexNodeStr(ntMulti, p.options, str)
2098 } else {
2099 ch := p.charAt(pos)
2100
2101 if p.useOptionI() && !isReplacement {
2102 ch = unicode.ToLower(ch)
2103 }
2104
2105 node = newRegexNodeCh(ntOne, p.options, ch)
2106 }
2107
2108 p.concatenation.addChild(node)
2109}
2110
2111// Push the parser state (in response to an open paren)
2112func (p *parser) pushGroup() {
2113 p.group.next = p.stack
2114 p.alternation.next = p.group
2115 p.concatenation.next = p.alternation
2116 p.stack = p.concatenation
2117}
2118
2119// Remember the pushed state (in response to a ')')
2120func (p *parser) popGroup() error {
2121 p.concatenation = p.stack
2122 p.alternation = p.concatenation.next
2123 p.group = p.alternation.next
2124 p.stack = p.group.next
2125
2126 // The first () inside a Testgroup group goes directly to the group
2127 if p.group.t == ntTestgroup && len(p.group.children) == 0 {
2128 if p.unit == nil {
2129 return p.getErr(ErrConditionalExpression)
2130 }
2131
2132 p.group.addChild(p.unit)
2133 p.unit = nil
2134 }
2135 return nil
2136}
2137
2138// True if the group stack is empty.
2139func (p *parser) emptyStack() bool {
2140 return p.stack == nil
2141}
2142
2143// Start a new round for the parser state (in response to an open paren or string start)
2144func (p *parser) startGroup(openGroup *regexNode) {
2145 p.group = openGroup
2146 p.alternation = newRegexNode(ntAlternate, p.options)
2147 p.concatenation = newRegexNode(ntConcatenate, p.options)
2148}
2149
2150// Finish the current concatenation (in response to a |)
2151func (p *parser) addAlternate() {
2152 // The | parts inside a Testgroup group go directly to the group
2153
2154 if p.group.t == ntTestgroup || p.group.t == ntTestref {
2155 p.group.addChild(p.concatenation.reverseLeft())
2156 } else {
2157 p.alternation.addChild(p.concatenation.reverseLeft())
2158 }
2159
2160 p.concatenation = newRegexNode(ntConcatenate, p.options)
2161}
2162
2163// For categorizing ascii characters.
2164
2165const (
2166 Q byte = 5 // quantifier
2167 S = 4 // ordinary stopper
2168 Z = 3 // ScanBlank stopper
2169 X = 2 // whitespace
2170 E = 1 // should be escaped
2171)
2172
2173var _category = []byte{
2174 //01 2 3 4 5 6 7 8 9 A B C D E F 0 1 2 3 4 5 6 7 8 9 A B C D E F
2175 0, 0, 0, 0, 0, 0, 0, 0, 0, X, X, X, X, X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
2176 // ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 5 6 7 8 9 : ; < = > ?
2177 X, 0, 0, Z, S, 0, 0, 0, S, S, Q, Q, 0, 0, S, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, Q,
2178 //@A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^ _
2179 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, S, S, 0, S, 0,
2180 //'a b c d e f g h i j k l m n o p q r s t u v w x y z { | } ~
2181 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, Q, S, 0, 0, 0,
2182}
2183
2184func isSpace(ch rune) bool {
2185 return (ch <= ' ' && _category[ch] == X)
2186}
2187
2188// Returns true for those characters that terminate a string of ordinary chars.
2189func isSpecial(ch rune) bool {
2190 return (ch <= '|' && _category[ch] >= S)
2191}
2192
2193// Returns true for those characters that terminate a string of ordinary chars.
2194func isStopperX(ch rune) bool {
2195 return (ch <= '|' && _category[ch] >= X)
2196}
2197
2198// Returns true for those characters that begin a quantifier.
2199func isQuantifier(ch rune) bool {
2200 return (ch <= '{' && _category[ch] >= Q)
2201}
2202
2203func (p *parser) isTrueQuantifier() bool {
2204 nChars := p.charsRight()
2205 if nChars == 0 {
2206 return false
2207 }
2208
2209 startpos := p.textpos()
2210 ch := p.charAt(startpos)
2211 if ch != '{' {
2212 return ch <= '{' && _category[ch] >= Q
2213 }
2214
2215 //UGLY: this is ugly -- the original code was ugly too
2216 pos := startpos
2217 for {
2218 nChars--
2219 if nChars <= 0 {
2220 break
2221 }
2222 pos++
2223 ch = p.charAt(pos)
2224 if ch < '0' || ch > '9' {
2225 break
2226 }
2227 }
2228
2229 if nChars == 0 || pos-startpos == 1 {
2230 return false
2231 }
2232 if ch == '}' {
2233 return true
2234 }
2235 if ch != ',' {
2236 return false
2237 }
2238 for {
2239 nChars--
2240 if nChars <= 0 {
2241 break
2242 }
2243 pos++
2244 ch = p.charAt(pos)
2245 if ch < '0' || ch > '9' {
2246 break
2247 }
2248 }
2249
2250 return nChars > 0 && ch == '}'
2251}