1package syntax
   2
   3import (
   4	"fmt"
   5	"math"
   6	"os"
   7	"sort"
   8	"strconv"
   9	"unicode"
  10)
  11
  12type RegexOptions int32
  13
  14const (
  15	IgnoreCase              RegexOptions = 0x0001 // "i"
  16	Multiline                            = 0x0002 // "m"
  17	ExplicitCapture                      = 0x0004 // "n"
  18	Compiled                             = 0x0008 // "c"
  19	Singleline                           = 0x0010 // "s"
  20	IgnorePatternWhitespace              = 0x0020 // "x"
  21	RightToLeft                          = 0x0040 // "r"
  22	Debug                                = 0x0080 // "d"
  23	ECMAScript                           = 0x0100 // "e"
  24	RE2                                  = 0x0200 // RE2 compat mode
  25	Unicode                              = 0x0400 // "u"
  26)
  27
  28func optionFromCode(ch rune) RegexOptions {
  29	// case-insensitive
  30	switch ch {
  31	case 'i', 'I':
  32		return IgnoreCase
  33	case 'r', 'R':
  34		return RightToLeft
  35	case 'm', 'M':
  36		return Multiline
  37	case 'n', 'N':
  38		return ExplicitCapture
  39	case 's', 'S':
  40		return Singleline
  41	case 'x', 'X':
  42		return IgnorePatternWhitespace
  43	case 'd', 'D':
  44		return Debug
  45	case 'e', 'E':
  46		return ECMAScript
  47	case 'u', 'U':
  48		return Unicode
  49	default:
  50		return 0
  51	}
  52}
  53
  54// An Error describes a failure to parse a regular expression
  55// and gives the offending expression.
  56type Error struct {
  57	Code ErrorCode
  58	Expr string
  59	Args []interface{}
  60}
  61
  62func (e *Error) Error() string {
  63	if len(e.Args) == 0 {
  64		return "error parsing regexp: " + e.Code.String() + " in `" + e.Expr + "`"
  65	}
  66	return "error parsing regexp: " + fmt.Sprintf(e.Code.String(), e.Args...) + " in `" + e.Expr + "`"
  67}
  68
  69// An ErrorCode describes a failure to parse a regular expression.
  70type ErrorCode string
  71
  72const (
  73	// internal issue
  74	ErrInternalError ErrorCode = "regexp/syntax: internal error"
  75	// Parser errors
  76	ErrUnterminatedComment        = "unterminated comment"
  77	ErrInvalidCharRange           = "invalid character class range"
  78	ErrInvalidRepeatSize          = "invalid repeat count"
  79	ErrInvalidUTF8                = "invalid UTF-8"
  80	ErrCaptureGroupOutOfRange     = "capture group number out of range"
  81	ErrUnexpectedParen            = "unexpected )"
  82	ErrMissingParen               = "missing closing )"
  83	ErrMissingBrace               = "missing closing }"
  84	ErrInvalidRepeatOp            = "invalid nested repetition operator"
  85	ErrMissingRepeatArgument      = "missing argument to repetition operator"
  86	ErrConditionalExpression      = "illegal conditional (?(...)) expression"
  87	ErrTooManyAlternates          = "too many | in (?()|)"
  88	ErrUnrecognizedGrouping       = "unrecognized grouping construct: (%v"
  89	ErrInvalidGroupName           = "invalid group name: group names must begin with a word character and have a matching terminator"
  90	ErrCapNumNotZero              = "capture number cannot be zero"
  91	ErrUndefinedBackRef           = "reference to undefined group number %v"
  92	ErrUndefinedNameRef           = "reference to undefined group name %v"
  93	ErrAlternationCantCapture     = "alternation conditions do not capture and cannot be named"
  94	ErrAlternationCantHaveComment = "alternation conditions cannot be comments"
  95	ErrMalformedReference         = "(?(%v) ) malformed"
  96	ErrUndefinedReference         = "(?(%v) ) reference to undefined group"
  97	ErrIllegalEndEscape           = "illegal \\ at end of pattern"
  98	ErrMalformedSlashP            = "malformed \\p{X} character escape"
  99	ErrIncompleteSlashP           = "incomplete \\p{X} character escape"
 100	ErrUnknownSlashP              = "unknown unicode category, script, or property '%v'"
 101	ErrUnrecognizedEscape         = "unrecognized escape sequence \\%v"
 102	ErrMissingControl             = "missing control character"
 103	ErrUnrecognizedControl        = "unrecognized control character"
 104	ErrTooFewHex                  = "insufficient hexadecimal digits"
 105	ErrInvalidHex                 = "hex values may not be larger than 0x10FFFF"
 106	ErrMalformedNameRef           = "malformed \\k<...> named back reference"
 107	ErrBadClassInCharRange        = "cannot include class \\%v in character range"
 108	ErrUnterminatedBracket        = "unterminated [] set"
 109	ErrSubtractionMustBeLast      = "a subtraction must be the last element in a character class"
 110	ErrReversedCharRange          = "[%c-%c] range in reverse order"
 111)
 112
 113func (e ErrorCode) String() string {
 114	return string(e)
 115}
 116
 117type parser struct {
 118	stack         *regexNode
 119	group         *regexNode
 120	alternation   *regexNode
 121	concatenation *regexNode
 122	unit          *regexNode
 123
 124	patternRaw string
 125	pattern    []rune
 126
 127	currentPos  int
 128	specialCase *unicode.SpecialCase
 129
 130	autocap  int
 131	capcount int
 132	captop   int
 133	capsize  int
 134
 135	caps     map[int]int
 136	capnames map[string]int
 137
 138	capnumlist  []int
 139	capnamelist []string
 140
 141	options         RegexOptions
 142	optionsStack    []RegexOptions
 143	ignoreNextParen bool
 144}
 145
 146const (
 147	maxValueDiv10 int = math.MaxInt32 / 10
 148	maxValueMod10     = math.MaxInt32 % 10
 149)
 150
 151// Parse converts a regex string into a parse tree
 152func Parse(re string, op RegexOptions) (*RegexTree, error) {
 153	p := parser{
 154		options: op,
 155		caps:    make(map[int]int),
 156	}
 157	p.setPattern(re)
 158
 159	if err := p.countCaptures(); err != nil {
 160		return nil, err
 161	}
 162
 163	p.reset(op)
 164	root, err := p.scanRegex()
 165
 166	if err != nil {
 167		return nil, err
 168	}
 169	tree := &RegexTree{
 170		root:       root,
 171		caps:       p.caps,
 172		capnumlist: p.capnumlist,
 173		captop:     p.captop,
 174		Capnames:   p.capnames,
 175		Caplist:    p.capnamelist,
 176		options:    op,
 177	}
 178
 179	if tree.options&Debug > 0 {
 180		os.Stdout.WriteString(tree.Dump())
 181	}
 182
 183	return tree, nil
 184}
 185
 186func (p *parser) setPattern(pattern string) {
 187	p.patternRaw = pattern
 188	p.pattern = make([]rune, 0, len(pattern))
 189
 190	//populate our rune array to handle utf8 encoding
 191	for _, r := range pattern {
 192		p.pattern = append(p.pattern, r)
 193	}
 194}
 195func (p *parser) getErr(code ErrorCode, args ...interface{}) error {
 196	return &Error{Code: code, Expr: p.patternRaw, Args: args}
 197}
 198
 199func (p *parser) noteCaptureSlot(i, pos int) {
 200	if _, ok := p.caps[i]; !ok {
 201		// the rhs of the hashtable isn't used in the parser
 202		p.caps[i] = pos
 203		p.capcount++
 204
 205		if p.captop <= i {
 206			if i == math.MaxInt32 {
 207				p.captop = i
 208			} else {
 209				p.captop = i + 1
 210			}
 211		}
 212	}
 213}
 214
 215func (p *parser) noteCaptureName(name string, pos int) {
 216	if p.capnames == nil {
 217		p.capnames = make(map[string]int)
 218	}
 219
 220	if _, ok := p.capnames[name]; !ok {
 221		p.capnames[name] = pos
 222		p.capnamelist = append(p.capnamelist, name)
 223	}
 224}
 225
 226func (p *parser) assignNameSlots() {
 227	if p.capnames != nil {
 228		for _, name := range p.capnamelist {
 229			for p.isCaptureSlot(p.autocap) {
 230				p.autocap++
 231			}
 232			pos := p.capnames[name]
 233			p.capnames[name] = p.autocap
 234			p.noteCaptureSlot(p.autocap, pos)
 235
 236			p.autocap++
 237		}
 238	}
 239
 240	// if the caps array has at least one gap, construct the list of used slots
 241	if p.capcount < p.captop {
 242		p.capnumlist = make([]int, p.capcount)
 243		i := 0
 244
 245		for k := range p.caps {
 246			p.capnumlist[i] = k
 247			i++
 248		}
 249
 250		sort.Ints(p.capnumlist)
 251	}
 252
 253	// merge capsnumlist into capnamelist
 254	if p.capnames != nil || p.capnumlist != nil {
 255		var oldcapnamelist []string
 256		var next int
 257		var k int
 258
 259		if p.capnames == nil {
 260			oldcapnamelist = nil
 261			p.capnames = make(map[string]int)
 262			p.capnamelist = []string{}
 263			next = -1
 264		} else {
 265			oldcapnamelist = p.capnamelist
 266			p.capnamelist = []string{}
 267			next = p.capnames[oldcapnamelist[0]]
 268		}
 269
 270		for i := 0; i < p.capcount; i++ {
 271			j := i
 272			if p.capnumlist != nil {
 273				j = p.capnumlist[i]
 274			}
 275
 276			if next == j {
 277				p.capnamelist = append(p.capnamelist, oldcapnamelist[k])
 278				k++
 279
 280				if k == len(oldcapnamelist) {
 281					next = -1
 282				} else {
 283					next = p.capnames[oldcapnamelist[k]]
 284				}
 285
 286			} else {
 287				//feature: culture?
 288				str := strconv.Itoa(j)
 289				p.capnamelist = append(p.capnamelist, str)
 290				p.capnames[str] = j
 291			}
 292		}
 293	}
 294}
 295
 296func (p *parser) consumeAutocap() int {
 297	r := p.autocap
 298	p.autocap++
 299	return r
 300}
 301
 302// CountCaptures is a prescanner for deducing the slots used for
 303// captures by doing a partial tokenization of the pattern.
 304func (p *parser) countCaptures() error {
 305	var ch rune
 306
 307	p.noteCaptureSlot(0, 0)
 308
 309	p.autocap = 1
 310
 311	for p.charsRight() > 0 {
 312		pos := p.textpos()
 313		ch = p.moveRightGetChar()
 314		switch ch {
 315		case '\\':
 316			if p.charsRight() > 0 {
 317				p.scanBackslash(true)
 318			}
 319
 320		case '#':
 321			if p.useOptionX() {
 322				p.moveLeft()
 323				p.scanBlank()
 324			}
 325
 326		case '[':
 327			p.scanCharSet(false, true)
 328
 329		case ')':
 330			if !p.emptyOptionsStack() {
 331				p.popOptions()
 332			}
 333
 334		case '(':
 335			if p.charsRight() >= 2 && p.rightChar(1) == '#' && p.rightChar(0) == '?' {
 336				p.moveLeft()
 337				p.scanBlank()
 338			} else {
 339				p.pushOptions()
 340				if p.charsRight() > 0 && p.rightChar(0) == '?' {
 341					// we have (?...
 342					p.moveRight(1)
 343
 344					if p.charsRight() > 1 && (p.rightChar(0) == '<' || p.rightChar(0) == '\'') {
 345						// named group: (?<... or (?'...
 346
 347						p.moveRight(1)
 348						ch = p.rightChar(0)
 349
 350						if ch != '0' && IsWordChar(ch) {
 351							if ch >= '1' && ch <= '9' {
 352								dec, err := p.scanDecimal()
 353								if err != nil {
 354									return err
 355								}
 356								p.noteCaptureSlot(dec, pos)
 357							} else {
 358								p.noteCaptureName(p.scanCapname(), pos)
 359							}
 360						}
 361					} else if p.useRE2() && p.charsRight() > 2 && (p.rightChar(0) == 'P' && p.rightChar(1) == '<') {
 362						// RE2-compat (?P<)
 363						p.moveRight(2)
 364						ch = p.rightChar(0)
 365						if IsWordChar(ch) {
 366							p.noteCaptureName(p.scanCapname(), pos)
 367						}
 368
 369					} else {
 370						// (?...
 371
 372						// get the options if it's an option construct (?cimsx-cimsx...)
 373						p.scanOptions()
 374
 375						if p.charsRight() > 0 {
 376							if p.rightChar(0) == ')' {
 377								// (?cimsx-cimsx)
 378								p.moveRight(1)
 379								p.popKeepOptions()
 380							} else if p.rightChar(0) == '(' {
 381								// alternation construct: (?(foo)yes|no)
 382								// ignore the next paren so we don't capture the condition
 383								p.ignoreNextParen = true
 384
 385								// break from here so we don't reset ignoreNextParen
 386								continue
 387							}
 388						}
 389					}
 390				} else {
 391					if !p.useOptionN() && !p.ignoreNextParen {
 392						p.noteCaptureSlot(p.consumeAutocap(), pos)
 393					}
 394				}
 395			}
 396
 397			p.ignoreNextParen = false
 398
 399		}
 400	}
 401
 402	p.assignNameSlots()
 403	return nil
 404}
 405
 406func (p *parser) reset(topopts RegexOptions) {
 407	p.currentPos = 0
 408	p.autocap = 1
 409	p.ignoreNextParen = false
 410
 411	if len(p.optionsStack) > 0 {
 412		p.optionsStack = p.optionsStack[:0]
 413	}
 414
 415	p.options = topopts
 416	p.stack = nil
 417}
 418
 419func (p *parser) scanRegex() (*regexNode, error) {
 420	ch := '@' // nonspecial ch, means at beginning
 421	isQuant := false
 422
 423	p.startGroup(newRegexNodeMN(ntCapture, p.options, 0, -1))
 424
 425	for p.charsRight() > 0 {
 426		wasPrevQuantifier := isQuant
 427		isQuant = false
 428
 429		if err := p.scanBlank(); err != nil {
 430			return nil, err
 431		}
 432
 433		startpos := p.textpos()
 434
 435		// move past all of the normal characters.  We'll stop when we hit some kind of control character,
 436		// or if IgnorePatternWhiteSpace is on, we'll stop when we see some whitespace.
 437		if p.useOptionX() {
 438			for p.charsRight() > 0 {
 439				ch = p.rightChar(0)
 440				//UGLY: clean up, this is ugly
 441				if !(!isStopperX(ch) || (ch == '{' && !p.isTrueQuantifier())) {
 442					break
 443				}
 444				p.moveRight(1)
 445			}
 446		} else {
 447			for p.charsRight() > 0 {
 448				ch = p.rightChar(0)
 449				if !(!isSpecial(ch) || ch == '{' && !p.isTrueQuantifier()) {
 450					break
 451				}
 452				p.moveRight(1)
 453			}
 454		}
 455
 456		endpos := p.textpos()
 457
 458		p.scanBlank()
 459
 460		if p.charsRight() == 0 {
 461			ch = '!' // nonspecial, means at end
 462		} else if ch = p.rightChar(0); isSpecial(ch) {
 463			isQuant = isQuantifier(ch)
 464			p.moveRight(1)
 465		} else {
 466			ch = ' ' // nonspecial, means at ordinary char
 467		}
 468
 469		if startpos < endpos {
 470			cchUnquantified := endpos - startpos
 471			if isQuant {
 472				cchUnquantified--
 473			}
 474			wasPrevQuantifier = false
 475
 476			if cchUnquantified > 0 {
 477				p.addToConcatenate(startpos, cchUnquantified, false)
 478			}
 479
 480			if isQuant {
 481				p.addUnitOne(p.charAt(endpos - 1))
 482			}
 483		}
 484
 485		switch ch {
 486		case '!':
 487			goto BreakOuterScan
 488
 489		case ' ':
 490			goto ContinueOuterScan
 491
 492		case '[':
 493			cc, err := p.scanCharSet(p.useOptionI(), false)
 494			if err != nil {
 495				return nil, err
 496			}
 497			p.addUnitSet(cc)
 498
 499		case '(':
 500			p.pushOptions()
 501
 502			if grouper, err := p.scanGroupOpen(); err != nil {
 503				return nil, err
 504			} else if grouper == nil {
 505				p.popKeepOptions()
 506			} else {
 507				p.pushGroup()
 508				p.startGroup(grouper)
 509			}
 510
 511			continue
 512
 513		case '|':
 514			p.addAlternate()
 515			goto ContinueOuterScan
 516
 517		case ')':
 518			if p.emptyStack() {
 519				return nil, p.getErr(ErrUnexpectedParen)
 520			}
 521
 522			if err := p.addGroup(); err != nil {
 523				return nil, err
 524			}
 525			if err := p.popGroup(); err != nil {
 526				return nil, err
 527			}
 528			p.popOptions()
 529
 530			if p.unit == nil {
 531				goto ContinueOuterScan
 532			}
 533
 534		case '\\':
 535			n, err := p.scanBackslash(false)
 536			if err != nil {
 537				return nil, err
 538			}
 539			p.addUnitNode(n)
 540
 541		case '^':
 542			if p.useOptionM() {
 543				p.addUnitType(ntBol)
 544			} else {
 545				p.addUnitType(ntBeginning)
 546			}
 547
 548		case '$':
 549			if p.useOptionM() {
 550				p.addUnitType(ntEol)
 551			} else {
 552				p.addUnitType(ntEndZ)
 553			}
 554
 555		case '.':
 556			if p.useOptionE() {
 557				p.addUnitSet(ECMAAnyClass())
 558			} else if p.useOptionS() {
 559				p.addUnitSet(AnyClass())
 560			} else {
 561				p.addUnitNotone('\n')
 562			}
 563
 564		case '{', '*', '+', '?':
 565			if p.unit == nil {
 566				if wasPrevQuantifier {
 567					return nil, p.getErr(ErrInvalidRepeatOp)
 568				} else {
 569					return nil, p.getErr(ErrMissingRepeatArgument)
 570				}
 571			}
 572			p.moveLeft()
 573
 574		default:
 575			return nil, p.getErr(ErrInternalError)
 576		}
 577
 578		if err := p.scanBlank(); err != nil {
 579			return nil, err
 580		}
 581
 582		if p.charsRight() > 0 {
 583			isQuant = p.isTrueQuantifier()
 584		}
 585		if p.charsRight() == 0 || !isQuant {
 586			//maintain odd C# assignment order -- not sure if required, could clean up?
 587			p.addConcatenate()
 588			goto ContinueOuterScan
 589		}
 590
 591		ch = p.moveRightGetChar()
 592
 593		// Handle quantifiers
 594		for p.unit != nil {
 595			var min, max int
 596			var lazy bool
 597
 598			switch ch {
 599			case '*':
 600				min = 0
 601				max = math.MaxInt32
 602
 603			case '?':
 604				min = 0
 605				max = 1
 606
 607			case '+':
 608				min = 1
 609				max = math.MaxInt32
 610
 611			case '{':
 612				{
 613					var err error
 614					startpos = p.textpos()
 615					if min, err = p.scanDecimal(); err != nil {
 616						return nil, err
 617					}
 618					max = min
 619					if startpos < p.textpos() {
 620						if p.charsRight() > 0 && p.rightChar(0) == ',' {
 621							p.moveRight(1)
 622							if p.charsRight() == 0 || p.rightChar(0) == '}' {
 623								max = math.MaxInt32
 624							} else {
 625								if max, err = p.scanDecimal(); err != nil {
 626									return nil, err
 627								}
 628							}
 629						}
 630					}
 631
 632					if startpos == p.textpos() || p.charsRight() == 0 || p.moveRightGetChar() != '}' {
 633						p.addConcatenate()
 634						p.textto(startpos - 1)
 635						goto ContinueOuterScan
 636					}
 637				}
 638
 639			default:
 640				return nil, p.getErr(ErrInternalError)
 641			}
 642
 643			if err := p.scanBlank(); err != nil {
 644				return nil, err
 645			}
 646
 647			if p.charsRight() == 0 || p.rightChar(0) != '?' {
 648				lazy = false
 649			} else {
 650				p.moveRight(1)
 651				lazy = true
 652			}
 653
 654			if min > max {
 655				return nil, p.getErr(ErrInvalidRepeatSize)
 656			}
 657
 658			p.addConcatenate3(lazy, min, max)
 659		}
 660
 661	ContinueOuterScan:
 662	}
 663
 664BreakOuterScan:
 665	;
 666
 667	if !p.emptyStack() {
 668		return nil, p.getErr(ErrMissingParen)
 669	}
 670
 671	if err := p.addGroup(); err != nil {
 672		return nil, err
 673	}
 674
 675	return p.unit, nil
 676
 677}
 678
 679/*
 680 * Simple parsing for replacement patterns
 681 */
 682func (p *parser) scanReplacement() (*regexNode, error) {
 683	var c, startpos int
 684
 685	p.concatenation = newRegexNode(ntConcatenate, p.options)
 686
 687	for {
 688		c = p.charsRight()
 689		if c == 0 {
 690			break
 691		}
 692
 693		startpos = p.textpos()
 694
 695		for c > 0 && p.rightChar(0) != '$' {
 696			p.moveRight(1)
 697			c--
 698		}
 699
 700		p.addToConcatenate(startpos, p.textpos()-startpos, true)
 701
 702		if c > 0 {
 703			if p.moveRightGetChar() == '$' {
 704				n, err := p.scanDollar()
 705				if err != nil {
 706					return nil, err
 707				}
 708				p.addUnitNode(n)
 709			}
 710			p.addConcatenate()
 711		}
 712	}
 713
 714	return p.concatenation, nil
 715}
 716
 717/*
 718 * Scans $ patterns recognized within replacement patterns
 719 */
 720func (p *parser) scanDollar() (*regexNode, error) {
 721	if p.charsRight() == 0 {
 722		return newRegexNodeCh(ntOne, p.options, '$'), nil
 723	}
 724
 725	ch := p.rightChar(0)
 726	angled := false
 727	backpos := p.textpos()
 728	lastEndPos := backpos
 729
 730	// Note angle
 731
 732	if ch == '{' && p.charsRight() > 1 {
 733		angled = true
 734		p.moveRight(1)
 735		ch = p.rightChar(0)
 736	}
 737
 738	// Try to parse backreference: \1 or \{1} or \{cap}
 739
 740	if ch >= '0' && ch <= '9' {
 741		if !angled && p.useOptionE() {
 742			capnum := -1
 743			newcapnum := int(ch - '0')
 744			p.moveRight(1)
 745			if p.isCaptureSlot(newcapnum) {
 746				capnum = newcapnum
 747				lastEndPos = p.textpos()
 748			}
 749
 750			for p.charsRight() > 0 {
 751				ch = p.rightChar(0)
 752				if ch < '0' || ch > '9' {
 753					break
 754				}
 755				digit := int(ch - '0')
 756				if newcapnum > maxValueDiv10 || (newcapnum == maxValueDiv10 && digit > maxValueMod10) {
 757					return nil, p.getErr(ErrCaptureGroupOutOfRange)
 758				}
 759
 760				newcapnum = newcapnum*10 + digit
 761
 762				p.moveRight(1)
 763				if p.isCaptureSlot(newcapnum) {
 764					capnum = newcapnum
 765					lastEndPos = p.textpos()
 766				}
 767			}
 768			p.textto(lastEndPos)
 769			if capnum >= 0 {
 770				return newRegexNodeM(ntRef, p.options, capnum), nil
 771			}
 772		} else {
 773			capnum, err := p.scanDecimal()
 774			if err != nil {
 775				return nil, err
 776			}
 777			if !angled || p.charsRight() > 0 && p.moveRightGetChar() == '}' {
 778				if p.isCaptureSlot(capnum) {
 779					return newRegexNodeM(ntRef, p.options, capnum), nil
 780				}
 781			}
 782		}
 783	} else if angled && IsWordChar(ch) {
 784		capname := p.scanCapname()
 785
 786		if p.charsRight() > 0 && p.moveRightGetChar() == '}' {
 787			if p.isCaptureName(capname) {
 788				return newRegexNodeM(ntRef, p.options, p.captureSlotFromName(capname)), nil
 789			}
 790		}
 791	} else if !angled {
 792		capnum := 1
 793
 794		switch ch {
 795		case '$':
 796			p.moveRight(1)
 797			return newRegexNodeCh(ntOne, p.options, '$'), nil
 798		case '&':
 799			capnum = 0
 800		case '`':
 801			capnum = replaceLeftPortion
 802		case '\'':
 803			capnum = replaceRightPortion
 804		case '+':
 805			capnum = replaceLastGroup
 806		case '_':
 807			capnum = replaceWholeString
 808		}
 809
 810		if capnum != 1 {
 811			p.moveRight(1)
 812			return newRegexNodeM(ntRef, p.options, capnum), nil
 813		}
 814	}
 815
 816	// unrecognized $: literalize
 817
 818	p.textto(backpos)
 819	return newRegexNodeCh(ntOne, p.options, '$'), nil
 820}
 821
 822// scanGroupOpen scans chars following a '(' (not counting the '('), and returns
 823// a RegexNode for the type of group scanned, or nil if the group
 824// simply changed options (?cimsx-cimsx) or was a comment (#...).
 825func (p *parser) scanGroupOpen() (*regexNode, error) {
 826	var ch rune
 827	var nt nodeType
 828	var err error
 829	close := '>'
 830	start := p.textpos()
 831
 832	// just return a RegexNode if we have:
 833	// 1. "(" followed by nothing
 834	// 2. "(x" where x != ?
 835	// 3. "(?)"
 836	if p.charsRight() == 0 || p.rightChar(0) != '?' || (p.rightChar(0) == '?' && (p.charsRight() > 1 && p.rightChar(1) == ')')) {
 837		if p.useOptionN() || p.ignoreNextParen {
 838			p.ignoreNextParen = false
 839			return newRegexNode(ntGroup, p.options), nil
 840		}
 841		return newRegexNodeMN(ntCapture, p.options, p.consumeAutocap(), -1), nil
 842	}
 843
 844	p.moveRight(1)
 845
 846	for {
 847		if p.charsRight() == 0 {
 848			break
 849		}
 850
 851		switch ch = p.moveRightGetChar(); ch {
 852		case ':':
 853			nt = ntGroup
 854
 855		case '=':
 856			p.options &= ^RightToLeft
 857			nt = ntRequire
 858
 859		case '!':
 860			p.options &= ^RightToLeft
 861			nt = ntPrevent
 862
 863		case '>':
 864			nt = ntGreedy
 865
 866		case '\'':
 867			close = '\''
 868			fallthrough
 869
 870		case '<':
 871			if p.charsRight() == 0 {
 872				goto BreakRecognize
 873			}
 874
 875			switch ch = p.moveRightGetChar(); ch {
 876			case '=':
 877				if close == '\'' {
 878					goto BreakRecognize
 879				}
 880
 881				p.options |= RightToLeft
 882				nt = ntRequire
 883
 884			case '!':
 885				if close == '\'' {
 886					goto BreakRecognize
 887				}
 888
 889				p.options |= RightToLeft
 890				nt = ntPrevent
 891
 892			default:
 893				p.moveLeft()
 894				capnum := -1
 895				uncapnum := -1
 896				proceed := false
 897
 898				// grab part before -
 899
 900				if ch >= '0' && ch <= '9' {
 901					if capnum, err = p.scanDecimal(); err != nil {
 902						return nil, err
 903					}
 904
 905					if !p.isCaptureSlot(capnum) {
 906						capnum = -1
 907					}
 908
 909					// check if we have bogus characters after the number
 910					if p.charsRight() > 0 && !(p.rightChar(0) == close || p.rightChar(0) == '-') {
 911						return nil, p.getErr(ErrInvalidGroupName)
 912					}
 913					if capnum == 0 {
 914						return nil, p.getErr(ErrCapNumNotZero)
 915					}
 916				} else if IsWordChar(ch) {
 917					capname := p.scanCapname()
 918
 919					if p.isCaptureName(capname) {
 920						capnum = p.captureSlotFromName(capname)
 921					}
 922
 923					// check if we have bogus character after the name
 924					if p.charsRight() > 0 && !(p.rightChar(0) == close || p.rightChar(0) == '-') {
 925						return nil, p.getErr(ErrInvalidGroupName)
 926					}
 927				} else if ch == '-' {
 928					proceed = true
 929				} else {
 930					// bad group name - starts with something other than a word character and isn't a number
 931					return nil, p.getErr(ErrInvalidGroupName)
 932				}
 933
 934				// grab part after - if any
 935
 936				if (capnum != -1 || proceed == true) && p.charsRight() > 0 && p.rightChar(0) == '-' {
 937					p.moveRight(1)
 938
 939					//no more chars left, no closing char, etc
 940					if p.charsRight() == 0 {
 941						return nil, p.getErr(ErrInvalidGroupName)
 942					}
 943
 944					ch = p.rightChar(0)
 945					if ch >= '0' && ch <= '9' {
 946						if uncapnum, err = p.scanDecimal(); err != nil {
 947							return nil, err
 948						}
 949
 950						if !p.isCaptureSlot(uncapnum) {
 951							return nil, p.getErr(ErrUndefinedBackRef, uncapnum)
 952						}
 953
 954						// check if we have bogus characters after the number
 955						if p.charsRight() > 0 && p.rightChar(0) != close {
 956							return nil, p.getErr(ErrInvalidGroupName)
 957						}
 958					} else if IsWordChar(ch) {
 959						uncapname := p.scanCapname()
 960
 961						if !p.isCaptureName(uncapname) {
 962							return nil, p.getErr(ErrUndefinedNameRef, uncapname)
 963						}
 964						uncapnum = p.captureSlotFromName(uncapname)
 965
 966						// check if we have bogus character after the name
 967						if p.charsRight() > 0 && p.rightChar(0) != close {
 968							return nil, p.getErr(ErrInvalidGroupName)
 969						}
 970					} else {
 971						// bad group name - starts with something other than a word character and isn't a number
 972						return nil, p.getErr(ErrInvalidGroupName)
 973					}
 974				}
 975
 976				// actually make the node
 977
 978				if (capnum != -1 || uncapnum != -1) && p.charsRight() > 0 && p.moveRightGetChar() == close {
 979					return newRegexNodeMN(ntCapture, p.options, capnum, uncapnum), nil
 980				}
 981				goto BreakRecognize
 982			}
 983
 984		case '(':
 985			// alternation construct (?(...) | )
 986
 987			parenPos := p.textpos()
 988			if p.charsRight() > 0 {
 989				ch = p.rightChar(0)
 990
 991				// check if the alternation condition is a backref
 992				if ch >= '0' && ch <= '9' {
 993					var capnum int
 994					if capnum, err = p.scanDecimal(); err != nil {
 995						return nil, err
 996					}
 997					if p.charsRight() > 0 && p.moveRightGetChar() == ')' {
 998						if p.isCaptureSlot(capnum) {
 999							return newRegexNodeM(ntTestref, p.options, capnum), nil
1000						}
1001						return nil, p.getErr(ErrUndefinedReference, capnum)
1002					}
1003
1004					return nil, p.getErr(ErrMalformedReference, capnum)
1005
1006				} else if IsWordChar(ch) {
1007					capname := p.scanCapname()
1008
1009					if p.isCaptureName(capname) && p.charsRight() > 0 && p.moveRightGetChar() == ')' {
1010						return newRegexNodeM(ntTestref, p.options, p.captureSlotFromName(capname)), nil
1011					}
1012				}
1013			}
1014			// not a backref
1015			nt = ntTestgroup
1016			p.textto(parenPos - 1)   // jump to the start of the parentheses
1017			p.ignoreNextParen = true // but make sure we don't try to capture the insides
1018
1019			charsRight := p.charsRight()
1020			if charsRight >= 3 && p.rightChar(1) == '?' {
1021				rightchar2 := p.rightChar(2)
1022				// disallow comments in the condition
1023				if rightchar2 == '#' {
1024					return nil, p.getErr(ErrAlternationCantHaveComment)
1025				}
1026
1027				// disallow named capture group (?<..>..) in the condition
1028				if rightchar2 == '\'' {
1029					return nil, p.getErr(ErrAlternationCantCapture)
1030				}
1031
1032				if charsRight >= 4 && (rightchar2 == '<' && p.rightChar(3) != '!' && p.rightChar(3) != '=') {
1033					return nil, p.getErr(ErrAlternationCantCapture)
1034				}
1035			}
1036
1037		case 'P':
1038			if p.useRE2() {
1039				// support for P<name> syntax
1040				if p.charsRight() < 3 {
1041					goto BreakRecognize
1042				}
1043
1044				ch = p.moveRightGetChar()
1045				if ch != '<' {
1046					goto BreakRecognize
1047				}
1048
1049				ch = p.moveRightGetChar()
1050				p.moveLeft()
1051
1052				if IsWordChar(ch) {
1053					capnum := -1
1054					capname := p.scanCapname()
1055
1056					if p.isCaptureName(capname) {
1057						capnum = p.captureSlotFromName(capname)
1058					}
1059
1060					// check if we have bogus character after the name
1061					if p.charsRight() > 0 && p.rightChar(0) != '>' {
1062						return nil, p.getErr(ErrInvalidGroupName)
1063					}
1064
1065					// actually make the node
1066
1067					if capnum != -1 && p.charsRight() > 0 && p.moveRightGetChar() == '>' {
1068						return newRegexNodeMN(ntCapture, p.options, capnum, -1), nil
1069					}
1070					goto BreakRecognize
1071
1072				} else {
1073					// bad group name - starts with something other than a word character and isn't a number
1074					return nil, p.getErr(ErrInvalidGroupName)
1075				}
1076			}
1077			// if we're not using RE2 compat mode then
1078			// we just behave like normal
1079			fallthrough
1080
1081		default:
1082			p.moveLeft()
1083
1084			nt = ntGroup
1085			// disallow options in the children of a testgroup node
1086			if p.group.t != ntTestgroup {
1087				p.scanOptions()
1088			}
1089			if p.charsRight() == 0 {
1090				goto BreakRecognize
1091			}
1092
1093			if ch = p.moveRightGetChar(); ch == ')' {
1094				return nil, nil
1095			}
1096
1097			if ch != ':' {
1098				goto BreakRecognize
1099			}
1100
1101		}
1102
1103		return newRegexNode(nt, p.options), nil
1104	}
1105
1106BreakRecognize:
1107
1108	// break Recognize comes here
1109
1110	return nil, p.getErr(ErrUnrecognizedGrouping, string(p.pattern[start:p.textpos()]))
1111}
1112
1113// scans backslash specials and basics
1114func (p *parser) scanBackslash(scanOnly bool) (*regexNode, error) {
1115
1116	if p.charsRight() == 0 {
1117		return nil, p.getErr(ErrIllegalEndEscape)
1118	}
1119
1120	switch ch := p.rightChar(0); ch {
1121	case 'b', 'B', 'A', 'G', 'Z', 'z':
1122		p.moveRight(1)
1123		return newRegexNode(p.typeFromCode(ch), p.options), nil
1124
1125	case 'w':
1126		p.moveRight(1)
1127		if p.useOptionE() || p.useRE2() {
1128			return newRegexNodeSet(ntSet, p.options, ECMAWordClass()), nil
1129		}
1130		return newRegexNodeSet(ntSet, p.options, WordClass()), nil
1131
1132	case 'W':
1133		p.moveRight(1)
1134		if p.useOptionE() || p.useRE2() {
1135			return newRegexNodeSet(ntSet, p.options, NotECMAWordClass()), nil
1136		}
1137		return newRegexNodeSet(ntSet, p.options, NotWordClass()), nil
1138
1139	case 's':
1140		p.moveRight(1)
1141		if p.useOptionE() {
1142			return newRegexNodeSet(ntSet, p.options, ECMASpaceClass()), nil
1143		} else if p.useRE2() {
1144			return newRegexNodeSet(ntSet, p.options, RE2SpaceClass()), nil
1145		}
1146		return newRegexNodeSet(ntSet, p.options, SpaceClass()), nil
1147
1148	case 'S':
1149		p.moveRight(1)
1150		if p.useOptionE() {
1151			return newRegexNodeSet(ntSet, p.options, NotECMASpaceClass()), nil
1152		} else if p.useRE2() {
1153			return newRegexNodeSet(ntSet, p.options, NotRE2SpaceClass()), nil
1154		}
1155		return newRegexNodeSet(ntSet, p.options, NotSpaceClass()), nil
1156
1157	case 'd':
1158		p.moveRight(1)
1159		if p.useOptionE() || p.useRE2() {
1160			return newRegexNodeSet(ntSet, p.options, ECMADigitClass()), nil
1161		}
1162		return newRegexNodeSet(ntSet, p.options, DigitClass()), nil
1163
1164	case 'D':
1165		p.moveRight(1)
1166		if p.useOptionE() || p.useRE2() {
1167			return newRegexNodeSet(ntSet, p.options, NotECMADigitClass()), nil
1168		}
1169		return newRegexNodeSet(ntSet, p.options, NotDigitClass()), nil
1170
1171	case 'p', 'P':
1172		p.moveRight(1)
1173		prop, err := p.parseProperty()
1174		if err != nil {
1175			return nil, err
1176		}
1177		cc := &CharSet{}
1178		cc.addCategory(prop, (ch != 'p'), p.useOptionI(), p.patternRaw)
1179		if p.useOptionI() {
1180			cc.addLowercase()
1181		}
1182
1183		return newRegexNodeSet(ntSet, p.options, cc), nil
1184
1185	default:
1186		return p.scanBasicBackslash(scanOnly)
1187	}
1188}
1189
1190// Scans \-style backreferences and character escapes
1191func (p *parser) scanBasicBackslash(scanOnly bool) (*regexNode, error) {
1192	if p.charsRight() == 0 {
1193		return nil, p.getErr(ErrIllegalEndEscape)
1194	}
1195	angled := false
1196	k := false
1197	close := '\x00'
1198
1199	backpos := p.textpos()
1200	ch := p.rightChar(0)
1201
1202	// Allow \k<foo> instead of \<foo>, which is now deprecated.
1203
1204	// According to ECMAScript specification, \k<name> is only parsed as a named group reference if
1205	// there is at least one group name in the regexp.
1206	// See https://www.ecma-international.org/ecma-262/#sec-isvalidregularexpressionliteral, step 7.
1207	// Note, during the first (scanOnly) run we may not have all group names scanned, but that's ok.
1208	if ch == 'k' && (!p.useOptionE() || len(p.capnames) > 0) {
1209		if p.charsRight() >= 2 {
1210			p.moveRight(1)
1211			ch = p.moveRightGetChar()
1212
1213			if ch == '<' || (!p.useOptionE() && ch == '\'') { // No support for \k'name' in ECMAScript
1214				angled = true
1215				if ch == '\'' {
1216					close = '\''
1217				} else {
1218					close = '>'
1219				}
1220			}
1221		}
1222
1223		if !angled || p.charsRight() <= 0 {
1224			return nil, p.getErr(ErrMalformedNameRef)
1225		}
1226
1227		ch = p.rightChar(0)
1228		k = true
1229
1230	} else if !p.useOptionE() && (ch == '<' || ch == '\'') && p.charsRight() > 1 { // Note angle without \g
1231		angled = true
1232		if ch == '\'' {
1233			close = '\''
1234		} else {
1235			close = '>'
1236		}
1237
1238		p.moveRight(1)
1239		ch = p.rightChar(0)
1240	}
1241
1242	// Try to parse backreference: \<1> or \<cap>
1243
1244	if angled && ch >= '0' && ch <= '9' {
1245		capnum, err := p.scanDecimal()
1246		if err != nil {
1247			return nil, err
1248		}
1249
1250		if p.charsRight() > 0 && p.moveRightGetChar() == close {
1251			if p.isCaptureSlot(capnum) {
1252				return newRegexNodeM(ntRef, p.options, capnum), nil
1253			}
1254			return nil, p.getErr(ErrUndefinedBackRef, capnum)
1255		}
1256	} else if !angled && ch >= '1' && ch <= '9' { // Try to parse backreference or octal: \1
1257		capnum, err := p.scanDecimal()
1258		if err != nil {
1259			return nil, err
1260		}
1261
1262		if scanOnly {
1263			return nil, nil
1264		}
1265
1266		if p.isCaptureSlot(capnum) {
1267			return newRegexNodeM(ntRef, p.options, capnum), nil
1268		}
1269		if capnum <= 9 && !p.useOptionE() {
1270			return nil, p.getErr(ErrUndefinedBackRef, capnum)
1271		}
1272
1273	} else if angled {
1274		capname := p.scanCapname()
1275
1276		if capname != "" && p.charsRight() > 0 && p.moveRightGetChar() == close {
1277
1278			if scanOnly {
1279				return nil, nil
1280			}
1281
1282			if p.isCaptureName(capname) {
1283				return newRegexNodeM(ntRef, p.options, p.captureSlotFromName(capname)), nil
1284			}
1285			return nil, p.getErr(ErrUndefinedNameRef, capname)
1286		} else {
1287			if k {
1288				return nil, p.getErr(ErrMalformedNameRef)
1289			}
1290		}
1291	}
1292
1293	// Not backreference: must be char code
1294
1295	p.textto(backpos)
1296	ch, err := p.scanCharEscape()
1297	if err != nil {
1298		return nil, err
1299	}
1300
1301	if scanOnly {
1302		return nil, nil
1303	}
1304
1305	if p.useOptionI() {
1306		ch = unicode.ToLower(ch)
1307	}
1308
1309	return newRegexNodeCh(ntOne, p.options, ch), nil
1310}
1311
1312// Scans X for \p{X} or \P{X}
1313func (p *parser) parseProperty() (string, error) {
1314	if p.charsRight() < 3 {
1315		return "", p.getErr(ErrIncompleteSlashP)
1316	}
1317	ch := p.moveRightGetChar()
1318	if ch != '{' {
1319		return "", p.getErr(ErrMalformedSlashP)
1320	}
1321
1322	startpos := p.textpos()
1323	for p.charsRight() > 0 {
1324		ch = p.moveRightGetChar()
1325		if !(IsWordChar(ch) || ch == '-') {
1326			p.moveLeft()
1327			break
1328		}
1329	}
1330	capname := string(p.pattern[startpos:p.textpos()])
1331
1332	if p.charsRight() == 0 || p.moveRightGetChar() != '}' {
1333		return "", p.getErr(ErrIncompleteSlashP)
1334	}
1335
1336	if !isValidUnicodeCat(capname) {
1337		return "", p.getErr(ErrUnknownSlashP, capname)
1338	}
1339
1340	return capname, nil
1341}
1342
1343// Returns ReNode type for zero-length assertions with a \ code.
1344func (p *parser) typeFromCode(ch rune) nodeType {
1345	switch ch {
1346	case 'b':
1347		if p.useOptionE() {
1348			return ntECMABoundary
1349		}
1350		return ntBoundary
1351	case 'B':
1352		if p.useOptionE() {
1353			return ntNonECMABoundary
1354		}
1355		return ntNonboundary
1356	case 'A':
1357		return ntBeginning
1358	case 'G':
1359		return ntStart
1360	case 'Z':
1361		return ntEndZ
1362	case 'z':
1363		return ntEnd
1364	default:
1365		return ntNothing
1366	}
1367}
1368
1369// Scans whitespace or x-mode comments.
1370func (p *parser) scanBlank() error {
1371	if p.useOptionX() {
1372		for {
1373			for p.charsRight() > 0 && isSpace(p.rightChar(0)) {
1374				p.moveRight(1)
1375			}
1376
1377			if p.charsRight() == 0 {
1378				break
1379			}
1380
1381			if p.rightChar(0) == '#' {
1382				for p.charsRight() > 0 && p.rightChar(0) != '\n' {
1383					p.moveRight(1)
1384				}
1385			} else if p.charsRight() >= 3 && p.rightChar(2) == '#' &&
1386				p.rightChar(1) == '?' && p.rightChar(0) == '(' {
1387				for p.charsRight() > 0 && p.rightChar(0) != ')' {
1388					p.moveRight(1)
1389				}
1390				if p.charsRight() == 0 {
1391					return p.getErr(ErrUnterminatedComment)
1392				}
1393				p.moveRight(1)
1394			} else {
1395				break
1396			}
1397		}
1398	} else {
1399		for {
1400			if p.charsRight() < 3 || p.rightChar(2) != '#' ||
1401				p.rightChar(1) != '?' || p.rightChar(0) != '(' {
1402				return nil
1403			}
1404
1405			for p.charsRight() > 0 && p.rightChar(0) != ')' {
1406				p.moveRight(1)
1407			}
1408			if p.charsRight() == 0 {
1409				return p.getErr(ErrUnterminatedComment)
1410			}
1411			p.moveRight(1)
1412		}
1413	}
1414	return nil
1415}
1416
1417func (p *parser) scanCapname() string {
1418	startpos := p.textpos()
1419
1420	for p.charsRight() > 0 {
1421		if !IsWordChar(p.moveRightGetChar()) {
1422			p.moveLeft()
1423			break
1424		}
1425	}
1426
1427	return string(p.pattern[startpos:p.textpos()])
1428}
1429
1430//Scans contents of [] (not including []'s), and converts to a set.
1431func (p *parser) scanCharSet(caseInsensitive, scanOnly bool) (*CharSet, error) {
1432	ch := '\x00'
1433	chPrev := '\x00'
1434	inRange := false
1435	firstChar := true
1436	closed := false
1437
1438	var cc *CharSet
1439	if !scanOnly {
1440		cc = &CharSet{}
1441	}
1442
1443	if p.charsRight() > 0 && p.rightChar(0) == '^' {
1444		p.moveRight(1)
1445		if !scanOnly {
1446			cc.negate = true
1447		}
1448	}
1449
1450	for ; p.charsRight() > 0; firstChar = false {
1451		fTranslatedChar := false
1452		ch = p.moveRightGetChar()
1453		if ch == ']' {
1454			if !firstChar {
1455				closed = true
1456				break
1457			} else if p.useOptionE() {
1458				if !scanOnly {
1459					cc.addRanges(NoneClass().ranges)
1460				}
1461				closed = true
1462				break
1463			}
1464
1465		} else if ch == '\\' && p.charsRight() > 0 {
1466			switch ch = p.moveRightGetChar(); ch {
1467			case 'D', 'd':
1468				if !scanOnly {
1469					if inRange {
1470						return nil, p.getErr(ErrBadClassInCharRange, ch)
1471					}
1472					cc.addDigit(p.useOptionE() || p.useRE2(), ch == 'D', p.patternRaw)
1473				}
1474				continue
1475
1476			case 'S', 's':
1477				if !scanOnly {
1478					if inRange {
1479						return nil, p.getErr(ErrBadClassInCharRange, ch)
1480					}
1481					cc.addSpace(p.useOptionE(), p.useRE2(), ch == 'S')
1482				}
1483				continue
1484
1485			case 'W', 'w':
1486				if !scanOnly {
1487					if inRange {
1488						return nil, p.getErr(ErrBadClassInCharRange, ch)
1489					}
1490
1491					cc.addWord(p.useOptionE() || p.useRE2(), ch == 'W')
1492				}
1493				continue
1494
1495			case 'p', 'P':
1496				if !scanOnly {
1497					if inRange {
1498						return nil, p.getErr(ErrBadClassInCharRange, ch)
1499					}
1500					prop, err := p.parseProperty()
1501					if err != nil {
1502						return nil, err
1503					}
1504					cc.addCategory(prop, (ch != 'p'), caseInsensitive, p.patternRaw)
1505				} else {
1506					p.parseProperty()
1507				}
1508
1509				continue
1510
1511			case '-':
1512				if !scanOnly {
1513					cc.addRange(ch, ch)
1514				}
1515				continue
1516
1517			default:
1518				p.moveLeft()
1519				var err error
1520				ch, err = p.scanCharEscape() // non-literal character
1521				if err != nil {
1522					return nil, err
1523				}
1524				fTranslatedChar = true
1525				break // this break will only break out of the switch
1526			}
1527		} else if ch == '[' {
1528			// This is code for Posix style properties - [:Ll:] or [:IsTibetan:].
1529			// It currently doesn't do anything other than skip the whole thing!
1530			if p.charsRight() > 0 && p.rightChar(0) == ':' && !inRange {
1531				savePos := p.textpos()
1532
1533				p.moveRight(1)
1534				negate := false
1535				if p.charsRight() > 1 && p.rightChar(0) == '^' {
1536					negate = true
1537					p.moveRight(1)
1538				}
1539
1540				nm := p.scanCapname() // snag the name
1541				if !scanOnly && p.useRE2() {
1542					// look up the name since these are valid for RE2
1543					// add the group based on the name
1544					if ok := cc.addNamedASCII(nm, negate); !ok {
1545						return nil, p.getErr(ErrInvalidCharRange)
1546					}
1547				}
1548				if p.charsRight() < 2 || p.moveRightGetChar() != ':' || p.moveRightGetChar() != ']' {
1549					p.textto(savePos)
1550				} else if p.useRE2() {
1551					// move on
1552					continue
1553				}
1554			}
1555		}
1556
1557		if inRange {
1558			inRange = false
1559			if !scanOnly {
1560				if ch == '[' && !fTranslatedChar && !firstChar {
1561					// We thought we were in a range, but we're actually starting a subtraction.
1562					// In that case, we'll add chPrev to our char class, skip the opening [, and
1563					// scan the new character class recursively.
1564					cc.addChar(chPrev)
1565					sub, err := p.scanCharSet(caseInsensitive, false)
1566					if err != nil {
1567						return nil, err
1568					}
1569					cc.addSubtraction(sub)
1570
1571					if p.charsRight() > 0 && p.rightChar(0) != ']' {
1572						return nil, p.getErr(ErrSubtractionMustBeLast)
1573					}
1574				} else {
1575					// a regular range, like a-z
1576					if chPrev > ch {
1577						return nil, p.getErr(ErrReversedCharRange, chPrev, ch)
1578					}
1579					cc.addRange(chPrev, ch)
1580				}
1581			}
1582		} else if p.charsRight() >= 2 && p.rightChar(0) == '-' && p.rightChar(1) != ']' {
1583			// this could be the start of a range
1584			chPrev = ch
1585			inRange = true
1586			p.moveRight(1)
1587		} else if p.charsRight() >= 1 && ch == '-' && !fTranslatedChar && p.rightChar(0) == '[' && !firstChar {
1588			// we aren't in a range, and now there is a subtraction.  Usually this happens
1589			// only when a subtraction follows a range, like [a-z-[b]]
1590			if !scanOnly {
1591				p.moveRight(1)
1592				sub, err := p.scanCharSet(caseInsensitive, false)
1593				if err != nil {
1594					return nil, err
1595				}
1596				cc.addSubtraction(sub)
1597
1598				if p.charsRight() > 0 && p.rightChar(0) != ']' {
1599					return nil, p.getErr(ErrSubtractionMustBeLast)
1600				}
1601			} else {
1602				p.moveRight(1)
1603				p.scanCharSet(caseInsensitive, true)
1604			}
1605		} else {
1606			if !scanOnly {
1607				cc.addRange(ch, ch)
1608			}
1609		}
1610	}
1611
1612	if !closed {
1613		return nil, p.getErr(ErrUnterminatedBracket)
1614	}
1615
1616	if !scanOnly && caseInsensitive {
1617		cc.addLowercase()
1618	}
1619
1620	return cc, nil
1621}
1622
1623// Scans any number of decimal digits (pegs value at 2^31-1 if too large)
1624func (p *parser) scanDecimal() (int, error) {
1625	i := 0
1626	var d int
1627
1628	for p.charsRight() > 0 {
1629		d = int(p.rightChar(0) - '0')
1630		if d < 0 || d > 9 {
1631			break
1632		}
1633		p.moveRight(1)
1634
1635		if i > maxValueDiv10 || (i == maxValueDiv10 && d > maxValueMod10) {
1636			return 0, p.getErr(ErrCaptureGroupOutOfRange)
1637		}
1638
1639		i *= 10
1640		i += d
1641	}
1642
1643	return int(i), nil
1644}
1645
1646// Returns true for options allowed only at the top level
1647func isOnlyTopOption(option RegexOptions) bool {
1648	return option == RightToLeft || option == ECMAScript || option == RE2
1649}
1650
1651// Scans cimsx-cimsx option string, stops at the first unrecognized char.
1652func (p *parser) scanOptions() {
1653
1654	for off := false; p.charsRight() > 0; p.moveRight(1) {
1655		ch := p.rightChar(0)
1656
1657		if ch == '-' {
1658			off = true
1659		} else if ch == '+' {
1660			off = false
1661		} else {
1662			option := optionFromCode(ch)
1663			if option == 0 || isOnlyTopOption(option) {
1664				return
1665			}
1666
1667			if off {
1668				p.options &= ^option
1669			} else {
1670				p.options |= option
1671			}
1672		}
1673	}
1674}
1675
1676// Scans \ code for escape codes that map to single unicode chars.
1677func (p *parser) scanCharEscape() (r rune, err error) {
1678
1679	ch := p.moveRightGetChar()
1680
1681	if ch >= '0' && ch <= '7' {
1682		p.moveLeft()
1683		return p.scanOctal(), nil
1684	}
1685
1686	pos := p.textpos()
1687
1688	switch ch {
1689	case 'x':
1690		// support for \x{HEX} syntax from Perl and PCRE
1691		if p.charsRight() > 0 && p.rightChar(0) == '{' {
1692			if p.useOptionE() {
1693				return ch, nil
1694			}
1695			p.moveRight(1)
1696			return p.scanHexUntilBrace()
1697		} else {
1698			r, err = p.scanHex(2)
1699		}
1700	case 'u':
1701		// ECMAscript suppot \u{HEX} only if `u` is also set
1702		if p.useOptionE() && p.useOptionU() && p.charsRight() > 0 && p.rightChar(0) == '{' {
1703			p.moveRight(1)
1704			return p.scanHexUntilBrace()
1705		} else {
1706			r, err = p.scanHex(4)
1707		}
1708	case 'a':
1709		return '\u0007', nil
1710	case 'b':
1711		return '\b', nil
1712	case 'e':
1713		return '\u001B', nil
1714	case 'f':
1715		return '\f', nil
1716	case 'n':
1717		return '\n', nil
1718	case 'r':
1719		return '\r', nil
1720	case 't':
1721		return '\t', nil
1722	case 'v':
1723		return '\u000B', nil
1724	case 'c':
1725		r, err = p.scanControl()
1726	default:
1727		if !p.useOptionE() && !p.useRE2() && IsWordChar(ch) {
1728			return 0, p.getErr(ErrUnrecognizedEscape, string(ch))
1729		}
1730		return ch, nil
1731	}
1732	if err != nil && p.useOptionE() {
1733		p.textto(pos)
1734		return ch, nil
1735	}
1736	return
1737}
1738
1739// Grabs and converts an ascii control character
1740func (p *parser) scanControl() (rune, error) {
1741	if p.charsRight() <= 0 {
1742		return 0, p.getErr(ErrMissingControl)
1743	}
1744
1745	ch := p.moveRightGetChar()
1746
1747	// \ca interpreted as \cA
1748
1749	if ch >= 'a' && ch <= 'z' {
1750		ch = (ch - ('a' - 'A'))
1751	}
1752	ch = (ch - '@')
1753	if ch >= 0 && ch < ' ' {
1754		return ch, nil
1755	}
1756
1757	return 0, p.getErr(ErrUnrecognizedControl)
1758
1759}
1760
1761// Scan hex digits until we hit a closing brace.
1762// Non-hex digits, hex value too large for UTF-8, or running out of chars are errors
1763func (p *parser) scanHexUntilBrace() (rune, error) {
1764	// PCRE spec reads like unlimited hex digits are allowed, but unicode has a limit
1765	// so we can enforce that
1766	i := 0
1767	hasContent := false
1768
1769	for p.charsRight() > 0 {
1770		ch := p.moveRightGetChar()
1771		if ch == '}' {
1772			// hit our close brace, we're done here
1773			// prevent \x{}
1774			if !hasContent {
1775				return 0, p.getErr(ErrTooFewHex)
1776			}
1777			return rune(i), nil
1778		}
1779		hasContent = true
1780		// no brace needs to be hex digit
1781		d := hexDigit(ch)
1782		if d < 0 {
1783			return 0, p.getErr(ErrMissingBrace)
1784		}
1785
1786		i *= 0x10
1787		i += d
1788
1789		if i > unicode.MaxRune {
1790			return 0, p.getErr(ErrInvalidHex)
1791		}
1792	}
1793
1794	// we only make it here if we run out of digits without finding the brace
1795	return 0, p.getErr(ErrMissingBrace)
1796}
1797
1798// Scans exactly c hex digits (c=2 for \xFF, c=4 for \uFFFF)
1799func (p *parser) scanHex(c int) (rune, error) {
1800
1801	i := 0
1802
1803	if p.charsRight() >= c {
1804		for c > 0 {
1805			d := hexDigit(p.moveRightGetChar())
1806			if d < 0 {
1807				break
1808			}
1809			i *= 0x10
1810			i += d
1811			c--
1812		}
1813	}
1814
1815	if c > 0 {
1816		return 0, p.getErr(ErrTooFewHex)
1817	}
1818
1819	return rune(i), nil
1820}
1821
1822// Returns n <= 0xF for a hex digit.
1823func hexDigit(ch rune) int {
1824
1825	if d := uint(ch - '0'); d <= 9 {
1826		return int(d)
1827	}
1828
1829	if d := uint(ch - 'a'); d <= 5 {
1830		return int(d + 0xa)
1831	}
1832
1833	if d := uint(ch - 'A'); d <= 5 {
1834		return int(d + 0xa)
1835	}
1836
1837	return -1
1838}
1839
1840// Scans up to three octal digits (stops before exceeding 0377).
1841func (p *parser) scanOctal() rune {
1842	// Consume octal chars only up to 3 digits and value 0377
1843
1844	c := 3
1845
1846	if c > p.charsRight() {
1847		c = p.charsRight()
1848	}
1849
1850	//we know the first char is good because the caller had to check
1851	i := 0
1852	d := int(p.rightChar(0) - '0')
1853	for c > 0 && d <= 7 && d >= 0 {
1854		if i >= 0x20 && p.useOptionE() {
1855			break
1856		}
1857		i *= 8
1858		i += d
1859		c--
1860
1861		p.moveRight(1)
1862		if !p.rightMost() {
1863			d = int(p.rightChar(0) - '0')
1864		}
1865	}
1866
1867	// Octal codes only go up to 255.  Any larger and the behavior that Perl follows
1868	// is simply to truncate the high bits.
1869	i &= 0xFF
1870
1871	return rune(i)
1872}
1873
1874// Returns the current parsing position.
1875func (p *parser) textpos() int {
1876	return p.currentPos
1877}
1878
1879// Zaps to a specific parsing position.
1880func (p *parser) textto(pos int) {
1881	p.currentPos = pos
1882}
1883
1884// Returns the char at the right of the current parsing position and advances to the right.
1885func (p *parser) moveRightGetChar() rune {
1886	ch := p.pattern[p.currentPos]
1887	p.currentPos++
1888	return ch
1889}
1890
1891// Moves the current position to the right.
1892func (p *parser) moveRight(i int) {
1893	// default would be 1
1894	p.currentPos += i
1895}
1896
1897// Moves the current parsing position one to the left.
1898func (p *parser) moveLeft() {
1899	p.currentPos--
1900}
1901
1902// Returns the char left of the current parsing position.
1903func (p *parser) charAt(i int) rune {
1904	return p.pattern[i]
1905}
1906
1907// Returns the char i chars right of the current parsing position.
1908func (p *parser) rightChar(i int) rune {
1909	// default would be 0
1910	return p.pattern[p.currentPos+i]
1911}
1912
1913// Number of characters to the right of the current parsing position.
1914func (p *parser) charsRight() int {
1915	return len(p.pattern) - p.currentPos
1916}
1917
1918func (p *parser) rightMost() bool {
1919	return p.currentPos == len(p.pattern)
1920}
1921
1922// Looks up the slot number for a given name
1923func (p *parser) captureSlotFromName(capname string) int {
1924	return p.capnames[capname]
1925}
1926
1927// True if the capture slot was noted
1928func (p *parser) isCaptureSlot(i int) bool {
1929	if p.caps != nil {
1930		_, ok := p.caps[i]
1931		return ok
1932	}
1933
1934	return (i >= 0 && i < p.capsize)
1935}
1936
1937// Looks up the slot number for a given name
1938func (p *parser) isCaptureName(capname string) bool {
1939	if p.capnames == nil {
1940		return false
1941	}
1942
1943	_, ok := p.capnames[capname]
1944	return ok
1945}
1946
1947// option shortcuts
1948
1949// True if N option disabling '(' autocapture is on.
1950func (p *parser) useOptionN() bool {
1951	return (p.options & ExplicitCapture) != 0
1952}
1953
1954// True if I option enabling case-insensitivity is on.
1955func (p *parser) useOptionI() bool {
1956	return (p.options & IgnoreCase) != 0
1957}
1958
1959// True if M option altering meaning of $ and ^ is on.
1960func (p *parser) useOptionM() bool {
1961	return (p.options & Multiline) != 0
1962}
1963
1964// True if S option altering meaning of . is on.
1965func (p *parser) useOptionS() bool {
1966	return (p.options & Singleline) != 0
1967}
1968
1969// True if X option enabling whitespace/comment mode is on.
1970func (p *parser) useOptionX() bool {
1971	return (p.options & IgnorePatternWhitespace) != 0
1972}
1973
1974// True if E option enabling ECMAScript behavior on.
1975func (p *parser) useOptionE() bool {
1976	return (p.options & ECMAScript) != 0
1977}
1978
1979// true to use RE2 compatibility parsing behavior.
1980func (p *parser) useRE2() bool {
1981	return (p.options & RE2) != 0
1982}
1983
1984// True if U option enabling ECMAScript's Unicode behavior on.
1985func (p *parser) useOptionU() bool {
1986	return (p.options & Unicode) != 0
1987}
1988
1989// True if options stack is empty.
1990func (p *parser) emptyOptionsStack() bool {
1991	return len(p.optionsStack) == 0
1992}
1993
1994// Finish the current quantifiable (when a quantifier is not found or is not possible)
1995func (p *parser) addConcatenate() {
1996	// The first (| inside a Testgroup group goes directly to the group
1997	p.concatenation.addChild(p.unit)
1998	p.unit = nil
1999}
2000
2001// Finish the current quantifiable (when a quantifier is found)
2002func (p *parser) addConcatenate3(lazy bool, min, max int) {
2003	p.concatenation.addChild(p.unit.makeQuantifier(lazy, min, max))
2004	p.unit = nil
2005}
2006
2007// Sets the current unit to a single char node
2008func (p *parser) addUnitOne(ch rune) {
2009	if p.useOptionI() {
2010		ch = unicode.ToLower(ch)
2011	}
2012
2013	p.unit = newRegexNodeCh(ntOne, p.options, ch)
2014}
2015
2016// Sets the current unit to a single inverse-char node
2017func (p *parser) addUnitNotone(ch rune) {
2018	if p.useOptionI() {
2019		ch = unicode.ToLower(ch)
2020	}
2021
2022	p.unit = newRegexNodeCh(ntNotone, p.options, ch)
2023}
2024
2025// Sets the current unit to a single set node
2026func (p *parser) addUnitSet(set *CharSet) {
2027	p.unit = newRegexNodeSet(ntSet, p.options, set)
2028}
2029
2030// Sets the current unit to a subtree
2031func (p *parser) addUnitNode(node *regexNode) {
2032	p.unit = node
2033}
2034
2035// Sets the current unit to an assertion of the specified type
2036func (p *parser) addUnitType(t nodeType) {
2037	p.unit = newRegexNode(t, p.options)
2038}
2039
2040// Finish the current group (in response to a ')' or end)
2041func (p *parser) addGroup() error {
2042	if p.group.t == ntTestgroup || p.group.t == ntTestref {
2043		p.group.addChild(p.concatenation.reverseLeft())
2044		if (p.group.t == ntTestref && len(p.group.children) > 2) || len(p.group.children) > 3 {
2045			return p.getErr(ErrTooManyAlternates)
2046		}
2047	} else {
2048		p.alternation.addChild(p.concatenation.reverseLeft())
2049		p.group.addChild(p.alternation)
2050	}
2051
2052	p.unit = p.group
2053	return nil
2054}
2055
2056// Pops the option stack, but keeps the current options unchanged.
2057func (p *parser) popKeepOptions() {
2058	lastIdx := len(p.optionsStack) - 1
2059	p.optionsStack = p.optionsStack[:lastIdx]
2060}
2061
2062// Recalls options from the stack.
2063func (p *parser) popOptions() {
2064	lastIdx := len(p.optionsStack) - 1
2065	// get the last item on the stack and then remove it by reslicing
2066	p.options = p.optionsStack[lastIdx]
2067	p.optionsStack = p.optionsStack[:lastIdx]
2068}
2069
2070// Saves options on a stack.
2071func (p *parser) pushOptions() {
2072	p.optionsStack = append(p.optionsStack, p.options)
2073}
2074
2075// Add a string to the last concatenate.
2076func (p *parser) addToConcatenate(pos, cch int, isReplacement bool) {
2077	var node *regexNode
2078
2079	if cch == 0 {
2080		return
2081	}
2082
2083	if cch > 1 {
2084		str := make([]rune, cch)
2085		copy(str, p.pattern[pos:pos+cch])
2086
2087		if p.useOptionI() && !isReplacement {
2088			// We do the ToLower character by character for consistency.  With surrogate chars, doing
2089			// a ToLower on the entire string could actually change the surrogate pair.  This is more correct
2090			// linguistically, but since Regex doesn't support surrogates, it's more important to be
2091			// consistent.
2092			for i := 0; i < len(str); i++ {
2093				str[i] = unicode.ToLower(str[i])
2094			}
2095		}
2096
2097		node = newRegexNodeStr(ntMulti, p.options, str)
2098	} else {
2099		ch := p.charAt(pos)
2100
2101		if p.useOptionI() && !isReplacement {
2102			ch = unicode.ToLower(ch)
2103		}
2104
2105		node = newRegexNodeCh(ntOne, p.options, ch)
2106	}
2107
2108	p.concatenation.addChild(node)
2109}
2110
2111// Push the parser state (in response to an open paren)
2112func (p *parser) pushGroup() {
2113	p.group.next = p.stack
2114	p.alternation.next = p.group
2115	p.concatenation.next = p.alternation
2116	p.stack = p.concatenation
2117}
2118
2119// Remember the pushed state (in response to a ')')
2120func (p *parser) popGroup() error {
2121	p.concatenation = p.stack
2122	p.alternation = p.concatenation.next
2123	p.group = p.alternation.next
2124	p.stack = p.group.next
2125
2126	// The first () inside a Testgroup group goes directly to the group
2127	if p.group.t == ntTestgroup && len(p.group.children) == 0 {
2128		if p.unit == nil {
2129			return p.getErr(ErrConditionalExpression)
2130		}
2131
2132		p.group.addChild(p.unit)
2133		p.unit = nil
2134	}
2135	return nil
2136}
2137
2138// True if the group stack is empty.
2139func (p *parser) emptyStack() bool {
2140	return p.stack == nil
2141}
2142
2143// Start a new round for the parser state (in response to an open paren or string start)
2144func (p *parser) startGroup(openGroup *regexNode) {
2145	p.group = openGroup
2146	p.alternation = newRegexNode(ntAlternate, p.options)
2147	p.concatenation = newRegexNode(ntConcatenate, p.options)
2148}
2149
2150// Finish the current concatenation (in response to a |)
2151func (p *parser) addAlternate() {
2152	// The | parts inside a Testgroup group go directly to the group
2153
2154	if p.group.t == ntTestgroup || p.group.t == ntTestref {
2155		p.group.addChild(p.concatenation.reverseLeft())
2156	} else {
2157		p.alternation.addChild(p.concatenation.reverseLeft())
2158	}
2159
2160	p.concatenation = newRegexNode(ntConcatenate, p.options)
2161}
2162
2163// For categorizing ascii characters.
2164
2165const (
2166	Q byte = 5 // quantifier
2167	S      = 4 // ordinary stopper
2168	Z      = 3 // ScanBlank stopper
2169	X      = 2 // whitespace
2170	E      = 1 // should be escaped
2171)
2172
2173var _category = []byte{
2174	//01  2  3  4  5  6  7  8  9  A  B  C  D  E  F  0  1  2  3  4  5  6  7  8  9  A  B  C  D  E  F
2175	0, 0, 0, 0, 0, 0, 0, 0, 0, X, X, X, X, X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
2176	// !  "  #  $  %  &  '  (  )  *  +  ,  -  .  /  0  1  2  3  4  5  6  7  8  9  :  ;  <  =  >  ?
2177	X, 0, 0, Z, S, 0, 0, 0, S, S, Q, Q, 0, 0, S, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, Q,
2178	//@A  B  C  D  E  F  G  H  I  J  K  L  M  N  O  P  Q  R  S  T  U  V  W  X  Y  Z  [  \  ]  ^  _
2179	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, S, S, 0, S, 0,
2180	//'a  b  c  d  e  f  g  h  i  j  k  l  m  n  o  p  q  r  s  t  u  v  w  x  y  z  {  |  }  ~
2181	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, Q, S, 0, 0, 0,
2182}
2183
2184func isSpace(ch rune) bool {
2185	return (ch <= ' ' && _category[ch] == X)
2186}
2187
2188// Returns true for those characters that terminate a string of ordinary chars.
2189func isSpecial(ch rune) bool {
2190	return (ch <= '|' && _category[ch] >= S)
2191}
2192
2193// Returns true for those characters that terminate a string of ordinary chars.
2194func isStopperX(ch rune) bool {
2195	return (ch <= '|' && _category[ch] >= X)
2196}
2197
2198// Returns true for those characters that begin a quantifier.
2199func isQuantifier(ch rune) bool {
2200	return (ch <= '{' && _category[ch] >= Q)
2201}
2202
2203func (p *parser) isTrueQuantifier() bool {
2204	nChars := p.charsRight()
2205	if nChars == 0 {
2206		return false
2207	}
2208
2209	startpos := p.textpos()
2210	ch := p.charAt(startpos)
2211	if ch != '{' {
2212		return ch <= '{' && _category[ch] >= Q
2213	}
2214
2215	//UGLY: this is ugly -- the original code was ugly too
2216	pos := startpos
2217	for {
2218		nChars--
2219		if nChars <= 0 {
2220			break
2221		}
2222		pos++
2223		ch = p.charAt(pos)
2224		if ch < '0' || ch > '9' {
2225			break
2226		}
2227	}
2228
2229	if nChars == 0 || pos-startpos == 1 {
2230		return false
2231	}
2232	if ch == '}' {
2233		return true
2234	}
2235	if ch != ',' {
2236		return false
2237	}
2238	for {
2239		nChars--
2240		if nChars <= 0 {
2241			break
2242		}
2243		pos++
2244		ch = p.charAt(pos)
2245		if ch < '0' || ch > '9' {
2246			break
2247		}
2248	}
2249
2250	return nChars > 0 && ch == '}'
2251}