summaryrefslogtreecommitdiff
path: root/vendor/github.com/dlclark/regexp2/syntax/parser.go
diff options
context:
space:
mode:
Diffstat (limited to 'vendor/github.com/dlclark/regexp2/syntax/parser.go')
-rw-r--r--vendor/github.com/dlclark/regexp2/syntax/parser.go2251
1 files changed, 2251 insertions, 0 deletions
diff --git a/vendor/github.com/dlclark/regexp2/syntax/parser.go b/vendor/github.com/dlclark/regexp2/syntax/parser.go
new file mode 100644
index 0000000..9dc6e31
--- /dev/null
+++ b/vendor/github.com/dlclark/regexp2/syntax/parser.go
@@ -0,0 +1,2251 @@
+package syntax
+
+import (
+ "fmt"
+ "math"
+ "os"
+ "sort"
+ "strconv"
+ "unicode"
+)
+
+type RegexOptions int32
+
+const (
+ IgnoreCase RegexOptions = 0x0001 // "i"
+ Multiline = 0x0002 // "m"
+ ExplicitCapture = 0x0004 // "n"
+ Compiled = 0x0008 // "c"
+ Singleline = 0x0010 // "s"
+ IgnorePatternWhitespace = 0x0020 // "x"
+ RightToLeft = 0x0040 // "r"
+ Debug = 0x0080 // "d"
+ ECMAScript = 0x0100 // "e"
+ RE2 = 0x0200 // RE2 compat mode
+ Unicode = 0x0400 // "u"
+)
+
+func optionFromCode(ch rune) RegexOptions {
+ // case-insensitive
+ switch ch {
+ case 'i', 'I':
+ return IgnoreCase
+ case 'r', 'R':
+ return RightToLeft
+ case 'm', 'M':
+ return Multiline
+ case 'n', 'N':
+ return ExplicitCapture
+ case 's', 'S':
+ return Singleline
+ case 'x', 'X':
+ return IgnorePatternWhitespace
+ case 'd', 'D':
+ return Debug
+ case 'e', 'E':
+ return ECMAScript
+ case 'u', 'U':
+ return Unicode
+ default:
+ return 0
+ }
+}
+
+// An Error describes a failure to parse a regular expression
+// and gives the offending expression.
+type Error struct {
+ Code ErrorCode
+ Expr string
+ Args []interface{}
+}
+
+func (e *Error) Error() string {
+ if len(e.Args) == 0 {
+ return "error parsing regexp: " + e.Code.String() + " in `" + e.Expr + "`"
+ }
+ return "error parsing regexp: " + fmt.Sprintf(e.Code.String(), e.Args...) + " in `" + e.Expr + "`"
+}
+
+// An ErrorCode describes a failure to parse a regular expression.
+type ErrorCode string
+
+const (
+ // internal issue
+ ErrInternalError ErrorCode = "regexp/syntax: internal error"
+ // Parser errors
+ ErrUnterminatedComment = "unterminated comment"
+ ErrInvalidCharRange = "invalid character class range"
+ ErrInvalidRepeatSize = "invalid repeat count"
+ ErrInvalidUTF8 = "invalid UTF-8"
+ ErrCaptureGroupOutOfRange = "capture group number out of range"
+ ErrUnexpectedParen = "unexpected )"
+ ErrMissingParen = "missing closing )"
+ ErrMissingBrace = "missing closing }"
+ ErrInvalidRepeatOp = "invalid nested repetition operator"
+ ErrMissingRepeatArgument = "missing argument to repetition operator"
+ ErrConditionalExpression = "illegal conditional (?(...)) expression"
+ ErrTooManyAlternates = "too many | in (?()|)"
+ ErrUnrecognizedGrouping = "unrecognized grouping construct: (%v"
+ ErrInvalidGroupName = "invalid group name: group names must begin with a word character and have a matching terminator"
+ ErrCapNumNotZero = "capture number cannot be zero"
+ ErrUndefinedBackRef = "reference to undefined group number %v"
+ ErrUndefinedNameRef = "reference to undefined group name %v"
+ ErrAlternationCantCapture = "alternation conditions do not capture and cannot be named"
+ ErrAlternationCantHaveComment = "alternation conditions cannot be comments"
+ ErrMalformedReference = "(?(%v) ) malformed"
+ ErrUndefinedReference = "(?(%v) ) reference to undefined group"
+ ErrIllegalEndEscape = "illegal \\ at end of pattern"
+ ErrMalformedSlashP = "malformed \\p{X} character escape"
+ ErrIncompleteSlashP = "incomplete \\p{X} character escape"
+ ErrUnknownSlashP = "unknown unicode category, script, or property '%v'"
+ ErrUnrecognizedEscape = "unrecognized escape sequence \\%v"
+ ErrMissingControl = "missing control character"
+ ErrUnrecognizedControl = "unrecognized control character"
+ ErrTooFewHex = "insufficient hexadecimal digits"
+ ErrInvalidHex = "hex values may not be larger than 0x10FFFF"
+ ErrMalformedNameRef = "malformed \\k<...> named back reference"
+ ErrBadClassInCharRange = "cannot include class \\%v in character range"
+ ErrUnterminatedBracket = "unterminated [] set"
+ ErrSubtractionMustBeLast = "a subtraction must be the last element in a character class"
+ ErrReversedCharRange = "[%c-%c] range in reverse order"
+)
+
+func (e ErrorCode) String() string {
+ return string(e)
+}
+
+type parser struct {
+ stack *regexNode
+ group *regexNode
+ alternation *regexNode
+ concatenation *regexNode
+ unit *regexNode
+
+ patternRaw string
+ pattern []rune
+
+ currentPos int
+ specialCase *unicode.SpecialCase
+
+ autocap int
+ capcount int
+ captop int
+ capsize int
+
+ caps map[int]int
+ capnames map[string]int
+
+ capnumlist []int
+ capnamelist []string
+
+ options RegexOptions
+ optionsStack []RegexOptions
+ ignoreNextParen bool
+}
+
+const (
+ maxValueDiv10 int = math.MaxInt32 / 10
+ maxValueMod10 = math.MaxInt32 % 10
+)
+
+// Parse converts a regex string into a parse tree
+func Parse(re string, op RegexOptions) (*RegexTree, error) {
+ p := parser{
+ options: op,
+ caps: make(map[int]int),
+ }
+ p.setPattern(re)
+
+ if err := p.countCaptures(); err != nil {
+ return nil, err
+ }
+
+ p.reset(op)
+ root, err := p.scanRegex()
+
+ if err != nil {
+ return nil, err
+ }
+ tree := &RegexTree{
+ root: root,
+ caps: p.caps,
+ capnumlist: p.capnumlist,
+ captop: p.captop,
+ Capnames: p.capnames,
+ Caplist: p.capnamelist,
+ options: op,
+ }
+
+ if tree.options&Debug > 0 {
+ os.Stdout.WriteString(tree.Dump())
+ }
+
+ return tree, nil
+}
+
+func (p *parser) setPattern(pattern string) {
+ p.patternRaw = pattern
+ p.pattern = make([]rune, 0, len(pattern))
+
+ //populate our rune array to handle utf8 encoding
+ for _, r := range pattern {
+ p.pattern = append(p.pattern, r)
+ }
+}
+func (p *parser) getErr(code ErrorCode, args ...interface{}) error {
+ return &Error{Code: code, Expr: p.patternRaw, Args: args}
+}
+
+func (p *parser) noteCaptureSlot(i, pos int) {
+ if _, ok := p.caps[i]; !ok {
+ // the rhs of the hashtable isn't used in the parser
+ p.caps[i] = pos
+ p.capcount++
+
+ if p.captop <= i {
+ if i == math.MaxInt32 {
+ p.captop = i
+ } else {
+ p.captop = i + 1
+ }
+ }
+ }
+}
+
+func (p *parser) noteCaptureName(name string, pos int) {
+ if p.capnames == nil {
+ p.capnames = make(map[string]int)
+ }
+
+ if _, ok := p.capnames[name]; !ok {
+ p.capnames[name] = pos
+ p.capnamelist = append(p.capnamelist, name)
+ }
+}
+
+func (p *parser) assignNameSlots() {
+ if p.capnames != nil {
+ for _, name := range p.capnamelist {
+ for p.isCaptureSlot(p.autocap) {
+ p.autocap++
+ }
+ pos := p.capnames[name]
+ p.capnames[name] = p.autocap
+ p.noteCaptureSlot(p.autocap, pos)
+
+ p.autocap++
+ }
+ }
+
+ // if the caps array has at least one gap, construct the list of used slots
+ if p.capcount < p.captop {
+ p.capnumlist = make([]int, p.capcount)
+ i := 0
+
+ for k := range p.caps {
+ p.capnumlist[i] = k
+ i++
+ }
+
+ sort.Ints(p.capnumlist)
+ }
+
+ // merge capsnumlist into capnamelist
+ if p.capnames != nil || p.capnumlist != nil {
+ var oldcapnamelist []string
+ var next int
+ var k int
+
+ if p.capnames == nil {
+ oldcapnamelist = nil
+ p.capnames = make(map[string]int)
+ p.capnamelist = []string{}
+ next = -1
+ } else {
+ oldcapnamelist = p.capnamelist
+ p.capnamelist = []string{}
+ next = p.capnames[oldcapnamelist[0]]
+ }
+
+ for i := 0; i < p.capcount; i++ {
+ j := i
+ if p.capnumlist != nil {
+ j = p.capnumlist[i]
+ }
+
+ if next == j {
+ p.capnamelist = append(p.capnamelist, oldcapnamelist[k])
+ k++
+
+ if k == len(oldcapnamelist) {
+ next = -1
+ } else {
+ next = p.capnames[oldcapnamelist[k]]
+ }
+
+ } else {
+ //feature: culture?
+ str := strconv.Itoa(j)
+ p.capnamelist = append(p.capnamelist, str)
+ p.capnames[str] = j
+ }
+ }
+ }
+}
+
+func (p *parser) consumeAutocap() int {
+ r := p.autocap
+ p.autocap++
+ return r
+}
+
+// CountCaptures is a prescanner for deducing the slots used for
+// captures by doing a partial tokenization of the pattern.
+func (p *parser) countCaptures() error {
+ var ch rune
+
+ p.noteCaptureSlot(0, 0)
+
+ p.autocap = 1
+
+ for p.charsRight() > 0 {
+ pos := p.textpos()
+ ch = p.moveRightGetChar()
+ switch ch {
+ case '\\':
+ if p.charsRight() > 0 {
+ p.scanBackslash(true)
+ }
+
+ case '#':
+ if p.useOptionX() {
+ p.moveLeft()
+ p.scanBlank()
+ }
+
+ case '[':
+ p.scanCharSet(false, true)
+
+ case ')':
+ if !p.emptyOptionsStack() {
+ p.popOptions()
+ }
+
+ case '(':
+ if p.charsRight() >= 2 && p.rightChar(1) == '#' && p.rightChar(0) == '?' {
+ p.moveLeft()
+ p.scanBlank()
+ } else {
+ p.pushOptions()
+ if p.charsRight() > 0 && p.rightChar(0) == '?' {
+ // we have (?...
+ p.moveRight(1)
+
+ if p.charsRight() > 1 && (p.rightChar(0) == '<' || p.rightChar(0) == '\'') {
+ // named group: (?<... or (?'...
+
+ p.moveRight(1)
+ ch = p.rightChar(0)
+
+ if ch != '0' && IsWordChar(ch) {
+ if ch >= '1' && ch <= '9' {
+ dec, err := p.scanDecimal()
+ if err != nil {
+ return err
+ }
+ p.noteCaptureSlot(dec, pos)
+ } else {
+ p.noteCaptureName(p.scanCapname(), pos)
+ }
+ }
+ } else if p.useRE2() && p.charsRight() > 2 && (p.rightChar(0) == 'P' && p.rightChar(1) == '<') {
+ // RE2-compat (?P<)
+ p.moveRight(2)
+ ch = p.rightChar(0)
+ if IsWordChar(ch) {
+ p.noteCaptureName(p.scanCapname(), pos)
+ }
+
+ } else {
+ // (?...
+
+ // get the options if it's an option construct (?cimsx-cimsx...)
+ p.scanOptions()
+
+ if p.charsRight() > 0 {
+ if p.rightChar(0) == ')' {
+ // (?cimsx-cimsx)
+ p.moveRight(1)
+ p.popKeepOptions()
+ } else if p.rightChar(0) == '(' {
+ // alternation construct: (?(foo)yes|no)
+ // ignore the next paren so we don't capture the condition
+ p.ignoreNextParen = true
+
+ // break from here so we don't reset ignoreNextParen
+ continue
+ }
+ }
+ }
+ } else {
+ if !p.useOptionN() && !p.ignoreNextParen {
+ p.noteCaptureSlot(p.consumeAutocap(), pos)
+ }
+ }
+ }
+
+ p.ignoreNextParen = false
+
+ }
+ }
+
+ p.assignNameSlots()
+ return nil
+}
+
+func (p *parser) reset(topopts RegexOptions) {
+ p.currentPos = 0
+ p.autocap = 1
+ p.ignoreNextParen = false
+
+ if len(p.optionsStack) > 0 {
+ p.optionsStack = p.optionsStack[:0]
+ }
+
+ p.options = topopts
+ p.stack = nil
+}
+
+func (p *parser) scanRegex() (*regexNode, error) {
+ ch := '@' // nonspecial ch, means at beginning
+ isQuant := false
+
+ p.startGroup(newRegexNodeMN(ntCapture, p.options, 0, -1))
+
+ for p.charsRight() > 0 {
+ wasPrevQuantifier := isQuant
+ isQuant = false
+
+ if err := p.scanBlank(); err != nil {
+ return nil, err
+ }
+
+ startpos := p.textpos()
+
+ // move past all of the normal characters. We'll stop when we hit some kind of control character,
+ // or if IgnorePatternWhiteSpace is on, we'll stop when we see some whitespace.
+ if p.useOptionX() {
+ for p.charsRight() > 0 {
+ ch = p.rightChar(0)
+ //UGLY: clean up, this is ugly
+ if !(!isStopperX(ch) || (ch == '{' && !p.isTrueQuantifier())) {
+ break
+ }
+ p.moveRight(1)
+ }
+ } else {
+ for p.charsRight() > 0 {
+ ch = p.rightChar(0)
+ if !(!isSpecial(ch) || ch == '{' && !p.isTrueQuantifier()) {
+ break
+ }
+ p.moveRight(1)
+ }
+ }
+
+ endpos := p.textpos()
+
+ p.scanBlank()
+
+ if p.charsRight() == 0 {
+ ch = '!' // nonspecial, means at end
+ } else if ch = p.rightChar(0); isSpecial(ch) {
+ isQuant = isQuantifier(ch)
+ p.moveRight(1)
+ } else {
+ ch = ' ' // nonspecial, means at ordinary char
+ }
+
+ if startpos < endpos {
+ cchUnquantified := endpos - startpos
+ if isQuant {
+ cchUnquantified--
+ }
+ wasPrevQuantifier = false
+
+ if cchUnquantified > 0 {
+ p.addToConcatenate(startpos, cchUnquantified, false)
+ }
+
+ if isQuant {
+ p.addUnitOne(p.charAt(endpos - 1))
+ }
+ }
+
+ switch ch {
+ case '!':
+ goto BreakOuterScan
+
+ case ' ':
+ goto ContinueOuterScan
+
+ case '[':
+ cc, err := p.scanCharSet(p.useOptionI(), false)
+ if err != nil {
+ return nil, err
+ }
+ p.addUnitSet(cc)
+
+ case '(':
+ p.pushOptions()
+
+ if grouper, err := p.scanGroupOpen(); err != nil {
+ return nil, err
+ } else if grouper == nil {
+ p.popKeepOptions()
+ } else {
+ p.pushGroup()
+ p.startGroup(grouper)
+ }
+
+ continue
+
+ case '|':
+ p.addAlternate()
+ goto ContinueOuterScan
+
+ case ')':
+ if p.emptyStack() {
+ return nil, p.getErr(ErrUnexpectedParen)
+ }
+
+ if err := p.addGroup(); err != nil {
+ return nil, err
+ }
+ if err := p.popGroup(); err != nil {
+ return nil, err
+ }
+ p.popOptions()
+
+ if p.unit == nil {
+ goto ContinueOuterScan
+ }
+
+ case '\\':
+ n, err := p.scanBackslash(false)
+ if err != nil {
+ return nil, err
+ }
+ p.addUnitNode(n)
+
+ case '^':
+ if p.useOptionM() {
+ p.addUnitType(ntBol)
+ } else {
+ p.addUnitType(ntBeginning)
+ }
+
+ case '$':
+ if p.useOptionM() {
+ p.addUnitType(ntEol)
+ } else {
+ p.addUnitType(ntEndZ)
+ }
+
+ case '.':
+ if p.useOptionE() {
+ p.addUnitSet(ECMAAnyClass())
+ } else if p.useOptionS() {
+ p.addUnitSet(AnyClass())
+ } else {
+ p.addUnitNotone('\n')
+ }
+
+ case '{', '*', '+', '?':
+ if p.unit == nil {
+ if wasPrevQuantifier {
+ return nil, p.getErr(ErrInvalidRepeatOp)
+ } else {
+ return nil, p.getErr(ErrMissingRepeatArgument)
+ }
+ }
+ p.moveLeft()
+
+ default:
+ return nil, p.getErr(ErrInternalError)
+ }
+
+ if err := p.scanBlank(); err != nil {
+ return nil, err
+ }
+
+ if p.charsRight() > 0 {
+ isQuant = p.isTrueQuantifier()
+ }
+ if p.charsRight() == 0 || !isQuant {
+ //maintain odd C# assignment order -- not sure if required, could clean up?
+ p.addConcatenate()
+ goto ContinueOuterScan
+ }
+
+ ch = p.moveRightGetChar()
+
+ // Handle quantifiers
+ for p.unit != nil {
+ var min, max int
+ var lazy bool
+
+ switch ch {
+ case '*':
+ min = 0
+ max = math.MaxInt32
+
+ case '?':
+ min = 0
+ max = 1
+
+ case '+':
+ min = 1
+ max = math.MaxInt32
+
+ case '{':
+ {
+ var err error
+ startpos = p.textpos()
+ if min, err = p.scanDecimal(); err != nil {
+ return nil, err
+ }
+ max = min
+ if startpos < p.textpos() {
+ if p.charsRight() > 0 && p.rightChar(0) == ',' {
+ p.moveRight(1)
+ if p.charsRight() == 0 || p.rightChar(0) == '}' {
+ max = math.MaxInt32
+ } else {
+ if max, err = p.scanDecimal(); err != nil {
+ return nil, err
+ }
+ }
+ }
+ }
+
+ if startpos == p.textpos() || p.charsRight() == 0 || p.moveRightGetChar() != '}' {
+ p.addConcatenate()
+ p.textto(startpos - 1)
+ goto ContinueOuterScan
+ }
+ }
+
+ default:
+ return nil, p.getErr(ErrInternalError)
+ }
+
+ if err := p.scanBlank(); err != nil {
+ return nil, err
+ }
+
+ if p.charsRight() == 0 || p.rightChar(0) != '?' {
+ lazy = false
+ } else {
+ p.moveRight(1)
+ lazy = true
+ }
+
+ if min > max {
+ return nil, p.getErr(ErrInvalidRepeatSize)
+ }
+
+ p.addConcatenate3(lazy, min, max)
+ }
+
+ ContinueOuterScan:
+ }
+
+BreakOuterScan:
+ ;
+
+ if !p.emptyStack() {
+ return nil, p.getErr(ErrMissingParen)
+ }
+
+ if err := p.addGroup(); err != nil {
+ return nil, err
+ }
+
+ return p.unit, nil
+
+}
+
+/*
+ * Simple parsing for replacement patterns
+ */
+func (p *parser) scanReplacement() (*regexNode, error) {
+ var c, startpos int
+
+ p.concatenation = newRegexNode(ntConcatenate, p.options)
+
+ for {
+ c = p.charsRight()
+ if c == 0 {
+ break
+ }
+
+ startpos = p.textpos()
+
+ for c > 0 && p.rightChar(0) != '$' {
+ p.moveRight(1)
+ c--
+ }
+
+ p.addToConcatenate(startpos, p.textpos()-startpos, true)
+
+ if c > 0 {
+ if p.moveRightGetChar() == '$' {
+ n, err := p.scanDollar()
+ if err != nil {
+ return nil, err
+ }
+ p.addUnitNode(n)
+ }
+ p.addConcatenate()
+ }
+ }
+
+ return p.concatenation, nil
+}
+
+/*
+ * Scans $ patterns recognized within replacement patterns
+ */
+func (p *parser) scanDollar() (*regexNode, error) {
+ if p.charsRight() == 0 {
+ return newRegexNodeCh(ntOne, p.options, '$'), nil
+ }
+
+ ch := p.rightChar(0)
+ angled := false
+ backpos := p.textpos()
+ lastEndPos := backpos
+
+ // Note angle
+
+ if ch == '{' && p.charsRight() > 1 {
+ angled = true
+ p.moveRight(1)
+ ch = p.rightChar(0)
+ }
+
+ // Try to parse backreference: \1 or \{1} or \{cap}
+
+ if ch >= '0' && ch <= '9' {
+ if !angled && p.useOptionE() {
+ capnum := -1
+ newcapnum := int(ch - '0')
+ p.moveRight(1)
+ if p.isCaptureSlot(newcapnum) {
+ capnum = newcapnum
+ lastEndPos = p.textpos()
+ }
+
+ for p.charsRight() > 0 {
+ ch = p.rightChar(0)
+ if ch < '0' || ch > '9' {
+ break
+ }
+ digit := int(ch - '0')
+ if newcapnum > maxValueDiv10 || (newcapnum == maxValueDiv10 && digit > maxValueMod10) {
+ return nil, p.getErr(ErrCaptureGroupOutOfRange)
+ }
+
+ newcapnum = newcapnum*10 + digit
+
+ p.moveRight(1)
+ if p.isCaptureSlot(newcapnum) {
+ capnum = newcapnum
+ lastEndPos = p.textpos()
+ }
+ }
+ p.textto(lastEndPos)
+ if capnum >= 0 {
+ return newRegexNodeM(ntRef, p.options, capnum), nil
+ }
+ } else {
+ capnum, err := p.scanDecimal()
+ if err != nil {
+ return nil, err
+ }
+ if !angled || p.charsRight() > 0 && p.moveRightGetChar() == '}' {
+ if p.isCaptureSlot(capnum) {
+ return newRegexNodeM(ntRef, p.options, capnum), nil
+ }
+ }
+ }
+ } else if angled && IsWordChar(ch) {
+ capname := p.scanCapname()
+
+ if p.charsRight() > 0 && p.moveRightGetChar() == '}' {
+ if p.isCaptureName(capname) {
+ return newRegexNodeM(ntRef, p.options, p.captureSlotFromName(capname)), nil
+ }
+ }
+ } else if !angled {
+ capnum := 1
+
+ switch ch {
+ case '$':
+ p.moveRight(1)
+ return newRegexNodeCh(ntOne, p.options, '$'), nil
+ case '&':
+ capnum = 0
+ case '`':
+ capnum = replaceLeftPortion
+ case '\'':
+ capnum = replaceRightPortion
+ case '+':
+ capnum = replaceLastGroup
+ case '_':
+ capnum = replaceWholeString
+ }
+
+ if capnum != 1 {
+ p.moveRight(1)
+ return newRegexNodeM(ntRef, p.options, capnum), nil
+ }
+ }
+
+ // unrecognized $: literalize
+
+ p.textto(backpos)
+ return newRegexNodeCh(ntOne, p.options, '$'), nil
+}
+
+// scanGroupOpen scans chars following a '(' (not counting the '('), and returns
+// a RegexNode for the type of group scanned, or nil if the group
+// simply changed options (?cimsx-cimsx) or was a comment (#...).
+func (p *parser) scanGroupOpen() (*regexNode, error) {
+ var ch rune
+ var nt nodeType
+ var err error
+ close := '>'
+ start := p.textpos()
+
+ // just return a RegexNode if we have:
+ // 1. "(" followed by nothing
+ // 2. "(x" where x != ?
+ // 3. "(?)"
+ if p.charsRight() == 0 || p.rightChar(0) != '?' || (p.rightChar(0) == '?' && (p.charsRight() > 1 && p.rightChar(1) == ')')) {
+ if p.useOptionN() || p.ignoreNextParen {
+ p.ignoreNextParen = false
+ return newRegexNode(ntGroup, p.options), nil
+ }
+ return newRegexNodeMN(ntCapture, p.options, p.consumeAutocap(), -1), nil
+ }
+
+ p.moveRight(1)
+
+ for {
+ if p.charsRight() == 0 {
+ break
+ }
+
+ switch ch = p.moveRightGetChar(); ch {
+ case ':':
+ nt = ntGroup
+
+ case '=':
+ p.options &= ^RightToLeft
+ nt = ntRequire
+
+ case '!':
+ p.options &= ^RightToLeft
+ nt = ntPrevent
+
+ case '>':
+ nt = ntGreedy
+
+ case '\'':
+ close = '\''
+ fallthrough
+
+ case '<':
+ if p.charsRight() == 0 {
+ goto BreakRecognize
+ }
+
+ switch ch = p.moveRightGetChar(); ch {
+ case '=':
+ if close == '\'' {
+ goto BreakRecognize
+ }
+
+ p.options |= RightToLeft
+ nt = ntRequire
+
+ case '!':
+ if close == '\'' {
+ goto BreakRecognize
+ }
+
+ p.options |= RightToLeft
+ nt = ntPrevent
+
+ default:
+ p.moveLeft()
+ capnum := -1
+ uncapnum := -1
+ proceed := false
+
+ // grab part before -
+
+ if ch >= '0' && ch <= '9' {
+ if capnum, err = p.scanDecimal(); err != nil {
+ return nil, err
+ }
+
+ if !p.isCaptureSlot(capnum) {
+ capnum = -1
+ }
+
+ // check if we have bogus characters after the number
+ if p.charsRight() > 0 && !(p.rightChar(0) == close || p.rightChar(0) == '-') {
+ return nil, p.getErr(ErrInvalidGroupName)
+ }
+ if capnum == 0 {
+ return nil, p.getErr(ErrCapNumNotZero)
+ }
+ } else if IsWordChar(ch) {
+ capname := p.scanCapname()
+
+ if p.isCaptureName(capname) {
+ capnum = p.captureSlotFromName(capname)
+ }
+
+ // check if we have bogus character after the name
+ if p.charsRight() > 0 && !(p.rightChar(0) == close || p.rightChar(0) == '-') {
+ return nil, p.getErr(ErrInvalidGroupName)
+ }
+ } else if ch == '-' {
+ proceed = true
+ } else {
+ // bad group name - starts with something other than a word character and isn't a number
+ return nil, p.getErr(ErrInvalidGroupName)
+ }
+
+ // grab part after - if any
+
+ if (capnum != -1 || proceed == true) && p.charsRight() > 0 && p.rightChar(0) == '-' {
+ p.moveRight(1)
+
+ //no more chars left, no closing char, etc
+ if p.charsRight() == 0 {
+ return nil, p.getErr(ErrInvalidGroupName)
+ }
+
+ ch = p.rightChar(0)
+ if ch >= '0' && ch <= '9' {
+ if uncapnum, err = p.scanDecimal(); err != nil {
+ return nil, err
+ }
+
+ if !p.isCaptureSlot(uncapnum) {
+ return nil, p.getErr(ErrUndefinedBackRef, uncapnum)
+ }
+
+ // check if we have bogus characters after the number
+ if p.charsRight() > 0 && p.rightChar(0) != close {
+ return nil, p.getErr(ErrInvalidGroupName)
+ }
+ } else if IsWordChar(ch) {
+ uncapname := p.scanCapname()
+
+ if !p.isCaptureName(uncapname) {
+ return nil, p.getErr(ErrUndefinedNameRef, uncapname)
+ }
+ uncapnum = p.captureSlotFromName(uncapname)
+
+ // check if we have bogus character after the name
+ if p.charsRight() > 0 && p.rightChar(0) != close {
+ return nil, p.getErr(ErrInvalidGroupName)
+ }
+ } else {
+ // bad group name - starts with something other than a word character and isn't a number
+ return nil, p.getErr(ErrInvalidGroupName)
+ }
+ }
+
+ // actually make the node
+
+ if (capnum != -1 || uncapnum != -1) && p.charsRight() > 0 && p.moveRightGetChar() == close {
+ return newRegexNodeMN(ntCapture, p.options, capnum, uncapnum), nil
+ }
+ goto BreakRecognize
+ }
+
+ case '(':
+ // alternation construct (?(...) | )
+
+ parenPos := p.textpos()
+ if p.charsRight() > 0 {
+ ch = p.rightChar(0)
+
+ // check if the alternation condition is a backref
+ if ch >= '0' && ch <= '9' {
+ var capnum int
+ if capnum, err = p.scanDecimal(); err != nil {
+ return nil, err
+ }
+ if p.charsRight() > 0 && p.moveRightGetChar() == ')' {
+ if p.isCaptureSlot(capnum) {
+ return newRegexNodeM(ntTestref, p.options, capnum), nil
+ }
+ return nil, p.getErr(ErrUndefinedReference, capnum)
+ }
+
+ return nil, p.getErr(ErrMalformedReference, capnum)
+
+ } else if IsWordChar(ch) {
+ capname := p.scanCapname()
+
+ if p.isCaptureName(capname) && p.charsRight() > 0 && p.moveRightGetChar() == ')' {
+ return newRegexNodeM(ntTestref, p.options, p.captureSlotFromName(capname)), nil
+ }
+ }
+ }
+ // not a backref
+ nt = ntTestgroup
+ p.textto(parenPos - 1) // jump to the start of the parentheses
+ p.ignoreNextParen = true // but make sure we don't try to capture the insides
+
+ charsRight := p.charsRight()
+ if charsRight >= 3 && p.rightChar(1) == '?' {
+ rightchar2 := p.rightChar(2)
+ // disallow comments in the condition
+ if rightchar2 == '#' {
+ return nil, p.getErr(ErrAlternationCantHaveComment)
+ }
+
+ // disallow named capture group (?<..>..) in the condition
+ if rightchar2 == '\'' {
+ return nil, p.getErr(ErrAlternationCantCapture)
+ }
+
+ if charsRight >= 4 && (rightchar2 == '<' && p.rightChar(3) != '!' && p.rightChar(3) != '=') {
+ return nil, p.getErr(ErrAlternationCantCapture)
+ }
+ }
+
+ case 'P':
+ if p.useRE2() {
+ // support for P<name> syntax
+ if p.charsRight() < 3 {
+ goto BreakRecognize
+ }
+
+ ch = p.moveRightGetChar()
+ if ch != '<' {
+ goto BreakRecognize
+ }
+
+ ch = p.moveRightGetChar()
+ p.moveLeft()
+
+ if IsWordChar(ch) {
+ capnum := -1
+ capname := p.scanCapname()
+
+ if p.isCaptureName(capname) {
+ capnum = p.captureSlotFromName(capname)
+ }
+
+ // check if we have bogus character after the name
+ if p.charsRight() > 0 && p.rightChar(0) != '>' {
+ return nil, p.getErr(ErrInvalidGroupName)
+ }
+
+ // actually make the node
+
+ if capnum != -1 && p.charsRight() > 0 && p.moveRightGetChar() == '>' {
+ return newRegexNodeMN(ntCapture, p.options, capnum, -1), nil
+ }
+ goto BreakRecognize
+
+ } else {
+ // bad group name - starts with something other than a word character and isn't a number
+ return nil, p.getErr(ErrInvalidGroupName)
+ }
+ }
+ // if we're not using RE2 compat mode then
+ // we just behave like normal
+ fallthrough
+
+ default:
+ p.moveLeft()
+
+ nt = ntGroup
+ // disallow options in the children of a testgroup node
+ if p.group.t != ntTestgroup {
+ p.scanOptions()
+ }
+ if p.charsRight() == 0 {
+ goto BreakRecognize
+ }
+
+ if ch = p.moveRightGetChar(); ch == ')' {
+ return nil, nil
+ }
+
+ if ch != ':' {
+ goto BreakRecognize
+ }
+
+ }
+
+ return newRegexNode(nt, p.options), nil
+ }
+
+BreakRecognize:
+
+ // break Recognize comes here
+
+ return nil, p.getErr(ErrUnrecognizedGrouping, string(p.pattern[start:p.textpos()]))
+}
+
+// scans backslash specials and basics
+func (p *parser) scanBackslash(scanOnly bool) (*regexNode, error) {
+
+ if p.charsRight() == 0 {
+ return nil, p.getErr(ErrIllegalEndEscape)
+ }
+
+ switch ch := p.rightChar(0); ch {
+ case 'b', 'B', 'A', 'G', 'Z', 'z':
+ p.moveRight(1)
+ return newRegexNode(p.typeFromCode(ch), p.options), nil
+
+ case 'w':
+ p.moveRight(1)
+ if p.useOptionE() || p.useRE2() {
+ return newRegexNodeSet(ntSet, p.options, ECMAWordClass()), nil
+ }
+ return newRegexNodeSet(ntSet, p.options, WordClass()), nil
+
+ case 'W':
+ p.moveRight(1)
+ if p.useOptionE() || p.useRE2() {
+ return newRegexNodeSet(ntSet, p.options, NotECMAWordClass()), nil
+ }
+ return newRegexNodeSet(ntSet, p.options, NotWordClass()), nil
+
+ case 's':
+ p.moveRight(1)
+ if p.useOptionE() {
+ return newRegexNodeSet(ntSet, p.options, ECMASpaceClass()), nil
+ } else if p.useRE2() {
+ return newRegexNodeSet(ntSet, p.options, RE2SpaceClass()), nil
+ }
+ return newRegexNodeSet(ntSet, p.options, SpaceClass()), nil
+
+ case 'S':
+ p.moveRight(1)
+ if p.useOptionE() {
+ return newRegexNodeSet(ntSet, p.options, NotECMASpaceClass()), nil
+ } else if p.useRE2() {
+ return newRegexNodeSet(ntSet, p.options, NotRE2SpaceClass()), nil
+ }
+ return newRegexNodeSet(ntSet, p.options, NotSpaceClass()), nil
+
+ case 'd':
+ p.moveRight(1)
+ if p.useOptionE() || p.useRE2() {
+ return newRegexNodeSet(ntSet, p.options, ECMADigitClass()), nil
+ }
+ return newRegexNodeSet(ntSet, p.options, DigitClass()), nil
+
+ case 'D':
+ p.moveRight(1)
+ if p.useOptionE() || p.useRE2() {
+ return newRegexNodeSet(ntSet, p.options, NotECMADigitClass()), nil
+ }
+ return newRegexNodeSet(ntSet, p.options, NotDigitClass()), nil
+
+ case 'p', 'P':
+ p.moveRight(1)
+ prop, err := p.parseProperty()
+ if err != nil {
+ return nil, err
+ }
+ cc := &CharSet{}
+ cc.addCategory(prop, (ch != 'p'), p.useOptionI(), p.patternRaw)
+ if p.useOptionI() {
+ cc.addLowercase()
+ }
+
+ return newRegexNodeSet(ntSet, p.options, cc), nil
+
+ default:
+ return p.scanBasicBackslash(scanOnly)
+ }
+}
+
+// Scans \-style backreferences and character escapes
+func (p *parser) scanBasicBackslash(scanOnly bool) (*regexNode, error) {
+ if p.charsRight() == 0 {
+ return nil, p.getErr(ErrIllegalEndEscape)
+ }
+ angled := false
+ k := false
+ close := '\x00'
+
+ backpos := p.textpos()
+ ch := p.rightChar(0)
+
+ // Allow \k<foo> instead of \<foo>, which is now deprecated.
+
+ // According to ECMAScript specification, \k<name> is only parsed as a named group reference if
+ // there is at least one group name in the regexp.
+ // See https://www.ecma-international.org/ecma-262/#sec-isvalidregularexpressionliteral, step 7.
+ // Note, during the first (scanOnly) run we may not have all group names scanned, but that's ok.
+ if ch == 'k' && (!p.useOptionE() || len(p.capnames) > 0) {
+ if p.charsRight() >= 2 {
+ p.moveRight(1)
+ ch = p.moveRightGetChar()
+
+ if ch == '<' || (!p.useOptionE() && ch == '\'') { // No support for \k'name' in ECMAScript
+ angled = true
+ if ch == '\'' {
+ close = '\''
+ } else {
+ close = '>'
+ }
+ }
+ }
+
+ if !angled || p.charsRight() <= 0 {
+ return nil, p.getErr(ErrMalformedNameRef)
+ }
+
+ ch = p.rightChar(0)
+ k = true
+
+ } else if !p.useOptionE() && (ch == '<' || ch == '\'') && p.charsRight() > 1 { // Note angle without \g
+ angled = true
+ if ch == '\'' {
+ close = '\''
+ } else {
+ close = '>'
+ }
+
+ p.moveRight(1)
+ ch = p.rightChar(0)
+ }
+
+ // Try to parse backreference: \<1> or \<cap>
+
+ if angled && ch >= '0' && ch <= '9' {
+ capnum, err := p.scanDecimal()
+ if err != nil {
+ return nil, err
+ }
+
+ if p.charsRight() > 0 && p.moveRightGetChar() == close {
+ if p.isCaptureSlot(capnum) {
+ return newRegexNodeM(ntRef, p.options, capnum), nil
+ }
+ return nil, p.getErr(ErrUndefinedBackRef, capnum)
+ }
+ } else if !angled && ch >= '1' && ch <= '9' { // Try to parse backreference or octal: \1
+ capnum, err := p.scanDecimal()
+ if err != nil {
+ return nil, err
+ }
+
+ if scanOnly {
+ return nil, nil
+ }
+
+ if p.isCaptureSlot(capnum) {
+ return newRegexNodeM(ntRef, p.options, capnum), nil
+ }
+ if capnum <= 9 && !p.useOptionE() {
+ return nil, p.getErr(ErrUndefinedBackRef, capnum)
+ }
+
+ } else if angled {
+ capname := p.scanCapname()
+
+ if capname != "" && p.charsRight() > 0 && p.moveRightGetChar() == close {
+
+ if scanOnly {
+ return nil, nil
+ }
+
+ if p.isCaptureName(capname) {
+ return newRegexNodeM(ntRef, p.options, p.captureSlotFromName(capname)), nil
+ }
+ return nil, p.getErr(ErrUndefinedNameRef, capname)
+ } else {
+ if k {
+ return nil, p.getErr(ErrMalformedNameRef)
+ }
+ }
+ }
+
+ // Not backreference: must be char code
+
+ p.textto(backpos)
+ ch, err := p.scanCharEscape()
+ if err != nil {
+ return nil, err
+ }
+
+ if scanOnly {
+ return nil, nil
+ }
+
+ if p.useOptionI() {
+ ch = unicode.ToLower(ch)
+ }
+
+ return newRegexNodeCh(ntOne, p.options, ch), nil
+}
+
+// Scans X for \p{X} or \P{X}
+func (p *parser) parseProperty() (string, error) {
+ if p.charsRight() < 3 {
+ return "", p.getErr(ErrIncompleteSlashP)
+ }
+ ch := p.moveRightGetChar()
+ if ch != '{' {
+ return "", p.getErr(ErrMalformedSlashP)
+ }
+
+ startpos := p.textpos()
+ for p.charsRight() > 0 {
+ ch = p.moveRightGetChar()
+ if !(IsWordChar(ch) || ch == '-') {
+ p.moveLeft()
+ break
+ }
+ }
+ capname := string(p.pattern[startpos:p.textpos()])
+
+ if p.charsRight() == 0 || p.moveRightGetChar() != '}' {
+ return "", p.getErr(ErrIncompleteSlashP)
+ }
+
+ if !isValidUnicodeCat(capname) {
+ return "", p.getErr(ErrUnknownSlashP, capname)
+ }
+
+ return capname, nil
+}
+
+// Returns ReNode type for zero-length assertions with a \ code.
+func (p *parser) typeFromCode(ch rune) nodeType {
+ switch ch {
+ case 'b':
+ if p.useOptionE() {
+ return ntECMABoundary
+ }
+ return ntBoundary
+ case 'B':
+ if p.useOptionE() {
+ return ntNonECMABoundary
+ }
+ return ntNonboundary
+ case 'A':
+ return ntBeginning
+ case 'G':
+ return ntStart
+ case 'Z':
+ return ntEndZ
+ case 'z':
+ return ntEnd
+ default:
+ return ntNothing
+ }
+}
+
+// Scans whitespace or x-mode comments.
+func (p *parser) scanBlank() error {
+ if p.useOptionX() {
+ for {
+ for p.charsRight() > 0 && isSpace(p.rightChar(0)) {
+ p.moveRight(1)
+ }
+
+ if p.charsRight() == 0 {
+ break
+ }
+
+ if p.rightChar(0) == '#' {
+ for p.charsRight() > 0 && p.rightChar(0) != '\n' {
+ p.moveRight(1)
+ }
+ } else if p.charsRight() >= 3 && p.rightChar(2) == '#' &&
+ p.rightChar(1) == '?' && p.rightChar(0) == '(' {
+ for p.charsRight() > 0 && p.rightChar(0) != ')' {
+ p.moveRight(1)
+ }
+ if p.charsRight() == 0 {
+ return p.getErr(ErrUnterminatedComment)
+ }
+ p.moveRight(1)
+ } else {
+ break
+ }
+ }
+ } else {
+ for {
+ if p.charsRight() < 3 || p.rightChar(2) != '#' ||
+ p.rightChar(1) != '?' || p.rightChar(0) != '(' {
+ return nil
+ }
+
+ for p.charsRight() > 0 && p.rightChar(0) != ')' {
+ p.moveRight(1)
+ }
+ if p.charsRight() == 0 {
+ return p.getErr(ErrUnterminatedComment)
+ }
+ p.moveRight(1)
+ }
+ }
+ return nil
+}
+
+func (p *parser) scanCapname() string {
+ startpos := p.textpos()
+
+ for p.charsRight() > 0 {
+ if !IsWordChar(p.moveRightGetChar()) {
+ p.moveLeft()
+ break
+ }
+ }
+
+ return string(p.pattern[startpos:p.textpos()])
+}
+
+//Scans contents of [] (not including []'s), and converts to a set.
+func (p *parser) scanCharSet(caseInsensitive, scanOnly bool) (*CharSet, error) {
+ ch := '\x00'
+ chPrev := '\x00'
+ inRange := false
+ firstChar := true
+ closed := false
+
+ var cc *CharSet
+ if !scanOnly {
+ cc = &CharSet{}
+ }
+
+ if p.charsRight() > 0 && p.rightChar(0) == '^' {
+ p.moveRight(1)
+ if !scanOnly {
+ cc.negate = true
+ }
+ }
+
+ for ; p.charsRight() > 0; firstChar = false {
+ fTranslatedChar := false
+ ch = p.moveRightGetChar()
+ if ch == ']' {
+ if !firstChar {
+ closed = true
+ break
+ } else if p.useOptionE() {
+ if !scanOnly {
+ cc.addRanges(NoneClass().ranges)
+ }
+ closed = true
+ break
+ }
+
+ } else if ch == '\\' && p.charsRight() > 0 {
+ switch ch = p.moveRightGetChar(); ch {
+ case 'D', 'd':
+ if !scanOnly {
+ if inRange {
+ return nil, p.getErr(ErrBadClassInCharRange, ch)
+ }
+ cc.addDigit(p.useOptionE() || p.useRE2(), ch == 'D', p.patternRaw)
+ }
+ continue
+
+ case 'S', 's':
+ if !scanOnly {
+ if inRange {
+ return nil, p.getErr(ErrBadClassInCharRange, ch)
+ }
+ cc.addSpace(p.useOptionE(), p.useRE2(), ch == 'S')
+ }
+ continue
+
+ case 'W', 'w':
+ if !scanOnly {
+ if inRange {
+ return nil, p.getErr(ErrBadClassInCharRange, ch)
+ }
+
+ cc.addWord(p.useOptionE() || p.useRE2(), ch == 'W')
+ }
+ continue
+
+ case 'p', 'P':
+ if !scanOnly {
+ if inRange {
+ return nil, p.getErr(ErrBadClassInCharRange, ch)
+ }
+ prop, err := p.parseProperty()
+ if err != nil {
+ return nil, err
+ }
+ cc.addCategory(prop, (ch != 'p'), caseInsensitive, p.patternRaw)
+ } else {
+ p.parseProperty()
+ }
+
+ continue
+
+ case '-':
+ if !scanOnly {
+ cc.addRange(ch, ch)
+ }
+ continue
+
+ default:
+ p.moveLeft()
+ var err error
+ ch, err = p.scanCharEscape() // non-literal character
+ if err != nil {
+ return nil, err
+ }
+ fTranslatedChar = true
+ break // this break will only break out of the switch
+ }
+ } else if ch == '[' {
+ // This is code for Posix style properties - [:Ll:] or [:IsTibetan:].
+ // It currently doesn't do anything other than skip the whole thing!
+ if p.charsRight() > 0 && p.rightChar(0) == ':' && !inRange {
+ savePos := p.textpos()
+
+ p.moveRight(1)
+ negate := false
+ if p.charsRight() > 1 && p.rightChar(0) == '^' {
+ negate = true
+ p.moveRight(1)
+ }
+
+ nm := p.scanCapname() // snag the name
+ if !scanOnly && p.useRE2() {
+ // look up the name since these are valid for RE2
+ // add the group based on the name
+ if ok := cc.addNamedASCII(nm, negate); !ok {
+ return nil, p.getErr(ErrInvalidCharRange)
+ }
+ }
+ if p.charsRight() < 2 || p.moveRightGetChar() != ':' || p.moveRightGetChar() != ']' {
+ p.textto(savePos)
+ } else if p.useRE2() {
+ // move on
+ continue
+ }
+ }
+ }
+
+ if inRange {
+ inRange = false
+ if !scanOnly {
+ if ch == '[' && !fTranslatedChar && !firstChar {
+ // We thought we were in a range, but we're actually starting a subtraction.
+ // In that case, we'll add chPrev to our char class, skip the opening [, and
+ // scan the new character class recursively.
+ cc.addChar(chPrev)
+ sub, err := p.scanCharSet(caseInsensitive, false)
+ if err != nil {
+ return nil, err
+ }
+ cc.addSubtraction(sub)
+
+ if p.charsRight() > 0 && p.rightChar(0) != ']' {
+ return nil, p.getErr(ErrSubtractionMustBeLast)
+ }
+ } else {
+ // a regular range, like a-z
+ if chPrev > ch {
+ return nil, p.getErr(ErrReversedCharRange, chPrev, ch)
+ }
+ cc.addRange(chPrev, ch)
+ }
+ }
+ } else if p.charsRight() >= 2 && p.rightChar(0) == '-' && p.rightChar(1) != ']' {
+ // this could be the start of a range
+ chPrev = ch
+ inRange = true
+ p.moveRight(1)
+ } else if p.charsRight() >= 1 && ch == '-' && !fTranslatedChar && p.rightChar(0) == '[' && !firstChar {
+ // we aren't in a range, and now there is a subtraction. Usually this happens
+ // only when a subtraction follows a range, like [a-z-[b]]
+ if !scanOnly {
+ p.moveRight(1)
+ sub, err := p.scanCharSet(caseInsensitive, false)
+ if err != nil {
+ return nil, err
+ }
+ cc.addSubtraction(sub)
+
+ if p.charsRight() > 0 && p.rightChar(0) != ']' {
+ return nil, p.getErr(ErrSubtractionMustBeLast)
+ }
+ } else {
+ p.moveRight(1)
+ p.scanCharSet(caseInsensitive, true)
+ }
+ } else {
+ if !scanOnly {
+ cc.addRange(ch, ch)
+ }
+ }
+ }
+
+ if !closed {
+ return nil, p.getErr(ErrUnterminatedBracket)
+ }
+
+ if !scanOnly && caseInsensitive {
+ cc.addLowercase()
+ }
+
+ return cc, nil
+}
+
+// Scans any number of decimal digits (pegs value at 2^31-1 if too large)
+func (p *parser) scanDecimal() (int, error) {
+ i := 0
+ var d int
+
+ for p.charsRight() > 0 {
+ d = int(p.rightChar(0) - '0')
+ if d < 0 || d > 9 {
+ break
+ }
+ p.moveRight(1)
+
+ if i > maxValueDiv10 || (i == maxValueDiv10 && d > maxValueMod10) {
+ return 0, p.getErr(ErrCaptureGroupOutOfRange)
+ }
+
+ i *= 10
+ i += d
+ }
+
+ return int(i), nil
+}
+
+// Returns true for options allowed only at the top level
+func isOnlyTopOption(option RegexOptions) bool {
+ return option == RightToLeft || option == ECMAScript || option == RE2
+}
+
+// Scans cimsx-cimsx option string, stops at the first unrecognized char.
+func (p *parser) scanOptions() {
+
+ for off := false; p.charsRight() > 0; p.moveRight(1) {
+ ch := p.rightChar(0)
+
+ if ch == '-' {
+ off = true
+ } else if ch == '+' {
+ off = false
+ } else {
+ option := optionFromCode(ch)
+ if option == 0 || isOnlyTopOption(option) {
+ return
+ }
+
+ if off {
+ p.options &= ^option
+ } else {
+ p.options |= option
+ }
+ }
+ }
+}
+
+// Scans \ code for escape codes that map to single unicode chars.
+func (p *parser) scanCharEscape() (r rune, err error) {
+
+ ch := p.moveRightGetChar()
+
+ if ch >= '0' && ch <= '7' {
+ p.moveLeft()
+ return p.scanOctal(), nil
+ }
+
+ pos := p.textpos()
+
+ switch ch {
+ case 'x':
+ // support for \x{HEX} syntax from Perl and PCRE
+ if p.charsRight() > 0 && p.rightChar(0) == '{' {
+ if p.useOptionE() {
+ return ch, nil
+ }
+ p.moveRight(1)
+ return p.scanHexUntilBrace()
+ } else {
+ r, err = p.scanHex(2)
+ }
+ case 'u':
+ // ECMAscript suppot \u{HEX} only if `u` is also set
+ if p.useOptionE() && p.useOptionU() && p.charsRight() > 0 && p.rightChar(0) == '{' {
+ p.moveRight(1)
+ return p.scanHexUntilBrace()
+ } else {
+ r, err = p.scanHex(4)
+ }
+ case 'a':
+ return '\u0007', nil
+ case 'b':
+ return '\b', nil
+ case 'e':
+ return '\u001B', nil
+ case 'f':
+ return '\f', nil
+ case 'n':
+ return '\n', nil
+ case 'r':
+ return '\r', nil
+ case 't':
+ return '\t', nil
+ case 'v':
+ return '\u000B', nil
+ case 'c':
+ r, err = p.scanControl()
+ default:
+ if !p.useOptionE() && !p.useRE2() && IsWordChar(ch) {
+ return 0, p.getErr(ErrUnrecognizedEscape, string(ch))
+ }
+ return ch, nil
+ }
+ if err != nil && p.useOptionE() {
+ p.textto(pos)
+ return ch, nil
+ }
+ return
+}
+
+// Grabs and converts an ascii control character
+func (p *parser) scanControl() (rune, error) {
+ if p.charsRight() <= 0 {
+ return 0, p.getErr(ErrMissingControl)
+ }
+
+ ch := p.moveRightGetChar()
+
+ // \ca interpreted as \cA
+
+ if ch >= 'a' && ch <= 'z' {
+ ch = (ch - ('a' - 'A'))
+ }
+ ch = (ch - '@')
+ if ch >= 0 && ch < ' ' {
+ return ch, nil
+ }
+
+ return 0, p.getErr(ErrUnrecognizedControl)
+
+}
+
+// Scan hex digits until we hit a closing brace.
+// Non-hex digits, hex value too large for UTF-8, or running out of chars are errors
+func (p *parser) scanHexUntilBrace() (rune, error) {
+ // PCRE spec reads like unlimited hex digits are allowed, but unicode has a limit
+ // so we can enforce that
+ i := 0
+ hasContent := false
+
+ for p.charsRight() > 0 {
+ ch := p.moveRightGetChar()
+ if ch == '}' {
+ // hit our close brace, we're done here
+ // prevent \x{}
+ if !hasContent {
+ return 0, p.getErr(ErrTooFewHex)
+ }
+ return rune(i), nil
+ }
+ hasContent = true
+ // no brace needs to be hex digit
+ d := hexDigit(ch)
+ if d < 0 {
+ return 0, p.getErr(ErrMissingBrace)
+ }
+
+ i *= 0x10
+ i += d
+
+ if i > unicode.MaxRune {
+ return 0, p.getErr(ErrInvalidHex)
+ }
+ }
+
+ // we only make it here if we run out of digits without finding the brace
+ return 0, p.getErr(ErrMissingBrace)
+}
+
+// Scans exactly c hex digits (c=2 for \xFF, c=4 for \uFFFF)
+func (p *parser) scanHex(c int) (rune, error) {
+
+ i := 0
+
+ if p.charsRight() >= c {
+ for c > 0 {
+ d := hexDigit(p.moveRightGetChar())
+ if d < 0 {
+ break
+ }
+ i *= 0x10
+ i += d
+ c--
+ }
+ }
+
+ if c > 0 {
+ return 0, p.getErr(ErrTooFewHex)
+ }
+
+ return rune(i), nil
+}
+
+// Returns n <= 0xF for a hex digit.
+func hexDigit(ch rune) int {
+
+ if d := uint(ch - '0'); d <= 9 {
+ return int(d)
+ }
+
+ if d := uint(ch - 'a'); d <= 5 {
+ return int(d + 0xa)
+ }
+
+ if d := uint(ch - 'A'); d <= 5 {
+ return int(d + 0xa)
+ }
+
+ return -1
+}
+
+// Scans up to three octal digits (stops before exceeding 0377).
+func (p *parser) scanOctal() rune {
+ // Consume octal chars only up to 3 digits and value 0377
+
+ c := 3
+
+ if c > p.charsRight() {
+ c = p.charsRight()
+ }
+
+ //we know the first char is good because the caller had to check
+ i := 0
+ d := int(p.rightChar(0) - '0')
+ for c > 0 && d <= 7 && d >= 0 {
+ if i >= 0x20 && p.useOptionE() {
+ break
+ }
+ i *= 8
+ i += d
+ c--
+
+ p.moveRight(1)
+ if !p.rightMost() {
+ d = int(p.rightChar(0) - '0')
+ }
+ }
+
+ // Octal codes only go up to 255. Any larger and the behavior that Perl follows
+ // is simply to truncate the high bits.
+ i &= 0xFF
+
+ return rune(i)
+}
+
+// Returns the current parsing position.
+func (p *parser) textpos() int {
+ return p.currentPos
+}
+
+// Zaps to a specific parsing position.
+func (p *parser) textto(pos int) {
+ p.currentPos = pos
+}
+
+// Returns the char at the right of the current parsing position and advances to the right.
+func (p *parser) moveRightGetChar() rune {
+ ch := p.pattern[p.currentPos]
+ p.currentPos++
+ return ch
+}
+
+// Moves the current position to the right.
+func (p *parser) moveRight(i int) {
+ // default would be 1
+ p.currentPos += i
+}
+
+// Moves the current parsing position one to the left.
+func (p *parser) moveLeft() {
+ p.currentPos--
+}
+
+// Returns the char left of the current parsing position.
+func (p *parser) charAt(i int) rune {
+ return p.pattern[i]
+}
+
+// Returns the char i chars right of the current parsing position.
+func (p *parser) rightChar(i int) rune {
+ // default would be 0
+ return p.pattern[p.currentPos+i]
+}
+
+// Number of characters to the right of the current parsing position.
+func (p *parser) charsRight() int {
+ return len(p.pattern) - p.currentPos
+}
+
+func (p *parser) rightMost() bool {
+ return p.currentPos == len(p.pattern)
+}
+
+// Looks up the slot number for a given name
+func (p *parser) captureSlotFromName(capname string) int {
+ return p.capnames[capname]
+}
+
+// True if the capture slot was noted
+func (p *parser) isCaptureSlot(i int) bool {
+ if p.caps != nil {
+ _, ok := p.caps[i]
+ return ok
+ }
+
+ return (i >= 0 && i < p.capsize)
+}
+
+// Looks up the slot number for a given name
+func (p *parser) isCaptureName(capname string) bool {
+ if p.capnames == nil {
+ return false
+ }
+
+ _, ok := p.capnames[capname]
+ return ok
+}
+
+// option shortcuts
+
+// True if N option disabling '(' autocapture is on.
+func (p *parser) useOptionN() bool {
+ return (p.options & ExplicitCapture) != 0
+}
+
+// True if I option enabling case-insensitivity is on.
+func (p *parser) useOptionI() bool {
+ return (p.options & IgnoreCase) != 0
+}
+
+// True if M option altering meaning of $ and ^ is on.
+func (p *parser) useOptionM() bool {
+ return (p.options & Multiline) != 0
+}
+
+// True if S option altering meaning of . is on.
+func (p *parser) useOptionS() bool {
+ return (p.options & Singleline) != 0
+}
+
+// True if X option enabling whitespace/comment mode is on.
+func (p *parser) useOptionX() bool {
+ return (p.options & IgnorePatternWhitespace) != 0
+}
+
+// True if E option enabling ECMAScript behavior on.
+func (p *parser) useOptionE() bool {
+ return (p.options & ECMAScript) != 0
+}
+
+// true to use RE2 compatibility parsing behavior.
+func (p *parser) useRE2() bool {
+ return (p.options & RE2) != 0
+}
+
+// True if U option enabling ECMAScript's Unicode behavior on.
+func (p *parser) useOptionU() bool {
+ return (p.options & Unicode) != 0
+}
+
+// True if options stack is empty.
+func (p *parser) emptyOptionsStack() bool {
+ return len(p.optionsStack) == 0
+}
+
+// Finish the current quantifiable (when a quantifier is not found or is not possible)
+func (p *parser) addConcatenate() {
+ // The first (| inside a Testgroup group goes directly to the group
+ p.concatenation.addChild(p.unit)
+ p.unit = nil
+}
+
+// Finish the current quantifiable (when a quantifier is found)
+func (p *parser) addConcatenate3(lazy bool, min, max int) {
+ p.concatenation.addChild(p.unit.makeQuantifier(lazy, min, max))
+ p.unit = nil
+}
+
+// Sets the current unit to a single char node
+func (p *parser) addUnitOne(ch rune) {
+ if p.useOptionI() {
+ ch = unicode.ToLower(ch)
+ }
+
+ p.unit = newRegexNodeCh(ntOne, p.options, ch)
+}
+
+// Sets the current unit to a single inverse-char node
+func (p *parser) addUnitNotone(ch rune) {
+ if p.useOptionI() {
+ ch = unicode.ToLower(ch)
+ }
+
+ p.unit = newRegexNodeCh(ntNotone, p.options, ch)
+}
+
+// Sets the current unit to a single set node
+func (p *parser) addUnitSet(set *CharSet) {
+ p.unit = newRegexNodeSet(ntSet, p.options, set)
+}
+
+// Sets the current unit to a subtree
+func (p *parser) addUnitNode(node *regexNode) {
+ p.unit = node
+}
+
+// Sets the current unit to an assertion of the specified type
+func (p *parser) addUnitType(t nodeType) {
+ p.unit = newRegexNode(t, p.options)
+}
+
+// Finish the current group (in response to a ')' or end)
+func (p *parser) addGroup() error {
+ if p.group.t == ntTestgroup || p.group.t == ntTestref {
+ p.group.addChild(p.concatenation.reverseLeft())
+ if (p.group.t == ntTestref && len(p.group.children) > 2) || len(p.group.children) > 3 {
+ return p.getErr(ErrTooManyAlternates)
+ }
+ } else {
+ p.alternation.addChild(p.concatenation.reverseLeft())
+ p.group.addChild(p.alternation)
+ }
+
+ p.unit = p.group
+ return nil
+}
+
+// Pops the option stack, but keeps the current options unchanged.
+func (p *parser) popKeepOptions() {
+ lastIdx := len(p.optionsStack) - 1
+ p.optionsStack = p.optionsStack[:lastIdx]
+}
+
+// Recalls options from the stack.
+func (p *parser) popOptions() {
+ lastIdx := len(p.optionsStack) - 1
+ // get the last item on the stack and then remove it by reslicing
+ p.options = p.optionsStack[lastIdx]
+ p.optionsStack = p.optionsStack[:lastIdx]
+}
+
+// Saves options on a stack.
+func (p *parser) pushOptions() {
+ p.optionsStack = append(p.optionsStack, p.options)
+}
+
+// Add a string to the last concatenate.
+func (p *parser) addToConcatenate(pos, cch int, isReplacement bool) {
+ var node *regexNode
+
+ if cch == 0 {
+ return
+ }
+
+ if cch > 1 {
+ str := make([]rune, cch)
+ copy(str, p.pattern[pos:pos+cch])
+
+ if p.useOptionI() && !isReplacement {
+ // We do the ToLower character by character for consistency. With surrogate chars, doing
+ // a ToLower on the entire string could actually change the surrogate pair. This is more correct
+ // linguistically, but since Regex doesn't support surrogates, it's more important to be
+ // consistent.
+ for i := 0; i < len(str); i++ {
+ str[i] = unicode.ToLower(str[i])
+ }
+ }
+
+ node = newRegexNodeStr(ntMulti, p.options, str)
+ } else {
+ ch := p.charAt(pos)
+
+ if p.useOptionI() && !isReplacement {
+ ch = unicode.ToLower(ch)
+ }
+
+ node = newRegexNodeCh(ntOne, p.options, ch)
+ }
+
+ p.concatenation.addChild(node)
+}
+
+// Push the parser state (in response to an open paren)
+func (p *parser) pushGroup() {
+ p.group.next = p.stack
+ p.alternation.next = p.group
+ p.concatenation.next = p.alternation
+ p.stack = p.concatenation
+}
+
+// Remember the pushed state (in response to a ')')
+func (p *parser) popGroup() error {
+ p.concatenation = p.stack
+ p.alternation = p.concatenation.next
+ p.group = p.alternation.next
+ p.stack = p.group.next
+
+ // The first () inside a Testgroup group goes directly to the group
+ if p.group.t == ntTestgroup && len(p.group.children) == 0 {
+ if p.unit == nil {
+ return p.getErr(ErrConditionalExpression)
+ }
+
+ p.group.addChild(p.unit)
+ p.unit = nil
+ }
+ return nil
+}
+
+// True if the group stack is empty.
+func (p *parser) emptyStack() bool {
+ return p.stack == nil
+}
+
+// Start a new round for the parser state (in response to an open paren or string start)
+func (p *parser) startGroup(openGroup *regexNode) {
+ p.group = openGroup
+ p.alternation = newRegexNode(ntAlternate, p.options)
+ p.concatenation = newRegexNode(ntConcatenate, p.options)
+}
+
+// Finish the current concatenation (in response to a |)
+func (p *parser) addAlternate() {
+ // The | parts inside a Testgroup group go directly to the group
+
+ if p.group.t == ntTestgroup || p.group.t == ntTestref {
+ p.group.addChild(p.concatenation.reverseLeft())
+ } else {
+ p.alternation.addChild(p.concatenation.reverseLeft())
+ }
+
+ p.concatenation = newRegexNode(ntConcatenate, p.options)
+}
+
+// For categorizing ascii characters.
+
+const (
+ Q byte = 5 // quantifier
+ S = 4 // ordinary stopper
+ Z = 3 // ScanBlank stopper
+ X = 2 // whitespace
+ E = 1 // should be escaped
+)
+
+var _category = []byte{
+ //01 2 3 4 5 6 7 8 9 A B C D E F 0 1 2 3 4 5 6 7 8 9 A B C D E F
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, X, X, X, X, X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ // ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 5 6 7 8 9 : ; < = > ?
+ X, 0, 0, Z, S, 0, 0, 0, S, S, Q, Q, 0, 0, S, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, Q,
+ //@A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^ _
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, S, S, 0, S, 0,
+ //'a b c d e f g h i j k l m n o p q r s t u v w x y z { | } ~
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, Q, S, 0, 0, 0,
+}
+
+func isSpace(ch rune) bool {
+ return (ch <= ' ' && _category[ch] == X)
+}
+
+// Returns true for those characters that terminate a string of ordinary chars.
+func isSpecial(ch rune) bool {
+ return (ch <= '|' && _category[ch] >= S)
+}
+
+// Returns true for those characters that terminate a string of ordinary chars.
+func isStopperX(ch rune) bool {
+ return (ch <= '|' && _category[ch] >= X)
+}
+
+// Returns true for those characters that begin a quantifier.
+func isQuantifier(ch rune) bool {
+ return (ch <= '{' && _category[ch] >= Q)
+}
+
+func (p *parser) isTrueQuantifier() bool {
+ nChars := p.charsRight()
+ if nChars == 0 {
+ return false
+ }
+
+ startpos := p.textpos()
+ ch := p.charAt(startpos)
+ if ch != '{' {
+ return ch <= '{' && _category[ch] >= Q
+ }
+
+ //UGLY: this is ugly -- the original code was ugly too
+ pos := startpos
+ for {
+ nChars--
+ if nChars <= 0 {
+ break
+ }
+ pos++
+ ch = p.charAt(pos)
+ if ch < '0' || ch > '9' {
+ break
+ }
+ }
+
+ if nChars == 0 || pos-startpos == 1 {
+ return false
+ }
+ if ch == '}' {
+ return true
+ }
+ if ch != ',' {
+ return false
+ }
+ for {
+ nChars--
+ if nChars <= 0 {
+ break
+ }
+ pos++
+ ch = p.charAt(pos)
+ if ch < '0' || ch > '9' {
+ break
+ }
+ }
+
+ return nChars > 0 && ch == '}'
+}