summaryrefslogtreecommitdiff
path: root/vendor/github.com/dlclark/regexp2/syntax
diff options
context:
space:
mode:
authorMitja Felicijan <mitja.felicijan@gmail.com>2024-10-25 00:47:47 +0200
committerMitja Felicijan <mitja.felicijan@gmail.com>2024-10-25 00:47:47 +0200
commitc6cc0108ca7738023b45e0eeac0fa2390532dd93 (patch)
tree36890e6cd3091bbab8efbe686cc56f467f645bfd /vendor/github.com/dlclark/regexp2/syntax
parent0130404a1dc663d4aa68d780c9bcb23a4243e68d (diff)
downloadjbmafp-c6cc0108ca7738023b45e0eeac0fa2390532dd93.tar.gz
Added vendor lock on depsHEADmaster
Diffstat (limited to 'vendor/github.com/dlclark/regexp2/syntax')
-rw-r--r--vendor/github.com/dlclark/regexp2/syntax/charclass.go865
-rw-r--r--vendor/github.com/dlclark/regexp2/syntax/code.go274
-rw-r--r--vendor/github.com/dlclark/regexp2/syntax/escape.go94
-rw-r--r--vendor/github.com/dlclark/regexp2/syntax/fuzz.go20
-rw-r--r--vendor/github.com/dlclark/regexp2/syntax/parser.go2251
-rw-r--r--vendor/github.com/dlclark/regexp2/syntax/prefix.go896
-rw-r--r--vendor/github.com/dlclark/regexp2/syntax/replacerdata.go87
-rw-r--r--vendor/github.com/dlclark/regexp2/syntax/tree.go654
-rw-r--r--vendor/github.com/dlclark/regexp2/syntax/writer.go500
9 files changed, 5641 insertions, 0 deletions
diff --git a/vendor/github.com/dlclark/regexp2/syntax/charclass.go b/vendor/github.com/dlclark/regexp2/syntax/charclass.go
new file mode 100644
index 0000000..6881a0e
--- /dev/null
+++ b/vendor/github.com/dlclark/regexp2/syntax/charclass.go
@@ -0,0 +1,865 @@
+package syntax
+
+import (
+ "bytes"
+ "encoding/binary"
+ "fmt"
+ "sort"
+ "unicode"
+ "unicode/utf8"
+)
+
+// CharSet combines start-end rune ranges and unicode categories representing a set of characters
+type CharSet struct {
+ ranges []singleRange
+ categories []category
+ sub *CharSet //optional subtractor
+ negate bool
+ anything bool
+}
+
+type category struct {
+ negate bool
+ cat string
+}
+
+type singleRange struct {
+ first rune
+ last rune
+}
+
+const (
+ spaceCategoryText = " "
+ wordCategoryText = "W"
+)
+
+var (
+ ecmaSpace = []rune{0x0009, 0x000e, 0x0020, 0x0021, 0x00a0, 0x00a1, 0x1680, 0x1681, 0x2000, 0x200b, 0x2028, 0x202a, 0x202f, 0x2030, 0x205f, 0x2060, 0x3000, 0x3001, 0xfeff, 0xff00}
+ ecmaWord = []rune{0x0030, 0x003a, 0x0041, 0x005b, 0x005f, 0x0060, 0x0061, 0x007b}
+ ecmaDigit = []rune{0x0030, 0x003a}
+
+ re2Space = []rune{0x0009, 0x000b, 0x000c, 0x000e, 0x0020, 0x0021}
+)
+
+var (
+ AnyClass = getCharSetFromOldString([]rune{0}, false)
+ ECMAAnyClass = getCharSetFromOldString([]rune{0, 0x000a, 0x000b, 0x000d, 0x000e}, false)
+ NoneClass = getCharSetFromOldString(nil, false)
+ ECMAWordClass = getCharSetFromOldString(ecmaWord, false)
+ NotECMAWordClass = getCharSetFromOldString(ecmaWord, true)
+ ECMASpaceClass = getCharSetFromOldString(ecmaSpace, false)
+ NotECMASpaceClass = getCharSetFromOldString(ecmaSpace, true)
+ ECMADigitClass = getCharSetFromOldString(ecmaDigit, false)
+ NotECMADigitClass = getCharSetFromOldString(ecmaDigit, true)
+
+ WordClass = getCharSetFromCategoryString(false, false, wordCategoryText)
+ NotWordClass = getCharSetFromCategoryString(true, false, wordCategoryText)
+ SpaceClass = getCharSetFromCategoryString(false, false, spaceCategoryText)
+ NotSpaceClass = getCharSetFromCategoryString(true, false, spaceCategoryText)
+ DigitClass = getCharSetFromCategoryString(false, false, "Nd")
+ NotDigitClass = getCharSetFromCategoryString(false, true, "Nd")
+
+ RE2SpaceClass = getCharSetFromOldString(re2Space, false)
+ NotRE2SpaceClass = getCharSetFromOldString(re2Space, true)
+)
+
+var unicodeCategories = func() map[string]*unicode.RangeTable {
+ retVal := make(map[string]*unicode.RangeTable)
+ for k, v := range unicode.Scripts {
+ retVal[k] = v
+ }
+ for k, v := range unicode.Categories {
+ retVal[k] = v
+ }
+ for k, v := range unicode.Properties {
+ retVal[k] = v
+ }
+ return retVal
+}()
+
+func getCharSetFromCategoryString(negateSet bool, negateCat bool, cats ...string) func() *CharSet {
+ if negateCat && negateSet {
+ panic("BUG! You should only negate the set OR the category in a constant setup, but not both")
+ }
+
+ c := CharSet{negate: negateSet}
+
+ c.categories = make([]category, len(cats))
+ for i, cat := range cats {
+ c.categories[i] = category{cat: cat, negate: negateCat}
+ }
+ return func() *CharSet {
+ //make a copy each time
+ local := c
+ //return that address
+ return &local
+ }
+}
+
+func getCharSetFromOldString(setText []rune, negate bool) func() *CharSet {
+ c := CharSet{}
+ if len(setText) > 0 {
+ fillFirst := false
+ l := len(setText)
+ if negate {
+ if setText[0] == 0 {
+ setText = setText[1:]
+ } else {
+ l++
+ fillFirst = true
+ }
+ }
+
+ if l%2 == 0 {
+ c.ranges = make([]singleRange, l/2)
+ } else {
+ c.ranges = make([]singleRange, l/2+1)
+ }
+
+ first := true
+ if fillFirst {
+ c.ranges[0] = singleRange{first: 0}
+ first = false
+ }
+
+ i := 0
+ for _, r := range setText {
+ if first {
+ // lower bound in a new range
+ c.ranges[i] = singleRange{first: r}
+ first = false
+ } else {
+ c.ranges[i].last = r - 1
+ i++
+ first = true
+ }
+ }
+ if !first {
+ c.ranges[i].last = utf8.MaxRune
+ }
+ }
+
+ return func() *CharSet {
+ local := c
+ return &local
+ }
+}
+
+// Copy makes a deep copy to prevent accidental mutation of a set
+func (c CharSet) Copy() CharSet {
+ ret := CharSet{
+ anything: c.anything,
+ negate: c.negate,
+ }
+
+ ret.ranges = append(ret.ranges, c.ranges...)
+ ret.categories = append(ret.categories, c.categories...)
+
+ if c.sub != nil {
+ sub := c.sub.Copy()
+ ret.sub = &sub
+ }
+
+ return ret
+}
+
+// gets a human-readable description for a set string
+func (c CharSet) String() string {
+ buf := &bytes.Buffer{}
+ buf.WriteRune('[')
+
+ if c.IsNegated() {
+ buf.WriteRune('^')
+ }
+
+ for _, r := range c.ranges {
+
+ buf.WriteString(CharDescription(r.first))
+ if r.first != r.last {
+ if r.last-r.first != 1 {
+ //groups that are 1 char apart skip the dash
+ buf.WriteRune('-')
+ }
+ buf.WriteString(CharDescription(r.last))
+ }
+ }
+
+ for _, c := range c.categories {
+ buf.WriteString(c.String())
+ }
+
+ if c.sub != nil {
+ buf.WriteRune('-')
+ buf.WriteString(c.sub.String())
+ }
+
+ buf.WriteRune(']')
+
+ return buf.String()
+}
+
+// mapHashFill converts a charset into a buffer for use in maps
+func (c CharSet) mapHashFill(buf *bytes.Buffer) {
+ if c.negate {
+ buf.WriteByte(0)
+ } else {
+ buf.WriteByte(1)
+ }
+
+ binary.Write(buf, binary.LittleEndian, len(c.ranges))
+ binary.Write(buf, binary.LittleEndian, len(c.categories))
+ for _, r := range c.ranges {
+ buf.WriteRune(r.first)
+ buf.WriteRune(r.last)
+ }
+ for _, ct := range c.categories {
+ buf.WriteString(ct.cat)
+ if ct.negate {
+ buf.WriteByte(1)
+ } else {
+ buf.WriteByte(0)
+ }
+ }
+
+ if c.sub != nil {
+ c.sub.mapHashFill(buf)
+ }
+}
+
+// CharIn returns true if the rune is in our character set (either ranges or categories).
+// It handles negations and subtracted sub-charsets.
+func (c CharSet) CharIn(ch rune) bool {
+ val := false
+ // in s && !s.subtracted
+
+ //check ranges
+ for _, r := range c.ranges {
+ if ch < r.first {
+ continue
+ }
+ if ch <= r.last {
+ val = true
+ break
+ }
+ }
+
+ //check categories if we haven't already found a range
+ if !val && len(c.categories) > 0 {
+ for _, ct := range c.categories {
+ // special categories...then unicode
+ if ct.cat == spaceCategoryText {
+ if unicode.IsSpace(ch) {
+ // we found a space so we're done
+ // negate means this is a "bad" thing
+ val = !ct.negate
+ break
+ } else if ct.negate {
+ val = true
+ break
+ }
+ } else if ct.cat == wordCategoryText {
+ if IsWordChar(ch) {
+ val = !ct.negate
+ break
+ } else if ct.negate {
+ val = true
+ break
+ }
+ } else if unicode.Is(unicodeCategories[ct.cat], ch) {
+ // if we're in this unicode category then we're done
+ // if negate=true on this category then we "failed" our test
+ // otherwise we're good that we found it
+ val = !ct.negate
+ break
+ } else if ct.negate {
+ val = true
+ break
+ }
+ }
+ }
+
+ // negate the whole char set
+ if c.negate {
+ val = !val
+ }
+
+ // get subtracted recurse
+ if val && c.sub != nil {
+ val = !c.sub.CharIn(ch)
+ }
+
+ //log.Printf("Char '%v' in %v == %v", string(ch), c.String(), val)
+ return val
+}
+
+func (c category) String() string {
+ switch c.cat {
+ case spaceCategoryText:
+ if c.negate {
+ return "\\S"
+ }
+ return "\\s"
+ case wordCategoryText:
+ if c.negate {
+ return "\\W"
+ }
+ return "\\w"
+ }
+ if _, ok := unicodeCategories[c.cat]; ok {
+
+ if c.negate {
+ return "\\P{" + c.cat + "}"
+ }
+ return "\\p{" + c.cat + "}"
+ }
+ return "Unknown category: " + c.cat
+}
+
+// CharDescription Produces a human-readable description for a single character.
+func CharDescription(ch rune) string {
+ /*if ch == '\\' {
+ return "\\\\"
+ }
+
+ if ch > ' ' && ch <= '~' {
+ return string(ch)
+ } else if ch == '\n' {
+ return "\\n"
+ } else if ch == ' ' {
+ return "\\ "
+ }*/
+
+ b := &bytes.Buffer{}
+ escape(b, ch, false) //fmt.Sprintf("%U", ch)
+ return b.String()
+}
+
+// According to UTS#18 Unicode Regular Expressions (http://www.unicode.org/reports/tr18/)
+// RL 1.4 Simple Word Boundaries The class of <word_character> includes all Alphabetic
+// values from the Unicode character database, from UnicodeData.txt [UData], plus the U+200C
+// ZERO WIDTH NON-JOINER and U+200D ZERO WIDTH JOINER.
+func IsWordChar(r rune) bool {
+ //"L", "Mn", "Nd", "Pc"
+ return unicode.In(r,
+ unicode.Categories["L"], unicode.Categories["Mn"],
+ unicode.Categories["Nd"], unicode.Categories["Pc"]) || r == '\u200D' || r == '\u200C'
+ //return 'A' <= r && r <= 'Z' || 'a' <= r && r <= 'z' || '0' <= r && r <= '9' || r == '_'
+}
+
+func IsECMAWordChar(r rune) bool {
+ return unicode.In(r,
+ unicode.Categories["L"], unicode.Categories["Mn"],
+ unicode.Categories["Nd"], unicode.Categories["Pc"])
+
+ //return 'A' <= r && r <= 'Z' || 'a' <= r && r <= 'z' || '0' <= r && r <= '9' || r == '_'
+}
+
+// SingletonChar will return the char from the first range without validation.
+// It assumes you have checked for IsSingleton or IsSingletonInverse and will panic given bad input
+func (c CharSet) SingletonChar() rune {
+ return c.ranges[0].first
+}
+
+func (c CharSet) IsSingleton() bool {
+ return !c.negate && //negated is multiple chars
+ len(c.categories) == 0 && len(c.ranges) == 1 && // multiple ranges and unicode classes represent multiple chars
+ c.sub == nil && // subtraction means we've got multiple chars
+ c.ranges[0].first == c.ranges[0].last // first and last equal means we're just 1 char
+}
+
+func (c CharSet) IsSingletonInverse() bool {
+ return c.negate && //same as above, but requires negated
+ len(c.categories) == 0 && len(c.ranges) == 1 && // multiple ranges and unicode classes represent multiple chars
+ c.sub == nil && // subtraction means we've got multiple chars
+ c.ranges[0].first == c.ranges[0].last // first and last equal means we're just 1 char
+}
+
+func (c CharSet) IsMergeable() bool {
+ return !c.IsNegated() && !c.HasSubtraction()
+}
+
+func (c CharSet) IsNegated() bool {
+ return c.negate
+}
+
+func (c CharSet) HasSubtraction() bool {
+ return c.sub != nil
+}
+
+func (c CharSet) IsEmpty() bool {
+ return len(c.ranges) == 0 && len(c.categories) == 0 && c.sub == nil
+}
+
+func (c *CharSet) addDigit(ecma, negate bool, pattern string) {
+ if ecma {
+ if negate {
+ c.addRanges(NotECMADigitClass().ranges)
+ } else {
+ c.addRanges(ECMADigitClass().ranges)
+ }
+ } else {
+ c.addCategories(category{cat: "Nd", negate: negate})
+ }
+}
+
+func (c *CharSet) addChar(ch rune) {
+ c.addRange(ch, ch)
+}
+
+func (c *CharSet) addSpace(ecma, re2, negate bool) {
+ if ecma {
+ if negate {
+ c.addRanges(NotECMASpaceClass().ranges)
+ } else {
+ c.addRanges(ECMASpaceClass().ranges)
+ }
+ } else if re2 {
+ if negate {
+ c.addRanges(NotRE2SpaceClass().ranges)
+ } else {
+ c.addRanges(RE2SpaceClass().ranges)
+ }
+ } else {
+ c.addCategories(category{cat: spaceCategoryText, negate: negate})
+ }
+}
+
+func (c *CharSet) addWord(ecma, negate bool) {
+ if ecma {
+ if negate {
+ c.addRanges(NotECMAWordClass().ranges)
+ } else {
+ c.addRanges(ECMAWordClass().ranges)
+ }
+ } else {
+ c.addCategories(category{cat: wordCategoryText, negate: negate})
+ }
+}
+
+// Add set ranges and categories into ours -- no deduping or anything
+func (c *CharSet) addSet(set CharSet) {
+ if c.anything {
+ return
+ }
+ if set.anything {
+ c.makeAnything()
+ return
+ }
+ // just append here to prevent double-canon
+ c.ranges = append(c.ranges, set.ranges...)
+ c.addCategories(set.categories...)
+ c.canonicalize()
+}
+
+func (c *CharSet) makeAnything() {
+ c.anything = true
+ c.categories = []category{}
+ c.ranges = AnyClass().ranges
+}
+
+func (c *CharSet) addCategories(cats ...category) {
+ // don't add dupes and remove positive+negative
+ if c.anything {
+ // if we've had a previous positive+negative group then
+ // just return, we're as broad as we can get
+ return
+ }
+
+ for _, ct := range cats {
+ found := false
+ for _, ct2 := range c.categories {
+ if ct.cat == ct2.cat {
+ if ct.negate != ct2.negate {
+ // oposite negations...this mean we just
+ // take us as anything and move on
+ c.makeAnything()
+ return
+ }
+ found = true
+ break
+ }
+ }
+
+ if !found {
+ c.categories = append(c.categories, ct)
+ }
+ }
+}
+
+// Merges new ranges to our own
+func (c *CharSet) addRanges(ranges []singleRange) {
+ if c.anything {
+ return
+ }
+ c.ranges = append(c.ranges, ranges...)
+ c.canonicalize()
+}
+
+// Merges everything but the new ranges into our own
+func (c *CharSet) addNegativeRanges(ranges []singleRange) {
+ if c.anything {
+ return
+ }
+
+ var hi rune
+
+ // convert incoming ranges into opposites, assume they are in order
+ for _, r := range ranges {
+ if hi < r.first {
+ c.ranges = append(c.ranges, singleRange{hi, r.first - 1})
+ }
+ hi = r.last + 1
+ }
+
+ if hi < utf8.MaxRune {
+ c.ranges = append(c.ranges, singleRange{hi, utf8.MaxRune})
+ }
+
+ c.canonicalize()
+}
+
+func isValidUnicodeCat(catName string) bool {
+ _, ok := unicodeCategories[catName]
+ return ok
+}
+
+func (c *CharSet) addCategory(categoryName string, negate, caseInsensitive bool, pattern string) {
+ if !isValidUnicodeCat(categoryName) {
+ // unknown unicode category, script, or property "blah"
+ panic(fmt.Errorf("Unknown unicode category, script, or property '%v'", categoryName))
+
+ }
+
+ if caseInsensitive && (categoryName == "Ll" || categoryName == "Lu" || categoryName == "Lt") {
+ // when RegexOptions.IgnoreCase is specified then {Ll} {Lu} and {Lt} cases should all match
+ c.addCategories(
+ category{cat: "Ll", negate: negate},
+ category{cat: "Lu", negate: negate},
+ category{cat: "Lt", negate: negate})
+ }
+ c.addCategories(category{cat: categoryName, negate: negate})
+}
+
+func (c *CharSet) addSubtraction(sub *CharSet) {
+ c.sub = sub
+}
+
+func (c *CharSet) addRange(chMin, chMax rune) {
+ c.ranges = append(c.ranges, singleRange{first: chMin, last: chMax})
+ c.canonicalize()
+}
+
+func (c *CharSet) addNamedASCII(name string, negate bool) bool {
+ var rs []singleRange
+
+ switch name {
+ case "alnum":
+ rs = []singleRange{singleRange{'0', '9'}, singleRange{'A', 'Z'}, singleRange{'a', 'z'}}
+ case "alpha":
+ rs = []singleRange{singleRange{'A', 'Z'}, singleRange{'a', 'z'}}
+ case "ascii":
+ rs = []singleRange{singleRange{0, 0x7f}}
+ case "blank":
+ rs = []singleRange{singleRange{'\t', '\t'}, singleRange{' ', ' '}}
+ case "cntrl":
+ rs = []singleRange{singleRange{0, 0x1f}, singleRange{0x7f, 0x7f}}
+ case "digit":
+ c.addDigit(false, negate, "")
+ case "graph":
+ rs = []singleRange{singleRange{'!', '~'}}
+ case "lower":
+ rs = []singleRange{singleRange{'a', 'z'}}
+ case "print":
+ rs = []singleRange{singleRange{' ', '~'}}
+ case "punct": //[!-/:-@[-`{-~]
+ rs = []singleRange{singleRange{'!', '/'}, singleRange{':', '@'}, singleRange{'[', '`'}, singleRange{'{', '~'}}
+ case "space":
+ c.addSpace(true, false, negate)
+ case "upper":
+ rs = []singleRange{singleRange{'A', 'Z'}}
+ case "word":
+ c.addWord(true, negate)
+ case "xdigit":
+ rs = []singleRange{singleRange{'0', '9'}, singleRange{'A', 'F'}, singleRange{'a', 'f'}}
+ default:
+ return false
+ }
+
+ if len(rs) > 0 {
+ if negate {
+ c.addNegativeRanges(rs)
+ } else {
+ c.addRanges(rs)
+ }
+ }
+
+ return true
+}
+
+type singleRangeSorter []singleRange
+
+func (p singleRangeSorter) Len() int { return len(p) }
+func (p singleRangeSorter) Less(i, j int) bool { return p[i].first < p[j].first }
+func (p singleRangeSorter) Swap(i, j int) { p[i], p[j] = p[j], p[i] }
+
+// Logic to reduce a character class to a unique, sorted form.
+func (c *CharSet) canonicalize() {
+ var i, j int
+ var last rune
+
+ //
+ // Find and eliminate overlapping or abutting ranges
+ //
+
+ if len(c.ranges) > 1 {
+ sort.Sort(singleRangeSorter(c.ranges))
+
+ done := false
+
+ for i, j = 1, 0; ; i++ {
+ for last = c.ranges[j].last; ; i++ {
+ if i == len(c.ranges) || last == utf8.MaxRune {
+ done = true
+ break
+ }
+
+ CurrentRange := c.ranges[i]
+ if CurrentRange.first > last+1 {
+ break
+ }
+
+ if last < CurrentRange.last {
+ last = CurrentRange.last
+ }
+ }
+
+ c.ranges[j] = singleRange{first: c.ranges[j].first, last: last}
+
+ j++
+
+ if done {
+ break
+ }
+
+ if j < i {
+ c.ranges[j] = c.ranges[i]
+ }
+ }
+
+ c.ranges = append(c.ranges[:j], c.ranges[len(c.ranges):]...)
+ }
+}
+
+// Adds to the class any lowercase versions of characters already
+// in the class. Used for case-insensitivity.
+func (c *CharSet) addLowercase() {
+ if c.anything {
+ return
+ }
+ toAdd := []singleRange{}
+ for i := 0; i < len(c.ranges); i++ {
+ r := c.ranges[i]
+ if r.first == r.last {
+ lower := unicode.ToLower(r.first)
+ c.ranges[i] = singleRange{first: lower, last: lower}
+ } else {
+ toAdd = append(toAdd, r)
+ }
+ }
+
+ for _, r := range toAdd {
+ c.addLowercaseRange(r.first, r.last)
+ }
+ c.canonicalize()
+}
+
+/**************************************************************************
+ Let U be the set of Unicode character values and let L be the lowercase
+ function, mapping from U to U. To perform case insensitive matching of
+ character sets, we need to be able to map an interval I in U, say
+
+ I = [chMin, chMax] = { ch : chMin <= ch <= chMax }
+
+ to a set A such that A contains L(I) and A is contained in the union of
+ I and L(I).
+
+ The table below partitions U into intervals on which L is non-decreasing.
+ Thus, for any interval J = [a, b] contained in one of these intervals,
+ L(J) is contained in [L(a), L(b)].
+
+ It is also true that for any such J, [L(a), L(b)] is contained in the
+ union of J and L(J). This does not follow from L being non-decreasing on
+ these intervals. It follows from the nature of the L on each interval.
+ On each interval, L has one of the following forms:
+
+ (1) L(ch) = constant (LowercaseSet)
+ (2) L(ch) = ch + offset (LowercaseAdd)
+ (3) L(ch) = ch | 1 (LowercaseBor)
+ (4) L(ch) = ch + (ch & 1) (LowercaseBad)
+
+ It is easy to verify that for any of these forms [L(a), L(b)] is
+ contained in the union of [a, b] and L([a, b]).
+***************************************************************************/
+
+const (
+ LowercaseSet = 0 // Set to arg.
+ LowercaseAdd = 1 // Add arg.
+ LowercaseBor = 2 // Bitwise or with 1.
+ LowercaseBad = 3 // Bitwise and with 1 and add original.
+)
+
+type lcMap struct {
+ chMin, chMax rune
+ op, data int32
+}
+
+var lcTable = []lcMap{
+ lcMap{'\u0041', '\u005A', LowercaseAdd, 32},
+ lcMap{'\u00C0', '\u00DE', LowercaseAdd, 32},
+ lcMap{'\u0100', '\u012E', LowercaseBor, 0},
+ lcMap{'\u0130', '\u0130', LowercaseSet, 0x0069},
+ lcMap{'\u0132', '\u0136', LowercaseBor, 0},
+ lcMap{'\u0139', '\u0147', LowercaseBad, 0},
+ lcMap{'\u014A', '\u0176', LowercaseBor, 0},
+ lcMap{'\u0178', '\u0178', LowercaseSet, 0x00FF},
+ lcMap{'\u0179', '\u017D', LowercaseBad, 0},
+ lcMap{'\u0181', '\u0181', LowercaseSet, 0x0253},
+ lcMap{'\u0182', '\u0184', LowercaseBor, 0},
+ lcMap{'\u0186', '\u0186', LowercaseSet, 0x0254},
+ lcMap{'\u0187', '\u0187', LowercaseSet, 0x0188},
+ lcMap{'\u0189', '\u018A', LowercaseAdd, 205},
+ lcMap{'\u018B', '\u018B', LowercaseSet, 0x018C},
+ lcMap{'\u018E', '\u018E', LowercaseSet, 0x01DD},
+ lcMap{'\u018F', '\u018F', LowercaseSet, 0x0259},
+ lcMap{'\u0190', '\u0190', LowercaseSet, 0x025B},
+ lcMap{'\u0191', '\u0191', LowercaseSet, 0x0192},
+ lcMap{'\u0193', '\u0193', LowercaseSet, 0x0260},
+ lcMap{'\u0194', '\u0194', LowercaseSet, 0x0263},
+ lcMap{'\u0196', '\u0196', LowercaseSet, 0x0269},
+ lcMap{'\u0197', '\u0197', LowercaseSet, 0x0268},
+ lcMap{'\u0198', '\u0198', LowercaseSet, 0x0199},
+ lcMap{'\u019C', '\u019C', LowercaseSet, 0x026F},
+ lcMap{'\u019D', '\u019D', LowercaseSet, 0x0272},
+ lcMap{'\u019F', '\u019F', LowercaseSet, 0x0275},
+ lcMap{'\u01A0', '\u01A4', LowercaseBor, 0},
+ lcMap{'\u01A7', '\u01A7', LowercaseSet, 0x01A8},
+ lcMap{'\u01A9', '\u01A9', LowercaseSet, 0x0283},
+ lcMap{'\u01AC', '\u01AC', LowercaseSet, 0x01AD},
+ lcMap{'\u01AE', '\u01AE', LowercaseSet, 0x0288},
+ lcMap{'\u01AF', '\u01AF', LowercaseSet, 0x01B0},
+ lcMap{'\u01B1', '\u01B2', LowercaseAdd, 217},
+ lcMap{'\u01B3', '\u01B5', LowercaseBad, 0},
+ lcMap{'\u01B7', '\u01B7', LowercaseSet, 0x0292},
+ lcMap{'\u01B8', '\u01B8', LowercaseSet, 0x01B9},
+ lcMap{'\u01BC', '\u01BC', LowercaseSet, 0x01BD},
+ lcMap{'\u01C4', '\u01C5', LowercaseSet, 0x01C6},
+ lcMap{'\u01C7', '\u01C8', LowercaseSet, 0x01C9},
+ lcMap{'\u01CA', '\u01CB', LowercaseSet, 0x01CC},
+ lcMap{'\u01CD', '\u01DB', LowercaseBad, 0},
+ lcMap{'\u01DE', '\u01EE', LowercaseBor, 0},
+ lcMap{'\u01F1', '\u01F2', LowercaseSet, 0x01F3},
+ lcMap{'\u01F4', '\u01F4', LowercaseSet, 0x01F5},
+ lcMap{'\u01FA', '\u0216', LowercaseBor, 0},
+ lcMap{'\u0386', '\u0386', LowercaseSet, 0x03AC},
+ lcMap{'\u0388', '\u038A', LowercaseAdd, 37},
+ lcMap{'\u038C', '\u038C', LowercaseSet, 0x03CC},
+ lcMap{'\u038E', '\u038F', LowercaseAdd, 63},
+ lcMap{'\u0391', '\u03AB', LowercaseAdd, 32},
+ lcMap{'\u03E2', '\u03EE', LowercaseBor, 0},
+ lcMap{'\u0401', '\u040F', LowercaseAdd, 80},
+ lcMap{'\u0410', '\u042F', LowercaseAdd, 32},
+ lcMap{'\u0460', '\u0480', LowercaseBor, 0},
+ lcMap{'\u0490', '\u04BE', LowercaseBor, 0},
+ lcMap{'\u04C1', '\u04C3', LowercaseBad, 0},
+ lcMap{'\u04C7', '\u04C7', LowercaseSet, 0x04C8},
+ lcMap{'\u04CB', '\u04CB', LowercaseSet, 0x04CC},
+ lcMap{'\u04D0', '\u04EA', LowercaseBor, 0},
+ lcMap{'\u04EE', '\u04F4', LowercaseBor, 0},
+ lcMap{'\u04F8', '\u04F8', LowercaseSet, 0x04F9},
+ lcMap{'\u0531', '\u0556', LowercaseAdd, 48},
+ lcMap{'\u10A0', '\u10C5', LowercaseAdd, 48},
+ lcMap{'\u1E00', '\u1EF8', LowercaseBor, 0},
+ lcMap{'\u1F08', '\u1F0F', LowercaseAdd, -8},
+ lcMap{'\u1F18', '\u1F1F', LowercaseAdd, -8},
+ lcMap{'\u1F28', '\u1F2F', LowercaseAdd, -8},
+ lcMap{'\u1F38', '\u1F3F', LowercaseAdd, -8},
+ lcMap{'\u1F48', '\u1F4D', LowercaseAdd, -8},
+ lcMap{'\u1F59', '\u1F59', LowercaseSet, 0x1F51},
+ lcMap{'\u1F5B', '\u1F5B', LowercaseSet, 0x1F53},
+ lcMap{'\u1F5D', '\u1F5D', LowercaseSet, 0x1F55},
+ lcMap{'\u1F5F', '\u1F5F', LowercaseSet, 0x1F57},
+ lcMap{'\u1F68', '\u1F6F', LowercaseAdd, -8},
+ lcMap{'\u1F88', '\u1F8F', LowercaseAdd, -8},
+ lcMap{'\u1F98', '\u1F9F', LowercaseAdd, -8},
+ lcMap{'\u1FA8', '\u1FAF', LowercaseAdd, -8},
+ lcMap{'\u1FB8', '\u1FB9', LowercaseAdd, -8},
+ lcMap{'\u1FBA', '\u1FBB', LowercaseAdd, -74},
+ lcMap{'\u1FBC', '\u1FBC', LowercaseSet, 0x1FB3},
+ lcMap{'\u1FC8', '\u1FCB', LowercaseAdd, -86},
+ lcMap{'\u1FCC', '\u1FCC', LowercaseSet, 0x1FC3},
+ lcMap{'\u1FD8', '\u1FD9', LowercaseAdd, -8},
+ lcMap{'\u1FDA', '\u1FDB', LowercaseAdd, -100},
+ lcMap{'\u1FE8', '\u1FE9', LowercaseAdd, -8},
+ lcMap{'\u1FEA', '\u1FEB', LowercaseAdd, -112},
+ lcMap{'\u1FEC', '\u1FEC', LowercaseSet, 0x1FE5},
+ lcMap{'\u1FF8', '\u1FF9', LowercaseAdd, -128},
+ lcMap{'\u1FFA', '\u1FFB', LowercaseAdd, -126},
+ lcMap{'\u1FFC', '\u1FFC', LowercaseSet, 0x1FF3},
+ lcMap{'\u2160', '\u216F', LowercaseAdd, 16},
+ lcMap{'\u24B6', '\u24D0', LowercaseAdd, 26},
+ lcMap{'\uFF21', '\uFF3A', LowercaseAdd, 32},
+}
+
+func (c *CharSet) addLowercaseRange(chMin, chMax rune) {
+ var i, iMax, iMid int
+ var chMinT, chMaxT rune
+ var lc lcMap
+
+ for i, iMax = 0, len(lcTable); i < iMax; {
+ iMid = (i + iMax) / 2
+ if lcTable[iMid].chMax < chMin {
+ i = iMid + 1
+ } else {
+ iMax = iMid
+ }
+ }
+
+ for ; i < len(lcTable); i++ {
+ lc = lcTable[i]
+ if lc.chMin > chMax {
+ return
+ }
+ chMinT = lc.chMin
+ if chMinT < chMin {
+ chMinT = chMin
+ }
+
+ chMaxT = lc.chMax
+ if chMaxT > chMax {
+ chMaxT = chMax
+ }
+
+ switch lc.op {
+ case LowercaseSet:
+ chMinT = rune(lc.data)
+ chMaxT = rune(lc.data)
+ break
+ case LowercaseAdd:
+ chMinT += lc.data
+ chMaxT += lc.data
+ break
+ case LowercaseBor:
+ chMinT |= 1
+ chMaxT |= 1
+ break
+ case LowercaseBad:
+ chMinT += (chMinT & 1)
+ chMaxT += (chMaxT & 1)
+ break
+ }
+
+ if chMinT < chMin || chMaxT > chMax {
+ c.addRange(chMinT, chMaxT)
+ }
+ }
+}
diff --git a/vendor/github.com/dlclark/regexp2/syntax/code.go b/vendor/github.com/dlclark/regexp2/syntax/code.go
new file mode 100644
index 0000000..686e822
--- /dev/null
+++ b/vendor/github.com/dlclark/regexp2/syntax/code.go
@@ -0,0 +1,274 @@
+package syntax
+
+import (
+ "bytes"
+ "fmt"
+ "math"
+)
+
+// similar to prog.go in the go regex package...also with comment 'may not belong in this package'
+
+// File provides operator constants for use by the Builder and the Machine.
+
+// Implementation notes:
+//
+// Regexps are built into RegexCodes, which contain an operation array,
+// a string table, and some constants.
+//
+// Each operation is one of the codes below, followed by the integer
+// operands specified for each op.
+//
+// Strings and sets are indices into a string table.
+
+type InstOp int
+
+const (
+ // lef/back operands description
+
+ Onerep InstOp = 0 // lef,back char,min,max a {n}
+ Notonerep = 1 // lef,back char,min,max .{n}
+ Setrep = 2 // lef,back set,min,max [\d]{n}
+
+ Oneloop = 3 // lef,back char,min,max a {,n}
+ Notoneloop = 4 // lef,back char,min,max .{,n}
+ Setloop = 5 // lef,back set,min,max [\d]{,n}
+
+ Onelazy = 6 // lef,back char,min,max a {,n}?
+ Notonelazy = 7 // lef,back char,min,max .{,n}?
+ Setlazy = 8 // lef,back set,min,max [\d]{,n}?
+
+ One = 9 // lef char a
+ Notone = 10 // lef char [^a]
+ Set = 11 // lef set [a-z\s] \w \s \d
+
+ Multi = 12 // lef string abcd
+ Ref = 13 // lef group \#
+
+ Bol = 14 // ^
+ Eol = 15 // $
+ Boundary = 16 // \b
+ Nonboundary = 17 // \B
+ Beginning = 18 // \A
+ Start = 19 // \G
+ EndZ = 20 // \Z
+ End = 21 // \Z
+
+ Nothing = 22 // Reject!
+
+ // Primitive control structures
+
+ Lazybranch = 23 // back jump straight first
+ Branchmark = 24 // back jump branch first for loop
+ Lazybranchmark = 25 // back jump straight first for loop
+ Nullcount = 26 // back val set counter, null mark
+ Setcount = 27 // back val set counter, make mark
+ Branchcount = 28 // back jump,limit branch++ if zero<=c<limit
+ Lazybranchcount = 29 // back jump,limit same, but straight first
+ Nullmark = 30 // back save position
+ Setmark = 31 // back save position
+ Capturemark = 32 // back group define group
+ Getmark = 33 // back recall position
+ Setjump = 34 // back save backtrack state
+ Backjump = 35 // zap back to saved state
+ Forejump = 36 // zap backtracking state
+ Testref = 37 // backtrack if ref undefined
+ Goto = 38 // jump just go
+
+ Prune = 39 // prune it baby
+ Stop = 40 // done!
+
+ ECMABoundary = 41 // \b
+ NonECMABoundary = 42 // \B
+
+ // Modifiers for alternate modes
+
+ Mask = 63 // Mask to get unmodified ordinary operator
+ Rtl = 64 // bit to indicate that we're reverse scanning.
+ Back = 128 // bit to indicate that we're backtracking.
+ Back2 = 256 // bit to indicate that we're backtracking on a second branch.
+ Ci = 512 // bit to indicate that we're case-insensitive.
+)
+
+type Code struct {
+ Codes []int // the code
+ Strings [][]rune // string table
+ Sets []*CharSet //character set table
+ TrackCount int // how many instructions use backtracking
+ Caps map[int]int // mapping of user group numbers -> impl group slots
+ Capsize int // number of impl group slots
+ FcPrefix *Prefix // the set of candidate first characters (may be null)
+ BmPrefix *BmPrefix // the fixed prefix string as a Boyer-Moore machine (may be null)
+ Anchors AnchorLoc // the set of zero-length start anchors (RegexFCD.Bol, etc)
+ RightToLeft bool // true if right to left
+}
+
+func opcodeBacktracks(op InstOp) bool {
+ op &= Mask
+
+ switch op {
+ case Oneloop, Notoneloop, Setloop, Onelazy, Notonelazy, Setlazy, Lazybranch, Branchmark, Lazybranchmark,
+ Nullcount, Setcount, Branchcount, Lazybranchcount, Setmark, Capturemark, Getmark, Setjump, Backjump,
+ Forejump, Goto:
+ return true
+
+ default:
+ return false
+ }
+}
+
+func opcodeSize(op InstOp) int {
+ op &= Mask
+
+ switch op {
+ case Nothing, Bol, Eol, Boundary, Nonboundary, ECMABoundary, NonECMABoundary, Beginning, Start, EndZ,
+ End, Nullmark, Setmark, Getmark, Setjump, Backjump, Forejump, Stop:
+ return 1
+
+ case One, Notone, Multi, Ref, Testref, Goto, Nullcount, Setcount, Lazybranch, Branchmark, Lazybranchmark,
+ Prune, Set:
+ return 2
+
+ case Capturemark, Branchcount, Lazybranchcount, Onerep, Notonerep, Oneloop, Notoneloop, Onelazy, Notonelazy,
+ Setlazy, Setrep, Setloop:
+ return 3
+
+ default:
+ panic(fmt.Errorf("Unexpected op code: %v", op))
+ }
+}
+
+var codeStr = []string{
+ "Onerep", "Notonerep", "Setrep",
+ "Oneloop", "Notoneloop", "Setloop",
+ "Onelazy", "Notonelazy", "Setlazy",
+ "One", "Notone", "Set",
+ "Multi", "Ref",
+ "Bol", "Eol", "Boundary", "Nonboundary", "Beginning", "Start", "EndZ", "End",
+ "Nothing",
+ "Lazybranch", "Branchmark", "Lazybranchmark",
+ "Nullcount", "Setcount", "Branchcount", "Lazybranchcount",
+ "Nullmark", "Setmark", "Capturemark", "Getmark",
+ "Setjump", "Backjump", "Forejump", "Testref", "Goto",
+ "Prune", "Stop",
+ "ECMABoundary", "NonECMABoundary",
+}
+
+func operatorDescription(op InstOp) string {
+ desc := codeStr[op&Mask]
+ if (op & Ci) != 0 {
+ desc += "-Ci"
+ }
+ if (op & Rtl) != 0 {
+ desc += "-Rtl"
+ }
+ if (op & Back) != 0 {
+ desc += "-Back"
+ }
+ if (op & Back2) != 0 {
+ desc += "-Back2"
+ }
+
+ return desc
+}
+
+// OpcodeDescription is a humman readable string of the specific offset
+func (c *Code) OpcodeDescription(offset int) string {
+ buf := &bytes.Buffer{}
+
+ op := InstOp(c.Codes[offset])
+ fmt.Fprintf(buf, "%06d ", offset)
+
+ if opcodeBacktracks(op & Mask) {
+ buf.WriteString("*")
+ } else {
+ buf.WriteString(" ")
+ }
+ buf.WriteString(operatorDescription(op))
+ buf.WriteString("(")
+ op &= Mask
+
+ switch op {
+ case One, Notone, Onerep, Notonerep, Oneloop, Notoneloop, Onelazy, Notonelazy:
+ buf.WriteString("Ch = ")
+ buf.WriteString(CharDescription(rune(c.Codes[offset+1])))
+
+ case Set, Setrep, Setloop, Setlazy:
+ buf.WriteString("Set = ")
+ buf.WriteString(c.Sets[c.Codes[offset+1]].String())
+
+ case Multi:
+ fmt.Fprintf(buf, "String = %s", string(c.Strings[c.Codes[offset+1]]))
+
+ case Ref, Testref:
+ fmt.Fprintf(buf, "Index = %d", c.Codes[offset+1])
+
+ case Capturemark:
+ fmt.Fprintf(buf, "Index = %d", c.Codes[offset+1])
+ if c.Codes[offset+2] != -1 {
+ fmt.Fprintf(buf, ", Unindex = %d", c.Codes[offset+2])
+ }
+
+ case Nullcount, Setcount:
+ fmt.Fprintf(buf, "Value = %d", c.Codes[offset+1])
+
+ case Goto, Lazybranch, Branchmark, Lazybranchmark, Branchcount, Lazybranchcount:
+ fmt.Fprintf(buf, "Addr = %d", c.Codes[offset+1])
+ }
+
+ switch op {
+ case Onerep, Notonerep, Oneloop, Notoneloop, Onelazy, Notonelazy, Setrep, Setloop, Setlazy:
+ buf.WriteString(", Rep = ")
+ if c.Codes[offset+2] == math.MaxInt32 {
+ buf.WriteString("inf")
+ } else {
+ fmt.Fprintf(buf, "%d", c.Codes[offset+2])
+ }
+
+ case Branchcount, Lazybranchcount:
+ buf.WriteString(", Limit = ")
+ if c.Codes[offset+2] == math.MaxInt32 {
+ buf.WriteString("inf")
+ } else {
+ fmt.Fprintf(buf, "%d", c.Codes[offset+2])
+ }
+
+ }
+
+ buf.WriteString(")")
+
+ return buf.String()
+}
+
+func (c *Code) Dump() string {
+ buf := &bytes.Buffer{}
+
+ if c.RightToLeft {
+ fmt.Fprintln(buf, "Direction: right-to-left")
+ } else {
+ fmt.Fprintln(buf, "Direction: left-to-right")
+ }
+ if c.FcPrefix == nil {
+ fmt.Fprintln(buf, "Firstchars: n/a")
+ } else {
+ fmt.Fprintf(buf, "Firstchars: %v\n", c.FcPrefix.PrefixSet.String())
+ }
+
+ if c.BmPrefix == nil {
+ fmt.Fprintln(buf, "Prefix: n/a")
+ } else {
+ fmt.Fprintf(buf, "Prefix: %v\n", Escape(c.BmPrefix.String()))
+ }
+
+ fmt.Fprintf(buf, "Anchors: %v\n", c.Anchors)
+ fmt.Fprintln(buf)
+
+ if c.BmPrefix != nil {
+ fmt.Fprintln(buf, "BoyerMoore:")
+ fmt.Fprintln(buf, c.BmPrefix.Dump(" "))
+ }
+ for i := 0; i < len(c.Codes); i += opcodeSize(InstOp(c.Codes[i])) {
+ fmt.Fprintln(buf, c.OpcodeDescription(i))
+ }
+
+ return buf.String()
+}
diff --git a/vendor/github.com/dlclark/regexp2/syntax/escape.go b/vendor/github.com/dlclark/regexp2/syntax/escape.go
new file mode 100644
index 0000000..609df10
--- /dev/null
+++ b/vendor/github.com/dlclark/regexp2/syntax/escape.go
@@ -0,0 +1,94 @@
+package syntax
+
+import (
+ "bytes"
+ "strconv"
+ "strings"
+ "unicode"
+)
+
+func Escape(input string) string {
+ b := &bytes.Buffer{}
+ for _, r := range input {
+ escape(b, r, false)
+ }
+ return b.String()
+}
+
+const meta = `\.+*?()|[]{}^$# `
+
+func escape(b *bytes.Buffer, r rune, force bool) {
+ if unicode.IsPrint(r) {
+ if strings.IndexRune(meta, r) >= 0 || force {
+ b.WriteRune('\\')
+ }
+ b.WriteRune(r)
+ return
+ }
+
+ switch r {
+ case '\a':
+ b.WriteString(`\a`)
+ case '\f':
+ b.WriteString(`\f`)
+ case '\n':
+ b.WriteString(`\n`)
+ case '\r':
+ b.WriteString(`\r`)
+ case '\t':
+ b.WriteString(`\t`)
+ case '\v':
+ b.WriteString(`\v`)
+ default:
+ if r < 0x100 {
+ b.WriteString(`\x`)
+ s := strconv.FormatInt(int64(r), 16)
+ if len(s) == 1 {
+ b.WriteRune('0')
+ }
+ b.WriteString(s)
+ break
+ }
+ b.WriteString(`\u`)
+ b.WriteString(strconv.FormatInt(int64(r), 16))
+ }
+}
+
+func Unescape(input string) (string, error) {
+ idx := strings.IndexRune(input, '\\')
+ // no slashes means no unescape needed
+ if idx == -1 {
+ return input, nil
+ }
+
+ buf := bytes.NewBufferString(input[:idx])
+ // get the runes for the rest of the string -- we're going full parser scan on this
+
+ p := parser{}
+ p.setPattern(input[idx+1:])
+ for {
+ if p.rightMost() {
+ return "", p.getErr(ErrIllegalEndEscape)
+ }
+ r, err := p.scanCharEscape()
+ if err != nil {
+ return "", err
+ }
+ buf.WriteRune(r)
+ // are we done?
+ if p.rightMost() {
+ return buf.String(), nil
+ }
+
+ r = p.moveRightGetChar()
+ for r != '\\' {
+ buf.WriteRune(r)
+ if p.rightMost() {
+ // we're done, no more slashes
+ return buf.String(), nil
+ }
+ // keep scanning until we get another slash
+ r = p.moveRightGetChar()
+ }
+ }
+}
diff --git a/vendor/github.com/dlclark/regexp2/syntax/fuzz.go b/vendor/github.com/dlclark/regexp2/syntax/fuzz.go
new file mode 100644
index 0000000..ee86386
--- /dev/null
+++ b/vendor/github.com/dlclark/regexp2/syntax/fuzz.go
@@ -0,0 +1,20 @@
+// +build gofuzz
+
+package syntax
+
+// Fuzz is the input point for go-fuzz
+func Fuzz(data []byte) int {
+ sdata := string(data)
+ tree, err := Parse(sdata, RegexOptions(0))
+ if err != nil {
+ return 0
+ }
+
+ // translate it to code
+ _, err = Write(tree)
+ if err != nil {
+ panic(err)
+ }
+
+ return 1
+}
diff --git a/vendor/github.com/dlclark/regexp2/syntax/parser.go b/vendor/github.com/dlclark/regexp2/syntax/parser.go
new file mode 100644
index 0000000..9dc6e31
--- /dev/null
+++ b/vendor/github.com/dlclark/regexp2/syntax/parser.go
@@ -0,0 +1,2251 @@
+package syntax
+
+import (
+ "fmt"
+ "math"
+ "os"
+ "sort"
+ "strconv"
+ "unicode"
+)
+
+type RegexOptions int32
+
+const (
+ IgnoreCase RegexOptions = 0x0001 // "i"
+ Multiline = 0x0002 // "m"
+ ExplicitCapture = 0x0004 // "n"
+ Compiled = 0x0008 // "c"
+ Singleline = 0x0010 // "s"
+ IgnorePatternWhitespace = 0x0020 // "x"
+ RightToLeft = 0x0040 // "r"
+ Debug = 0x0080 // "d"
+ ECMAScript = 0x0100 // "e"
+ RE2 = 0x0200 // RE2 compat mode
+ Unicode = 0x0400 // "u"
+)
+
+func optionFromCode(ch rune) RegexOptions {
+ // case-insensitive
+ switch ch {
+ case 'i', 'I':
+ return IgnoreCase
+ case 'r', 'R':
+ return RightToLeft
+ case 'm', 'M':
+ return Multiline
+ case 'n', 'N':
+ return ExplicitCapture
+ case 's', 'S':
+ return Singleline
+ case 'x', 'X':
+ return IgnorePatternWhitespace
+ case 'd', 'D':
+ return Debug
+ case 'e', 'E':
+ return ECMAScript
+ case 'u', 'U':
+ return Unicode
+ default:
+ return 0
+ }
+}
+
+// An Error describes a failure to parse a regular expression
+// and gives the offending expression.
+type Error struct {
+ Code ErrorCode
+ Expr string
+ Args []interface{}
+}
+
+func (e *Error) Error() string {
+ if len(e.Args) == 0 {
+ return "error parsing regexp: " + e.Code.String() + " in `" + e.Expr + "`"
+ }
+ return "error parsing regexp: " + fmt.Sprintf(e.Code.String(), e.Args...) + " in `" + e.Expr + "`"
+}
+
+// An ErrorCode describes a failure to parse a regular expression.
+type ErrorCode string
+
+const (
+ // internal issue
+ ErrInternalError ErrorCode = "regexp/syntax: internal error"
+ // Parser errors
+ ErrUnterminatedComment = "unterminated comment"
+ ErrInvalidCharRange = "invalid character class range"
+ ErrInvalidRepeatSize = "invalid repeat count"
+ ErrInvalidUTF8 = "invalid UTF-8"
+ ErrCaptureGroupOutOfRange = "capture group number out of range"
+ ErrUnexpectedParen = "unexpected )"
+ ErrMissingParen = "missing closing )"
+ ErrMissingBrace = "missing closing }"
+ ErrInvalidRepeatOp = "invalid nested repetition operator"
+ ErrMissingRepeatArgument = "missing argument to repetition operator"
+ ErrConditionalExpression = "illegal conditional (?(...)) expression"
+ ErrTooManyAlternates = "too many | in (?()|)"
+ ErrUnrecognizedGrouping = "unrecognized grouping construct: (%v"
+ ErrInvalidGroupName = "invalid group name: group names must begin with a word character and have a matching terminator"
+ ErrCapNumNotZero = "capture number cannot be zero"
+ ErrUndefinedBackRef = "reference to undefined group number %v"
+ ErrUndefinedNameRef = "reference to undefined group name %v"
+ ErrAlternationCantCapture = "alternation conditions do not capture and cannot be named"
+ ErrAlternationCantHaveComment = "alternation conditions cannot be comments"
+ ErrMalformedReference = "(?(%v) ) malformed"
+ ErrUndefinedReference = "(?(%v) ) reference to undefined group"
+ ErrIllegalEndEscape = "illegal \\ at end of pattern"
+ ErrMalformedSlashP = "malformed \\p{X} character escape"
+ ErrIncompleteSlashP = "incomplete \\p{X} character escape"
+ ErrUnknownSlashP = "unknown unicode category, script, or property '%v'"
+ ErrUnrecognizedEscape = "unrecognized escape sequence \\%v"
+ ErrMissingControl = "missing control character"
+ ErrUnrecognizedControl = "unrecognized control character"
+ ErrTooFewHex = "insufficient hexadecimal digits"
+ ErrInvalidHex = "hex values may not be larger than 0x10FFFF"
+ ErrMalformedNameRef = "malformed \\k<...> named back reference"
+ ErrBadClassInCharRange = "cannot include class \\%v in character range"
+ ErrUnterminatedBracket = "unterminated [] set"
+ ErrSubtractionMustBeLast = "a subtraction must be the last element in a character class"
+ ErrReversedCharRange = "[%c-%c] range in reverse order"
+)
+
+func (e ErrorCode) String() string {
+ return string(e)
+}
+
+type parser struct {
+ stack *regexNode
+ group *regexNode
+ alternation *regexNode
+ concatenation *regexNode
+ unit *regexNode
+
+ patternRaw string
+ pattern []rune
+
+ currentPos int
+ specialCase *unicode.SpecialCase
+
+ autocap int
+ capcount int
+ captop int
+ capsize int
+
+ caps map[int]int
+ capnames map[string]int
+
+ capnumlist []int
+ capnamelist []string
+
+ options RegexOptions
+ optionsStack []RegexOptions
+ ignoreNextParen bool
+}
+
+const (
+ maxValueDiv10 int = math.MaxInt32 / 10
+ maxValueMod10 = math.MaxInt32 % 10
+)
+
+// Parse converts a regex string into a parse tree
+func Parse(re string, op RegexOptions) (*RegexTree, error) {
+ p := parser{
+ options: op,
+ caps: make(map[int]int),
+ }
+ p.setPattern(re)
+
+ if err := p.countCaptures(); err != nil {
+ return nil, err
+ }
+
+ p.reset(op)
+ root, err := p.scanRegex()
+
+ if err != nil {
+ return nil, err
+ }
+ tree := &RegexTree{
+ root: root,
+ caps: p.caps,
+ capnumlist: p.capnumlist,
+ captop: p.captop,
+ Capnames: p.capnames,
+ Caplist: p.capnamelist,
+ options: op,
+ }
+
+ if tree.options&Debug > 0 {
+ os.Stdout.WriteString(tree.Dump())
+ }
+
+ return tree, nil
+}
+
+func (p *parser) setPattern(pattern string) {
+ p.patternRaw = pattern
+ p.pattern = make([]rune, 0, len(pattern))
+
+ //populate our rune array to handle utf8 encoding
+ for _, r := range pattern {
+ p.pattern = append(p.pattern, r)
+ }
+}
+func (p *parser) getErr(code ErrorCode, args ...interface{}) error {
+ return &Error{Code: code, Expr: p.patternRaw, Args: args}
+}
+
+func (p *parser) noteCaptureSlot(i, pos int) {
+ if _, ok := p.caps[i]; !ok {
+ // the rhs of the hashtable isn't used in the parser
+ p.caps[i] = pos
+ p.capcount++
+
+ if p.captop <= i {
+ if i == math.MaxInt32 {
+ p.captop = i
+ } else {
+ p.captop = i + 1
+ }
+ }
+ }
+}
+
+func (p *parser) noteCaptureName(name string, pos int) {
+ if p.capnames == nil {
+ p.capnames = make(map[string]int)
+ }
+
+ if _, ok := p.capnames[name]; !ok {
+ p.capnames[name] = pos
+ p.capnamelist = append(p.capnamelist, name)
+ }
+}
+
+func (p *parser) assignNameSlots() {
+ if p.capnames != nil {
+ for _, name := range p.capnamelist {
+ for p.isCaptureSlot(p.autocap) {
+ p.autocap++
+ }
+ pos := p.capnames[name]
+ p.capnames[name] = p.autocap
+ p.noteCaptureSlot(p.autocap, pos)
+
+ p.autocap++
+ }
+ }
+
+ // if the caps array has at least one gap, construct the list of used slots
+ if p.capcount < p.captop {
+ p.capnumlist = make([]int, p.capcount)
+ i := 0
+
+ for k := range p.caps {
+ p.capnumlist[i] = k
+ i++
+ }
+
+ sort.Ints(p.capnumlist)
+ }
+
+ // merge capsnumlist into capnamelist
+ if p.capnames != nil || p.capnumlist != nil {
+ var oldcapnamelist []string
+ var next int
+ var k int
+
+ if p.capnames == nil {
+ oldcapnamelist = nil
+ p.capnames = make(map[string]int)
+ p.capnamelist = []string{}
+ next = -1
+ } else {
+ oldcapnamelist = p.capnamelist
+ p.capnamelist = []string{}
+ next = p.capnames[oldcapnamelist[0]]
+ }
+
+ for i := 0; i < p.capcount; i++ {
+ j := i
+ if p.capnumlist != nil {
+ j = p.capnumlist[i]
+ }
+
+ if next == j {
+ p.capnamelist = append(p.capnamelist, oldcapnamelist[k])
+ k++
+
+ if k == len(oldcapnamelist) {
+ next = -1
+ } else {
+ next = p.capnames[oldcapnamelist[k]]
+ }
+
+ } else {
+ //feature: culture?
+ str := strconv.Itoa(j)
+ p.capnamelist = append(p.capnamelist, str)
+ p.capnames[str] = j
+ }
+ }
+ }
+}
+
+func (p *parser) consumeAutocap() int {
+ r := p.autocap
+ p.autocap++
+ return r
+}
+
+// CountCaptures is a prescanner for deducing the slots used for
+// captures by doing a partial tokenization of the pattern.
+func (p *parser) countCaptures() error {
+ var ch rune
+
+ p.noteCaptureSlot(0, 0)
+
+ p.autocap = 1
+
+ for p.charsRight() > 0 {
+ pos := p.textpos()
+ ch = p.moveRightGetChar()
+ switch ch {
+ case '\\':
+ if p.charsRight() > 0 {
+ p.scanBackslash(true)
+ }
+
+ case '#':
+ if p.useOptionX() {
+ p.moveLeft()
+ p.scanBlank()
+ }
+
+ case '[':
+ p.scanCharSet(false, true)
+
+ case ')':
+ if !p.emptyOptionsStack() {
+ p.popOptions()
+ }
+
+ case '(':
+ if p.charsRight() >= 2 && p.rightChar(1) == '#' && p.rightChar(0) == '?' {
+ p.moveLeft()
+ p.scanBlank()
+ } else {
+ p.pushOptions()
+ if p.charsRight() > 0 && p.rightChar(0) == '?' {
+ // we have (?...
+ p.moveRight(1)
+
+ if p.charsRight() > 1 && (p.rightChar(0) == '<' || p.rightChar(0) == '\'') {
+ // named group: (?<... or (?'...
+
+ p.moveRight(1)
+ ch = p.rightChar(0)
+
+ if ch != '0' && IsWordChar(ch) {
+ if ch >= '1' && ch <= '9' {
+ dec, err := p.scanDecimal()
+ if err != nil {
+ return err
+ }
+ p.noteCaptureSlot(dec, pos)
+ } else {
+ p.noteCaptureName(p.scanCapname(), pos)
+ }
+ }
+ } else if p.useRE2() && p.charsRight() > 2 && (p.rightChar(0) == 'P' && p.rightChar(1) == '<') {
+ // RE2-compat (?P<)
+ p.moveRight(2)
+ ch = p.rightChar(0)
+ if IsWordChar(ch) {
+ p.noteCaptureName(p.scanCapname(), pos)
+ }
+
+ } else {
+ // (?...
+
+ // get the options if it's an option construct (?cimsx-cimsx...)
+ p.scanOptions()
+
+ if p.charsRight() > 0 {
+ if p.rightChar(0) == ')' {
+ // (?cimsx-cimsx)
+ p.moveRight(1)
+ p.popKeepOptions()
+ } else if p.rightChar(0) == '(' {
+ // alternation construct: (?(foo)yes|no)
+ // ignore the next paren so we don't capture the condition
+ p.ignoreNextParen = true
+
+ // break from here so we don't reset ignoreNextParen
+ continue
+ }
+ }
+ }
+ } else {
+ if !p.useOptionN() && !p.ignoreNextParen {
+ p.noteCaptureSlot(p.consumeAutocap(), pos)
+ }
+ }
+ }
+
+ p.ignoreNextParen = false
+
+ }
+ }
+
+ p.assignNameSlots()
+ return nil
+}
+
+func (p *parser) reset(topopts RegexOptions) {
+ p.currentPos = 0
+ p.autocap = 1
+ p.ignoreNextParen = false
+
+ if len(p.optionsStack) > 0 {
+ p.optionsStack = p.optionsStack[:0]
+ }
+
+ p.options = topopts
+ p.stack = nil
+}
+
+func (p *parser) scanRegex() (*regexNode, error) {
+ ch := '@' // nonspecial ch, means at beginning
+ isQuant := false
+
+ p.startGroup(newRegexNodeMN(ntCapture, p.options, 0, -1))
+
+ for p.charsRight() > 0 {
+ wasPrevQuantifier := isQuant
+ isQuant = false
+
+ if err := p.scanBlank(); err != nil {
+ return nil, err
+ }
+
+ startpos := p.textpos()
+
+ // move past all of the normal characters. We'll stop when we hit some kind of control character,
+ // or if IgnorePatternWhiteSpace is on, we'll stop when we see some whitespace.
+ if p.useOptionX() {
+ for p.charsRight() > 0 {
+ ch = p.rightChar(0)
+ //UGLY: clean up, this is ugly
+ if !(!isStopperX(ch) || (ch == '{' && !p.isTrueQuantifier())) {
+ break
+ }
+ p.moveRight(1)
+ }
+ } else {
+ for p.charsRight() > 0 {
+ ch = p.rightChar(0)
+ if !(!isSpecial(ch) || ch == '{' && !p.isTrueQuantifier()) {
+ break
+ }
+ p.moveRight(1)
+ }
+ }
+
+ endpos := p.textpos()
+
+ p.scanBlank()
+
+ if p.charsRight() == 0 {
+ ch = '!' // nonspecial, means at end
+ } else if ch = p.rightChar(0); isSpecial(ch) {
+ isQuant = isQuantifier(ch)
+ p.moveRight(1)
+ } else {
+ ch = ' ' // nonspecial, means at ordinary char
+ }
+
+ if startpos < endpos {
+ cchUnquantified := endpos - startpos
+ if isQuant {
+ cchUnquantified--
+ }
+ wasPrevQuantifier = false
+
+ if cchUnquantified > 0 {
+ p.addToConcatenate(startpos, cchUnquantified, false)
+ }
+
+ if isQuant {
+ p.addUnitOne(p.charAt(endpos - 1))
+ }
+ }
+
+ switch ch {
+ case '!':
+ goto BreakOuterScan
+
+ case ' ':
+ goto ContinueOuterScan
+
+ case '[':
+ cc, err := p.scanCharSet(p.useOptionI(), false)
+ if err != nil {
+ return nil, err
+ }
+ p.addUnitSet(cc)
+
+ case '(':
+ p.pushOptions()
+
+ if grouper, err := p.scanGroupOpen(); err != nil {
+ return nil, err
+ } else if grouper == nil {
+ p.popKeepOptions()
+ } else {
+ p.pushGroup()
+ p.startGroup(grouper)
+ }
+
+ continue
+
+ case '|':
+ p.addAlternate()
+ goto ContinueOuterScan
+
+ case ')':
+ if p.emptyStack() {
+ return nil, p.getErr(ErrUnexpectedParen)
+ }
+
+ if err := p.addGroup(); err != nil {
+ return nil, err
+ }
+ if err := p.popGroup(); err != nil {
+ return nil, err
+ }
+ p.popOptions()
+
+ if p.unit == nil {
+ goto ContinueOuterScan
+ }
+
+ case '\\':
+ n, err := p.scanBackslash(false)
+ if err != nil {
+ return nil, err
+ }
+ p.addUnitNode(n)
+
+ case '^':
+ if p.useOptionM() {
+ p.addUnitType(ntBol)
+ } else {
+ p.addUnitType(ntBeginning)
+ }
+
+ case '$':
+ if p.useOptionM() {
+ p.addUnitType(ntEol)
+ } else {
+ p.addUnitType(ntEndZ)
+ }
+
+ case '.':
+ if p.useOptionE() {
+ p.addUnitSet(ECMAAnyClass())
+ } else if p.useOptionS() {
+ p.addUnitSet(AnyClass())
+ } else {
+ p.addUnitNotone('\n')
+ }
+
+ case '{', '*', '+', '?':
+ if p.unit == nil {
+ if wasPrevQuantifier {
+ return nil, p.getErr(ErrInvalidRepeatOp)
+ } else {
+ return nil, p.getErr(ErrMissingRepeatArgument)
+ }
+ }
+ p.moveLeft()
+
+ default:
+ return nil, p.getErr(ErrInternalError)
+ }
+
+ if err := p.scanBlank(); err != nil {
+ return nil, err
+ }
+
+ if p.charsRight() > 0 {
+ isQuant = p.isTrueQuantifier()
+ }
+ if p.charsRight() == 0 || !isQuant {
+ //maintain odd C# assignment order -- not sure if required, could clean up?
+ p.addConcatenate()
+ goto ContinueOuterScan
+ }
+
+ ch = p.moveRightGetChar()
+
+ // Handle quantifiers
+ for p.unit != nil {
+ var min, max int
+ var lazy bool
+
+ switch ch {
+ case '*':
+ min = 0
+ max = math.MaxInt32
+
+ case '?':
+ min = 0
+ max = 1
+
+ case '+':
+ min = 1
+ max = math.MaxInt32
+
+ case '{':
+ {
+ var err error
+ startpos = p.textpos()
+ if min, err = p.scanDecimal(); err != nil {
+ return nil, err
+ }
+ max = min
+ if startpos < p.textpos() {
+ if p.charsRight() > 0 && p.rightChar(0) == ',' {
+ p.moveRight(1)
+ if p.charsRight() == 0 || p.rightChar(0) == '}' {
+ max = math.MaxInt32
+ } else {
+ if max, err = p.scanDecimal(); err != nil {
+ return nil, err
+ }
+ }
+ }
+ }
+
+ if startpos == p.textpos() || p.charsRight() == 0 || p.moveRightGetChar() != '}' {
+ p.addConcatenate()
+ p.textto(startpos - 1)
+ goto ContinueOuterScan
+ }
+ }
+
+ default:
+ return nil, p.getErr(ErrInternalError)
+ }
+
+ if err := p.scanBlank(); err != nil {
+ return nil, err
+ }
+
+ if p.charsRight() == 0 || p.rightChar(0) != '?' {
+ lazy = false
+ } else {
+ p.moveRight(1)
+ lazy = true
+ }
+
+ if min > max {
+ return nil, p.getErr(ErrInvalidRepeatSize)
+ }
+
+ p.addConcatenate3(lazy, min, max)
+ }
+
+ ContinueOuterScan:
+ }
+
+BreakOuterScan:
+ ;
+
+ if !p.emptyStack() {
+ return nil, p.getErr(ErrMissingParen)
+ }
+
+ if err := p.addGroup(); err != nil {
+ return nil, err
+ }
+
+ return p.unit, nil
+
+}
+
+/*
+ * Simple parsing for replacement patterns
+ */
+func (p *parser) scanReplacement() (*regexNode, error) {
+ var c, startpos int
+
+ p.concatenation = newRegexNode(ntConcatenate, p.options)
+
+ for {
+ c = p.charsRight()
+ if c == 0 {
+ break
+ }
+
+ startpos = p.textpos()
+
+ for c > 0 && p.rightChar(0) != '$' {
+ p.moveRight(1)
+ c--
+ }
+
+ p.addToConcatenate(startpos, p.textpos()-startpos, true)
+
+ if c > 0 {
+ if p.moveRightGetChar() == '$' {
+ n, err := p.scanDollar()
+ if err != nil {
+ return nil, err
+ }
+ p.addUnitNode(n)
+ }
+ p.addConcatenate()
+ }
+ }
+
+ return p.concatenation, nil
+}
+
+/*
+ * Scans $ patterns recognized within replacement patterns
+ */
+func (p *parser) scanDollar() (*regexNode, error) {
+ if p.charsRight() == 0 {
+ return newRegexNodeCh(ntOne, p.options, '$'), nil
+ }
+
+ ch := p.rightChar(0)
+ angled := false
+ backpos := p.textpos()
+ lastEndPos := backpos
+
+ // Note angle
+
+ if ch == '{' && p.charsRight() > 1 {
+ angled = true
+ p.moveRight(1)
+ ch = p.rightChar(0)
+ }
+
+ // Try to parse backreference: \1 or \{1} or \{cap}
+
+ if ch >= '0' && ch <= '9' {
+ if !angled && p.useOptionE() {
+ capnum := -1
+ newcapnum := int(ch - '0')
+ p.moveRight(1)
+ if p.isCaptureSlot(newcapnum) {
+ capnum = newcapnum
+ lastEndPos = p.textpos()
+ }
+
+ for p.charsRight() > 0 {
+ ch = p.rightChar(0)
+ if ch < '0' || ch > '9' {
+ break
+ }
+ digit := int(ch - '0')
+ if newcapnum > maxValueDiv10 || (newcapnum == maxValueDiv10 && digit > maxValueMod10) {
+ return nil, p.getErr(ErrCaptureGroupOutOfRange)
+ }
+
+ newcapnum = newcapnum*10 + digit
+
+ p.moveRight(1)
+ if p.isCaptureSlot(newcapnum) {
+ capnum = newcapnum
+ lastEndPos = p.textpos()
+ }
+ }
+ p.textto(lastEndPos)
+ if capnum >= 0 {
+ return newRegexNodeM(ntRef, p.options, capnum), nil
+ }
+ } else {
+ capnum, err := p.scanDecimal()
+ if err != nil {
+ return nil, err
+ }
+ if !angled || p.charsRight() > 0 && p.moveRightGetChar() == '}' {
+ if p.isCaptureSlot(capnum) {
+ return newRegexNodeM(ntRef, p.options, capnum), nil
+ }
+ }
+ }
+ } else if angled && IsWordChar(ch) {
+ capname := p.scanCapname()
+
+ if p.charsRight() > 0 && p.moveRightGetChar() == '}' {
+ if p.isCaptureName(capname) {
+ return newRegexNodeM(ntRef, p.options, p.captureSlotFromName(capname)), nil
+ }
+ }
+ } else if !angled {
+ capnum := 1
+
+ switch ch {
+ case '$':
+ p.moveRight(1)
+ return newRegexNodeCh(ntOne, p.options, '$'), nil
+ case '&':
+ capnum = 0
+ case '`':
+ capnum = replaceLeftPortion
+ case '\'':
+ capnum = replaceRightPortion
+ case '+':
+ capnum = replaceLastGroup
+ case '_':
+ capnum = replaceWholeString
+ }
+
+ if capnum != 1 {
+ p.moveRight(1)
+ return newRegexNodeM(ntRef, p.options, capnum), nil
+ }
+ }
+
+ // unrecognized $: literalize
+
+ p.textto(backpos)
+ return newRegexNodeCh(ntOne, p.options, '$'), nil
+}
+
+// scanGroupOpen scans chars following a '(' (not counting the '('), and returns
+// a RegexNode for the type of group scanned, or nil if the group
+// simply changed options (?cimsx-cimsx) or was a comment (#...).
+func (p *parser) scanGroupOpen() (*regexNode, error) {
+ var ch rune
+ var nt nodeType
+ var err error
+ close := '>'
+ start := p.textpos()
+
+ // just return a RegexNode if we have:
+ // 1. "(" followed by nothing
+ // 2. "(x" where x != ?
+ // 3. "(?)"
+ if p.charsRight() == 0 || p.rightChar(0) != '?' || (p.rightChar(0) == '?' && (p.charsRight() > 1 && p.rightChar(1) == ')')) {
+ if p.useOptionN() || p.ignoreNextParen {
+ p.ignoreNextParen = false
+ return newRegexNode(ntGroup, p.options), nil
+ }
+ return newRegexNodeMN(ntCapture, p.options, p.consumeAutocap(), -1), nil
+ }
+
+ p.moveRight(1)
+
+ for {
+ if p.charsRight() == 0 {
+ break
+ }
+
+ switch ch = p.moveRightGetChar(); ch {
+ case ':':
+ nt = ntGroup
+
+ case '=':
+ p.options &= ^RightToLeft
+ nt = ntRequire
+
+ case '!':
+ p.options &= ^RightToLeft
+ nt = ntPrevent
+
+ case '>':
+ nt = ntGreedy
+
+ case '\'':
+ close = '\''
+ fallthrough
+
+ case '<':
+ if p.charsRight() == 0 {
+ goto BreakRecognize
+ }
+
+ switch ch = p.moveRightGetChar(); ch {
+ case '=':
+ if close == '\'' {
+ goto BreakRecognize
+ }
+
+ p.options |= RightToLeft
+ nt = ntRequire
+
+ case '!':
+ if close == '\'' {
+ goto BreakRecognize
+ }
+
+ p.options |= RightToLeft
+ nt = ntPrevent
+
+ default:
+ p.moveLeft()
+ capnum := -1
+ uncapnum := -1
+ proceed := false
+
+ // grab part before -
+
+ if ch >= '0' && ch <= '9' {
+ if capnum, err = p.scanDecimal(); err != nil {
+ return nil, err
+ }
+
+ if !p.isCaptureSlot(capnum) {
+ capnum = -1
+ }
+
+ // check if we have bogus characters after the number
+ if p.charsRight() > 0 && !(p.rightChar(0) == close || p.rightChar(0) == '-') {
+ return nil, p.getErr(ErrInvalidGroupName)
+ }
+ if capnum == 0 {
+ return nil, p.getErr(ErrCapNumNotZero)
+ }
+ } else if IsWordChar(ch) {
+ capname := p.scanCapname()
+
+ if p.isCaptureName(capname) {
+ capnum = p.captureSlotFromName(capname)
+ }
+
+ // check if we have bogus character after the name
+ if p.charsRight() > 0 && !(p.rightChar(0) == close || p.rightChar(0) == '-') {
+ return nil, p.getErr(ErrInvalidGroupName)
+ }
+ } else if ch == '-' {
+ proceed = true
+ } else {
+ // bad group name - starts with something other than a word character and isn't a number
+ return nil, p.getErr(ErrInvalidGroupName)
+ }
+
+ // grab part after - if any
+
+ if (capnum != -1 || proceed == true) && p.charsRight() > 0 && p.rightChar(0) == '-' {
+ p.moveRight(1)
+
+ //no more chars left, no closing char, etc
+ if p.charsRight() == 0 {
+ return nil, p.getErr(ErrInvalidGroupName)
+ }
+
+ ch = p.rightChar(0)
+ if ch >= '0' && ch <= '9' {
+ if uncapnum, err = p.scanDecimal(); err != nil {
+ return nil, err
+ }
+
+ if !p.isCaptureSlot(uncapnum) {
+ return nil, p.getErr(ErrUndefinedBackRef, uncapnum)
+ }
+
+ // check if we have bogus characters after the number
+ if p.charsRight() > 0 && p.rightChar(0) != close {
+ return nil, p.getErr(ErrInvalidGroupName)
+ }
+ } else if IsWordChar(ch) {
+ uncapname := p.scanCapname()
+
+ if !p.isCaptureName(uncapname) {
+ return nil, p.getErr(ErrUndefinedNameRef, uncapname)
+ }
+ uncapnum = p.captureSlotFromName(uncapname)
+
+ // check if we have bogus character after the name
+ if p.charsRight() > 0 && p.rightChar(0) != close {
+ return nil, p.getErr(ErrInvalidGroupName)
+ }
+ } else {
+ // bad group name - starts with something other than a word character and isn't a number
+ return nil, p.getErr(ErrInvalidGroupName)
+ }
+ }
+
+ // actually make the node
+
+ if (capnum != -1 || uncapnum != -1) && p.charsRight() > 0 && p.moveRightGetChar() == close {
+ return newRegexNodeMN(ntCapture, p.options, capnum, uncapnum), nil
+ }
+ goto BreakRecognize
+ }
+
+ case '(':
+ // alternation construct (?(...) | )
+
+ parenPos := p.textpos()
+ if p.charsRight() > 0 {
+ ch = p.rightChar(0)
+
+ // check if the alternation condition is a backref
+ if ch >= '0' && ch <= '9' {
+ var capnum int
+ if capnum, err = p.scanDecimal(); err != nil {
+ return nil, err
+ }
+ if p.charsRight() > 0 && p.moveRightGetChar() == ')' {
+ if p.isCaptureSlot(capnum) {
+ return newRegexNodeM(ntTestref, p.options, capnum), nil
+ }
+ return nil, p.getErr(ErrUndefinedReference, capnum)
+ }
+
+ return nil, p.getErr(ErrMalformedReference, capnum)
+
+ } else if IsWordChar(ch) {
+ capname := p.scanCapname()
+
+ if p.isCaptureName(capname) && p.charsRight() > 0 && p.moveRightGetChar() == ')' {
+ return newRegexNodeM(ntTestref, p.options, p.captureSlotFromName(capname)), nil
+ }
+ }
+ }
+ // not a backref
+ nt = ntTestgroup
+ p.textto(parenPos - 1) // jump to the start of the parentheses
+ p.ignoreNextParen = true // but make sure we don't try to capture the insides
+
+ charsRight := p.charsRight()
+ if charsRight >= 3 && p.rightChar(1) == '?' {
+ rightchar2 := p.rightChar(2)
+ // disallow comments in the condition
+ if rightchar2 == '#' {
+ return nil, p.getErr(ErrAlternationCantHaveComment)
+ }
+
+ // disallow named capture group (?<..>..) in the condition
+ if rightchar2 == '\'' {
+ return nil, p.getErr(ErrAlternationCantCapture)
+ }
+
+ if charsRight >= 4 && (rightchar2 == '<' && p.rightChar(3) != '!' && p.rightChar(3) != '=') {
+ return nil, p.getErr(ErrAlternationCantCapture)
+ }
+ }
+
+ case 'P':
+ if p.useRE2() {
+ // support for P<name> syntax
+ if p.charsRight() < 3 {
+ goto BreakRecognize
+ }
+
+ ch = p.moveRightGetChar()
+ if ch != '<' {
+ goto BreakRecognize
+ }
+
+ ch = p.moveRightGetChar()
+ p.moveLeft()
+
+ if IsWordChar(ch) {
+ capnum := -1
+ capname := p.scanCapname()
+
+ if p.isCaptureName(capname) {
+ capnum = p.captureSlotFromName(capname)
+ }
+
+ // check if we have bogus character after the name
+ if p.charsRight() > 0 && p.rightChar(0) != '>' {
+ return nil, p.getErr(ErrInvalidGroupName)
+ }
+
+ // actually make the node
+
+ if capnum != -1 && p.charsRight() > 0 && p.moveRightGetChar() == '>' {
+ return newRegexNodeMN(ntCapture, p.options, capnum, -1), nil
+ }
+ goto BreakRecognize
+
+ } else {
+ // bad group name - starts with something other than a word character and isn't a number
+ return nil, p.getErr(ErrInvalidGroupName)
+ }
+ }
+ // if we're not using RE2 compat mode then
+ // we just behave like normal
+ fallthrough
+
+ default:
+ p.moveLeft()
+
+ nt = ntGroup
+ // disallow options in the children of a testgroup node
+ if p.group.t != ntTestgroup {
+ p.scanOptions()
+ }
+ if p.charsRight() == 0 {
+ goto BreakRecognize
+ }
+
+ if ch = p.moveRightGetChar(); ch == ')' {
+ return nil, nil
+ }
+
+ if ch != ':' {
+ goto BreakRecognize
+ }
+
+ }
+
+ return newRegexNode(nt, p.options), nil
+ }
+
+BreakRecognize:
+
+ // break Recognize comes here
+
+ return nil, p.getErr(ErrUnrecognizedGrouping, string(p.pattern[start:p.textpos()]))
+}
+
+// scans backslash specials and basics
+func (p *parser) scanBackslash(scanOnly bool) (*regexNode, error) {
+
+ if p.charsRight() == 0 {
+ return nil, p.getErr(ErrIllegalEndEscape)
+ }
+
+ switch ch := p.rightChar(0); ch {
+ case 'b', 'B', 'A', 'G', 'Z', 'z':
+ p.moveRight(1)
+ return newRegexNode(p.typeFromCode(ch), p.options), nil
+
+ case 'w':
+ p.moveRight(1)
+ if p.useOptionE() || p.useRE2() {
+ return newRegexNodeSet(ntSet, p.options, ECMAWordClass()), nil
+ }
+ return newRegexNodeSet(ntSet, p.options, WordClass()), nil
+
+ case 'W':
+ p.moveRight(1)
+ if p.useOptionE() || p.useRE2() {
+ return newRegexNodeSet(ntSet, p.options, NotECMAWordClass()), nil
+ }
+ return newRegexNodeSet(ntSet, p.options, NotWordClass()), nil
+
+ case 's':
+ p.moveRight(1)
+ if p.useOptionE() {
+ return newRegexNodeSet(ntSet, p.options, ECMASpaceClass()), nil
+ } else if p.useRE2() {
+ return newRegexNodeSet(ntSet, p.options, RE2SpaceClass()), nil
+ }
+ return newRegexNodeSet(ntSet, p.options, SpaceClass()), nil
+
+ case 'S':
+ p.moveRight(1)
+ if p.useOptionE() {
+ return newRegexNodeSet(ntSet, p.options, NotECMASpaceClass()), nil
+ } else if p.useRE2() {
+ return newRegexNodeSet(ntSet, p.options, NotRE2SpaceClass()), nil
+ }
+ return newRegexNodeSet(ntSet, p.options, NotSpaceClass()), nil
+
+ case 'd':
+ p.moveRight(1)
+ if p.useOptionE() || p.useRE2() {
+ return newRegexNodeSet(ntSet, p.options, ECMADigitClass()), nil
+ }
+ return newRegexNodeSet(ntSet, p.options, DigitClass()), nil
+
+ case 'D':
+ p.moveRight(1)
+ if p.useOptionE() || p.useRE2() {
+ return newRegexNodeSet(ntSet, p.options, NotECMADigitClass()), nil
+ }
+ return newRegexNodeSet(ntSet, p.options, NotDigitClass()), nil
+
+ case 'p', 'P':
+ p.moveRight(1)
+ prop, err := p.parseProperty()
+ if err != nil {
+ return nil, err
+ }
+ cc := &CharSet{}
+ cc.addCategory(prop, (ch != 'p'), p.useOptionI(), p.patternRaw)
+ if p.useOptionI() {
+ cc.addLowercase()
+ }
+
+ return newRegexNodeSet(ntSet, p.options, cc), nil
+
+ default:
+ return p.scanBasicBackslash(scanOnly)
+ }
+}
+
+// Scans \-style backreferences and character escapes
+func (p *parser) scanBasicBackslash(scanOnly bool) (*regexNode, error) {
+ if p.charsRight() == 0 {
+ return nil, p.getErr(ErrIllegalEndEscape)
+ }
+ angled := false
+ k := false
+ close := '\x00'
+
+ backpos := p.textpos()
+ ch := p.rightChar(0)
+
+ // Allow \k<foo> instead of \<foo>, which is now deprecated.
+
+ // According to ECMAScript specification, \k<name> is only parsed as a named group reference if
+ // there is at least one group name in the regexp.
+ // See https://www.ecma-international.org/ecma-262/#sec-isvalidregularexpressionliteral, step 7.
+ // Note, during the first (scanOnly) run we may not have all group names scanned, but that's ok.
+ if ch == 'k' && (!p.useOptionE() || len(p.capnames) > 0) {
+ if p.charsRight() >= 2 {
+ p.moveRight(1)
+ ch = p.moveRightGetChar()
+
+ if ch == '<' || (!p.useOptionE() && ch == '\'') { // No support for \k'name' in ECMAScript
+ angled = true
+ if ch == '\'' {
+ close = '\''
+ } else {
+ close = '>'
+ }
+ }
+ }
+
+ if !angled || p.charsRight() <= 0 {
+ return nil, p.getErr(ErrMalformedNameRef)
+ }
+
+ ch = p.rightChar(0)
+ k = true
+
+ } else if !p.useOptionE() && (ch == '<' || ch == '\'') && p.charsRight() > 1 { // Note angle without \g
+ angled = true
+ if ch == '\'' {
+ close = '\''
+ } else {
+ close = '>'
+ }
+
+ p.moveRight(1)
+ ch = p.rightChar(0)
+ }
+
+ // Try to parse backreference: \<1> or \<cap>
+
+ if angled && ch >= '0' && ch <= '9' {
+ capnum, err := p.scanDecimal()
+ if err != nil {
+ return nil, err
+ }
+
+ if p.charsRight() > 0 && p.moveRightGetChar() == close {
+ if p.isCaptureSlot(capnum) {
+ return newRegexNodeM(ntRef, p.options, capnum), nil
+ }
+ return nil, p.getErr(ErrUndefinedBackRef, capnum)
+ }
+ } else if !angled && ch >= '1' && ch <= '9' { // Try to parse backreference or octal: \1
+ capnum, err := p.scanDecimal()
+ if err != nil {
+ return nil, err
+ }
+
+ if scanOnly {
+ return nil, nil
+ }
+
+ if p.isCaptureSlot(capnum) {
+ return newRegexNodeM(ntRef, p.options, capnum), nil
+ }
+ if capnum <= 9 && !p.useOptionE() {
+ return nil, p.getErr(ErrUndefinedBackRef, capnum)
+ }
+
+ } else if angled {
+ capname := p.scanCapname()
+
+ if capname != "" && p.charsRight() > 0 && p.moveRightGetChar() == close {
+
+ if scanOnly {
+ return nil, nil
+ }
+
+ if p.isCaptureName(capname) {
+ return newRegexNodeM(ntRef, p.options, p.captureSlotFromName(capname)), nil
+ }
+ return nil, p.getErr(ErrUndefinedNameRef, capname)
+ } else {
+ if k {
+ return nil, p.getErr(ErrMalformedNameRef)
+ }
+ }
+ }
+
+ // Not backreference: must be char code
+
+ p.textto(backpos)
+ ch, err := p.scanCharEscape()
+ if err != nil {
+ return nil, err
+ }
+
+ if scanOnly {
+ return nil, nil
+ }
+
+ if p.useOptionI() {
+ ch = unicode.ToLower(ch)
+ }
+
+ return newRegexNodeCh(ntOne, p.options, ch), nil
+}
+
+// Scans X for \p{X} or \P{X}
+func (p *parser) parseProperty() (string, error) {
+ if p.charsRight() < 3 {
+ return "", p.getErr(ErrIncompleteSlashP)
+ }
+ ch := p.moveRightGetChar()
+ if ch != '{' {
+ return "", p.getErr(ErrMalformedSlashP)
+ }
+
+ startpos := p.textpos()
+ for p.charsRight() > 0 {
+ ch = p.moveRightGetChar()
+ if !(IsWordChar(ch) || ch == '-') {
+ p.moveLeft()
+ break
+ }
+ }
+ capname := string(p.pattern[startpos:p.textpos()])
+
+ if p.charsRight() == 0 || p.moveRightGetChar() != '}' {
+ return "", p.getErr(ErrIncompleteSlashP)
+ }
+
+ if !isValidUnicodeCat(capname) {
+ return "", p.getErr(ErrUnknownSlashP, capname)
+ }
+
+ return capname, nil
+}
+
+// Returns ReNode type for zero-length assertions with a \ code.
+func (p *parser) typeFromCode(ch rune) nodeType {
+ switch ch {
+ case 'b':
+ if p.useOptionE() {
+ return ntECMABoundary
+ }
+ return ntBoundary
+ case 'B':
+ if p.useOptionE() {
+ return ntNonECMABoundary
+ }
+ return ntNonboundary
+ case 'A':
+ return ntBeginning
+ case 'G':
+ return ntStart
+ case 'Z':
+ return ntEndZ
+ case 'z':
+ return ntEnd
+ default:
+ return ntNothing
+ }
+}
+
+// Scans whitespace or x-mode comments.
+func (p *parser) scanBlank() error {
+ if p.useOptionX() {
+ for {
+ for p.charsRight() > 0 && isSpace(p.rightChar(0)) {
+ p.moveRight(1)
+ }
+
+ if p.charsRight() == 0 {
+ break
+ }
+
+ if p.rightChar(0) == '#' {
+ for p.charsRight() > 0 && p.rightChar(0) != '\n' {
+ p.moveRight(1)
+ }
+ } else if p.charsRight() >= 3 && p.rightChar(2) == '#' &&
+ p.rightChar(1) == '?' && p.rightChar(0) == '(' {
+ for p.charsRight() > 0 && p.rightChar(0) != ')' {
+ p.moveRight(1)
+ }
+ if p.charsRight() == 0 {
+ return p.getErr(ErrUnterminatedComment)
+ }
+ p.moveRight(1)
+ } else {
+ break
+ }
+ }
+ } else {
+ for {
+ if p.charsRight() < 3 || p.rightChar(2) != '#' ||
+ p.rightChar(1) != '?' || p.rightChar(0) != '(' {
+ return nil
+ }
+
+ for p.charsRight() > 0 && p.rightChar(0) != ')' {
+ p.moveRight(1)
+ }
+ if p.charsRight() == 0 {
+ return p.getErr(ErrUnterminatedComment)
+ }
+ p.moveRight(1)
+ }
+ }
+ return nil
+}
+
+func (p *parser) scanCapname() string {
+ startpos := p.textpos()
+
+ for p.charsRight() > 0 {
+ if !IsWordChar(p.moveRightGetChar()) {
+ p.moveLeft()
+ break
+ }
+ }
+
+ return string(p.pattern[startpos:p.textpos()])
+}
+
+//Scans contents of [] (not including []'s), and converts to a set.
+func (p *parser) scanCharSet(caseInsensitive, scanOnly bool) (*CharSet, error) {
+ ch := '\x00'
+ chPrev := '\x00'
+ inRange := false
+ firstChar := true
+ closed := false
+
+ var cc *CharSet
+ if !scanOnly {
+ cc = &CharSet{}
+ }
+
+ if p.charsRight() > 0 && p.rightChar(0) == '^' {
+ p.moveRight(1)
+ if !scanOnly {
+ cc.negate = true
+ }
+ }
+
+ for ; p.charsRight() > 0; firstChar = false {
+ fTranslatedChar := false
+ ch = p.moveRightGetChar()
+ if ch == ']' {
+ if !firstChar {
+ closed = true
+ break
+ } else if p.useOptionE() {
+ if !scanOnly {
+ cc.addRanges(NoneClass().ranges)
+ }
+ closed = true
+ break
+ }
+
+ } else if ch == '\\' && p.charsRight() > 0 {
+ switch ch = p.moveRightGetChar(); ch {
+ case 'D', 'd':
+ if !scanOnly {
+ if inRange {
+ return nil, p.getErr(ErrBadClassInCharRange, ch)
+ }
+ cc.addDigit(p.useOptionE() || p.useRE2(), ch == 'D', p.patternRaw)
+ }
+ continue
+
+ case 'S', 's':
+ if !scanOnly {
+ if inRange {
+ return nil, p.getErr(ErrBadClassInCharRange, ch)
+ }
+ cc.addSpace(p.useOptionE(), p.useRE2(), ch == 'S')
+ }
+ continue
+
+ case 'W', 'w':
+ if !scanOnly {
+ if inRange {
+ return nil, p.getErr(ErrBadClassInCharRange, ch)
+ }
+
+ cc.addWord(p.useOptionE() || p.useRE2(), ch == 'W')
+ }
+ continue
+
+ case 'p', 'P':
+ if !scanOnly {
+ if inRange {
+ return nil, p.getErr(ErrBadClassInCharRange, ch)
+ }
+ prop, err := p.parseProperty()
+ if err != nil {
+ return nil, err
+ }
+ cc.addCategory(prop, (ch != 'p'), caseInsensitive, p.patternRaw)
+ } else {
+ p.parseProperty()
+ }
+
+ continue
+
+ case '-':
+ if !scanOnly {
+ cc.addRange(ch, ch)
+ }
+ continue
+
+ default:
+ p.moveLeft()
+ var err error
+ ch, err = p.scanCharEscape() // non-literal character
+ if err != nil {
+ return nil, err
+ }
+ fTranslatedChar = true
+ break // this break will only break out of the switch
+ }
+ } else if ch == '[' {
+ // This is code for Posix style properties - [:Ll:] or [:IsTibetan:].
+ // It currently doesn't do anything other than skip the whole thing!
+ if p.charsRight() > 0 && p.rightChar(0) == ':' && !inRange {
+ savePos := p.textpos()
+
+ p.moveRight(1)
+ negate := false
+ if p.charsRight() > 1 && p.rightChar(0) == '^' {
+ negate = true
+ p.moveRight(1)
+ }
+
+ nm := p.scanCapname() // snag the name
+ if !scanOnly && p.useRE2() {
+ // look up the name since these are valid for RE2
+ // add the group based on the name
+ if ok := cc.addNamedASCII(nm, negate); !ok {
+ return nil, p.getErr(ErrInvalidCharRange)
+ }
+ }
+ if p.charsRight() < 2 || p.moveRightGetChar() != ':' || p.moveRightGetChar() != ']' {
+ p.textto(savePos)
+ } else if p.useRE2() {
+ // move on
+ continue
+ }
+ }
+ }
+
+ if inRange {
+ inRange = false
+ if !scanOnly {
+ if ch == '[' && !fTranslatedChar && !firstChar {
+ // We thought we were in a range, but we're actually starting a subtraction.
+ // In that case, we'll add chPrev to our char class, skip the opening [, and
+ // scan the new character class recursively.
+ cc.addChar(chPrev)
+ sub, err := p.scanCharSet(caseInsensitive, false)
+ if err != nil {
+ return nil, err
+ }
+ cc.addSubtraction(sub)
+
+ if p.charsRight() > 0 && p.rightChar(0) != ']' {
+ return nil, p.getErr(ErrSubtractionMustBeLast)
+ }
+ } else {
+ // a regular range, like a-z
+ if chPrev > ch {
+ return nil, p.getErr(ErrReversedCharRange, chPrev, ch)
+ }
+ cc.addRange(chPrev, ch)
+ }
+ }
+ } else if p.charsRight() >= 2 && p.rightChar(0) == '-' && p.rightChar(1) != ']' {
+ // this could be the start of a range
+ chPrev = ch
+ inRange = true
+ p.moveRight(1)
+ } else if p.charsRight() >= 1 && ch == '-' && !fTranslatedChar && p.rightChar(0) == '[' && !firstChar {
+ // we aren't in a range, and now there is a subtraction. Usually this happens
+ // only when a subtraction follows a range, like [a-z-[b]]
+ if !scanOnly {
+ p.moveRight(1)
+ sub, err := p.scanCharSet(caseInsensitive, false)
+ if err != nil {
+ return nil, err
+ }
+ cc.addSubtraction(sub)
+
+ if p.charsRight() > 0 && p.rightChar(0) != ']' {
+ return nil, p.getErr(ErrSubtractionMustBeLast)
+ }
+ } else {
+ p.moveRight(1)
+ p.scanCharSet(caseInsensitive, true)
+ }
+ } else {
+ if !scanOnly {
+ cc.addRange(ch, ch)
+ }
+ }
+ }
+
+ if !closed {
+ return nil, p.getErr(ErrUnterminatedBracket)
+ }
+
+ if !scanOnly && caseInsensitive {
+ cc.addLowercase()
+ }
+
+ return cc, nil
+}
+
+// Scans any number of decimal digits (pegs value at 2^31-1 if too large)
+func (p *parser) scanDecimal() (int, error) {
+ i := 0
+ var d int
+
+ for p.charsRight() > 0 {
+ d = int(p.rightChar(0) - '0')
+ if d < 0 || d > 9 {
+ break
+ }
+ p.moveRight(1)
+
+ if i > maxValueDiv10 || (i == maxValueDiv10 && d > maxValueMod10) {
+ return 0, p.getErr(ErrCaptureGroupOutOfRange)
+ }
+
+ i *= 10
+ i += d
+ }
+
+ return int(i), nil
+}
+
+// Returns true for options allowed only at the top level
+func isOnlyTopOption(option RegexOptions) bool {
+ return option == RightToLeft || option == ECMAScript || option == RE2
+}
+
+// Scans cimsx-cimsx option string, stops at the first unrecognized char.
+func (p *parser) scanOptions() {
+
+ for off := false; p.charsRight() > 0; p.moveRight(1) {
+ ch := p.rightChar(0)
+
+ if ch == '-' {
+ off = true
+ } else if ch == '+' {
+ off = false
+ } else {
+ option := optionFromCode(ch)
+ if option == 0 || isOnlyTopOption(option) {
+ return
+ }
+
+ if off {
+ p.options &= ^option
+ } else {
+ p.options |= option
+ }
+ }
+ }
+}
+
+// Scans \ code for escape codes that map to single unicode chars.
+func (p *parser) scanCharEscape() (r rune, err error) {
+
+ ch := p.moveRightGetChar()
+
+ if ch >= '0' && ch <= '7' {
+ p.moveLeft()
+ return p.scanOctal(), nil
+ }
+
+ pos := p.textpos()
+
+ switch ch {
+ case 'x':
+ // support for \x{HEX} syntax from Perl and PCRE
+ if p.charsRight() > 0 && p.rightChar(0) == '{' {
+ if p.useOptionE() {
+ return ch, nil
+ }
+ p.moveRight(1)
+ return p.scanHexUntilBrace()
+ } else {
+ r, err = p.scanHex(2)
+ }
+ case 'u':
+ // ECMAscript suppot \u{HEX} only if `u` is also set
+ if p.useOptionE() && p.useOptionU() && p.charsRight() > 0 && p.rightChar(0) == '{' {
+ p.moveRight(1)
+ return p.scanHexUntilBrace()
+ } else {
+ r, err = p.scanHex(4)
+ }
+ case 'a':
+ return '\u0007', nil
+ case 'b':
+ return '\b', nil
+ case 'e':
+ return '\u001B', nil
+ case 'f':
+ return '\f', nil
+ case 'n':
+ return '\n', nil
+ case 'r':
+ return '\r', nil
+ case 't':
+ return '\t', nil
+ case 'v':
+ return '\u000B', nil
+ case 'c':
+ r, err = p.scanControl()
+ default:
+ if !p.useOptionE() && !p.useRE2() && IsWordChar(ch) {
+ return 0, p.getErr(ErrUnrecognizedEscape, string(ch))
+ }
+ return ch, nil
+ }
+ if err != nil && p.useOptionE() {
+ p.textto(pos)
+ return ch, nil
+ }
+ return
+}
+
+// Grabs and converts an ascii control character
+func (p *parser) scanControl() (rune, error) {
+ if p.charsRight() <= 0 {
+ return 0, p.getErr(ErrMissingControl)
+ }
+
+ ch := p.moveRightGetChar()
+
+ // \ca interpreted as \cA
+
+ if ch >= 'a' && ch <= 'z' {
+ ch = (ch - ('a' - 'A'))
+ }
+ ch = (ch - '@')
+ if ch >= 0 && ch < ' ' {
+ return ch, nil
+ }
+
+ return 0, p.getErr(ErrUnrecognizedControl)
+
+}
+
+// Scan hex digits until we hit a closing brace.
+// Non-hex digits, hex value too large for UTF-8, or running out of chars are errors
+func (p *parser) scanHexUntilBrace() (rune, error) {
+ // PCRE spec reads like unlimited hex digits are allowed, but unicode has a limit
+ // so we can enforce that
+ i := 0
+ hasContent := false
+
+ for p.charsRight() > 0 {
+ ch := p.moveRightGetChar()
+ if ch == '}' {
+ // hit our close brace, we're done here
+ // prevent \x{}
+ if !hasContent {
+ return 0, p.getErr(ErrTooFewHex)
+ }
+ return rune(i), nil
+ }
+ hasContent = true
+ // no brace needs to be hex digit
+ d := hexDigit(ch)
+ if d < 0 {
+ return 0, p.getErr(ErrMissingBrace)
+ }
+
+ i *= 0x10
+ i += d
+
+ if i > unicode.MaxRune {
+ return 0, p.getErr(ErrInvalidHex)
+ }
+ }
+
+ // we only make it here if we run out of digits without finding the brace
+ return 0, p.getErr(ErrMissingBrace)
+}
+
+// Scans exactly c hex digits (c=2 for \xFF, c=4 for \uFFFF)
+func (p *parser) scanHex(c int) (rune, error) {
+
+ i := 0
+
+ if p.charsRight() >= c {
+ for c > 0 {
+ d := hexDigit(p.moveRightGetChar())
+ if d < 0 {
+ break
+ }
+ i *= 0x10
+ i += d
+ c--
+ }
+ }
+
+ if c > 0 {
+ return 0, p.getErr(ErrTooFewHex)
+ }
+
+ return rune(i), nil
+}
+
+// Returns n <= 0xF for a hex digit.
+func hexDigit(ch rune) int {
+
+ if d := uint(ch - '0'); d <= 9 {
+ return int(d)
+ }
+
+ if d := uint(ch - 'a'); d <= 5 {
+ return int(d + 0xa)
+ }
+
+ if d := uint(ch - 'A'); d <= 5 {
+ return int(d + 0xa)
+ }
+
+ return -1
+}
+
+// Scans up to three octal digits (stops before exceeding 0377).
+func (p *parser) scanOctal() rune {
+ // Consume octal chars only up to 3 digits and value 0377
+
+ c := 3
+
+ if c > p.charsRight() {
+ c = p.charsRight()
+ }
+
+ //we know the first char is good because the caller had to check
+ i := 0
+ d := int(p.rightChar(0) - '0')
+ for c > 0 && d <= 7 && d >= 0 {
+ if i >= 0x20 && p.useOptionE() {
+ break
+ }
+ i *= 8
+ i += d
+ c--
+
+ p.moveRight(1)
+ if !p.rightMost() {
+ d = int(p.rightChar(0) - '0')
+ }
+ }
+
+ // Octal codes only go up to 255. Any larger and the behavior that Perl follows
+ // is simply to truncate the high bits.
+ i &= 0xFF
+
+ return rune(i)
+}
+
+// Returns the current parsing position.
+func (p *parser) textpos() int {
+ return p.currentPos
+}
+
+// Zaps to a specific parsing position.
+func (p *parser) textto(pos int) {
+ p.currentPos = pos
+}
+
+// Returns the char at the right of the current parsing position and advances to the right.
+func (p *parser) moveRightGetChar() rune {
+ ch := p.pattern[p.currentPos]
+ p.currentPos++
+ return ch
+}
+
+// Moves the current position to the right.
+func (p *parser) moveRight(i int) {
+ // default would be 1
+ p.currentPos += i
+}
+
+// Moves the current parsing position one to the left.
+func (p *parser) moveLeft() {
+ p.currentPos--
+}
+
+// Returns the char left of the current parsing position.
+func (p *parser) charAt(i int) rune {
+ return p.pattern[i]
+}
+
+// Returns the char i chars right of the current parsing position.
+func (p *parser) rightChar(i int) rune {
+ // default would be 0
+ return p.pattern[p.currentPos+i]
+}
+
+// Number of characters to the right of the current parsing position.
+func (p *parser) charsRight() int {
+ return len(p.pattern) - p.currentPos
+}
+
+func (p *parser) rightMost() bool {
+ return p.currentPos == len(p.pattern)
+}
+
+// Looks up the slot number for a given name
+func (p *parser) captureSlotFromName(capname string) int {
+ return p.capnames[capname]
+}
+
+// True if the capture slot was noted
+func (p *parser) isCaptureSlot(i int) bool {
+ if p.caps != nil {
+ _, ok := p.caps[i]
+ return ok
+ }
+
+ return (i >= 0 && i < p.capsize)
+}
+
+// Looks up the slot number for a given name
+func (p *parser) isCaptureName(capname string) bool {
+ if p.capnames == nil {
+ return false
+ }
+
+ _, ok := p.capnames[capname]
+ return ok
+}
+
+// option shortcuts
+
+// True if N option disabling '(' autocapture is on.
+func (p *parser) useOptionN() bool {
+ return (p.options & ExplicitCapture) != 0
+}
+
+// True if I option enabling case-insensitivity is on.
+func (p *parser) useOptionI() bool {
+ return (p.options & IgnoreCase) != 0
+}
+
+// True if M option altering meaning of $ and ^ is on.
+func (p *parser) useOptionM() bool {
+ return (p.options & Multiline) != 0
+}
+
+// True if S option altering meaning of . is on.
+func (p *parser) useOptionS() bool {
+ return (p.options & Singleline) != 0
+}
+
+// True if X option enabling whitespace/comment mode is on.
+func (p *parser) useOptionX() bool {
+ return (p.options & IgnorePatternWhitespace) != 0
+}
+
+// True if E option enabling ECMAScript behavior on.
+func (p *parser) useOptionE() bool {
+ return (p.options & ECMAScript) != 0
+}
+
+// true to use RE2 compatibility parsing behavior.
+func (p *parser) useRE2() bool {
+ return (p.options & RE2) != 0
+}
+
+// True if U option enabling ECMAScript's Unicode behavior on.
+func (p *parser) useOptionU() bool {
+ return (p.options & Unicode) != 0
+}
+
+// True if options stack is empty.
+func (p *parser) emptyOptionsStack() bool {
+ return len(p.optionsStack) == 0
+}
+
+// Finish the current quantifiable (when a quantifier is not found or is not possible)
+func (p *parser) addConcatenate() {
+ // The first (| inside a Testgroup group goes directly to the group
+ p.concatenation.addChild(p.unit)
+ p.unit = nil
+}
+
+// Finish the current quantifiable (when a quantifier is found)
+func (p *parser) addConcatenate3(lazy bool, min, max int) {
+ p.concatenation.addChild(p.unit.makeQuantifier(lazy, min, max))
+ p.unit = nil
+}
+
+// Sets the current unit to a single char node
+func (p *parser) addUnitOne(ch rune) {
+ if p.useOptionI() {
+ ch = unicode.ToLower(ch)
+ }
+
+ p.unit = newRegexNodeCh(ntOne, p.options, ch)
+}
+
+// Sets the current unit to a single inverse-char node
+func (p *parser) addUnitNotone(ch rune) {
+ if p.useOptionI() {
+ ch = unicode.ToLower(ch)
+ }
+
+ p.unit = newRegexNodeCh(ntNotone, p.options, ch)
+}
+
+// Sets the current unit to a single set node
+func (p *parser) addUnitSet(set *CharSet) {
+ p.unit = newRegexNodeSet(ntSet, p.options, set)
+}
+
+// Sets the current unit to a subtree
+func (p *parser) addUnitNode(node *regexNode) {
+ p.unit = node
+}
+
+// Sets the current unit to an assertion of the specified type
+func (p *parser) addUnitType(t nodeType) {
+ p.unit = newRegexNode(t, p.options)
+}
+
+// Finish the current group (in response to a ')' or end)
+func (p *parser) addGroup() error {
+ if p.group.t == ntTestgroup || p.group.t == ntTestref {
+ p.group.addChild(p.concatenation.reverseLeft())
+ if (p.group.t == ntTestref && len(p.group.children) > 2) || len(p.group.children) > 3 {
+ return p.getErr(ErrTooManyAlternates)
+ }
+ } else {
+ p.alternation.addChild(p.concatenation.reverseLeft())
+ p.group.addChild(p.alternation)
+ }
+
+ p.unit = p.group
+ return nil
+}
+
+// Pops the option stack, but keeps the current options unchanged.
+func (p *parser) popKeepOptions() {
+ lastIdx := len(p.optionsStack) - 1
+ p.optionsStack = p.optionsStack[:lastIdx]
+}
+
+// Recalls options from the stack.
+func (p *parser) popOptions() {
+ lastIdx := len(p.optionsStack) - 1
+ // get the last item on the stack and then remove it by reslicing
+ p.options = p.optionsStack[lastIdx]
+ p.optionsStack = p.optionsStack[:lastIdx]
+}
+
+// Saves options on a stack.
+func (p *parser) pushOptions() {
+ p.optionsStack = append(p.optionsStack, p.options)
+}
+
+// Add a string to the last concatenate.
+func (p *parser) addToConcatenate(pos, cch int, isReplacement bool) {
+ var node *regexNode
+
+ if cch == 0 {
+ return
+ }
+
+ if cch > 1 {
+ str := make([]rune, cch)
+ copy(str, p.pattern[pos:pos+cch])
+
+ if p.useOptionI() && !isReplacement {
+ // We do the ToLower character by character for consistency. With surrogate chars, doing
+ // a ToLower on the entire string could actually change the surrogate pair. This is more correct
+ // linguistically, but since Regex doesn't support surrogates, it's more important to be
+ // consistent.
+ for i := 0; i < len(str); i++ {
+ str[i] = unicode.ToLower(str[i])
+ }
+ }
+
+ node = newRegexNodeStr(ntMulti, p.options, str)
+ } else {
+ ch := p.charAt(pos)
+
+ if p.useOptionI() && !isReplacement {
+ ch = unicode.ToLower(ch)
+ }
+
+ node = newRegexNodeCh(ntOne, p.options, ch)
+ }
+
+ p.concatenation.addChild(node)
+}
+
+// Push the parser state (in response to an open paren)
+func (p *parser) pushGroup() {
+ p.group.next = p.stack
+ p.alternation.next = p.group
+ p.concatenation.next = p.alternation
+ p.stack = p.concatenation
+}
+
+// Remember the pushed state (in response to a ')')
+func (p *parser) popGroup() error {
+ p.concatenation = p.stack
+ p.alternation = p.concatenation.next
+ p.group = p.alternation.next
+ p.stack = p.group.next
+
+ // The first () inside a Testgroup group goes directly to the group
+ if p.group.t == ntTestgroup && len(p.group.children) == 0 {
+ if p.unit == nil {
+ return p.getErr(ErrConditionalExpression)
+ }
+
+ p.group.addChild(p.unit)
+ p.unit = nil
+ }
+ return nil
+}
+
+// True if the group stack is empty.
+func (p *parser) emptyStack() bool {
+ return p.stack == nil
+}
+
+// Start a new round for the parser state (in response to an open paren or string start)
+func (p *parser) startGroup(openGroup *regexNode) {
+ p.group = openGroup
+ p.alternation = newRegexNode(ntAlternate, p.options)
+ p.concatenation = newRegexNode(ntConcatenate, p.options)
+}
+
+// Finish the current concatenation (in response to a |)
+func (p *parser) addAlternate() {
+ // The | parts inside a Testgroup group go directly to the group
+
+ if p.group.t == ntTestgroup || p.group.t == ntTestref {
+ p.group.addChild(p.concatenation.reverseLeft())
+ } else {
+ p.alternation.addChild(p.concatenation.reverseLeft())
+ }
+
+ p.concatenation = newRegexNode(ntConcatenate, p.options)
+}
+
+// For categorizing ascii characters.
+
+const (
+ Q byte = 5 // quantifier
+ S = 4 // ordinary stopper
+ Z = 3 // ScanBlank stopper
+ X = 2 // whitespace
+ E = 1 // should be escaped
+)
+
+var _category = []byte{
+ //01 2 3 4 5 6 7 8 9 A B C D E F 0 1 2 3 4 5 6 7 8 9 A B C D E F
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, X, X, X, X, X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ // ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 5 6 7 8 9 : ; < = > ?
+ X, 0, 0, Z, S, 0, 0, 0, S, S, Q, Q, 0, 0, S, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, Q,
+ //@A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^ _
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, S, S, 0, S, 0,
+ //'a b c d e f g h i j k l m n o p q r s t u v w x y z { | } ~
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, Q, S, 0, 0, 0,
+}
+
+func isSpace(ch rune) bool {
+ return (ch <= ' ' && _category[ch] == X)
+}
+
+// Returns true for those characters that terminate a string of ordinary chars.
+func isSpecial(ch rune) bool {
+ return (ch <= '|' && _category[ch] >= S)
+}
+
+// Returns true for those characters that terminate a string of ordinary chars.
+func isStopperX(ch rune) bool {
+ return (ch <= '|' && _category[ch] >= X)
+}
+
+// Returns true for those characters that begin a quantifier.
+func isQuantifier(ch rune) bool {
+ return (ch <= '{' && _category[ch] >= Q)
+}
+
+func (p *parser) isTrueQuantifier() bool {
+ nChars := p.charsRight()
+ if nChars == 0 {
+ return false
+ }
+
+ startpos := p.textpos()
+ ch := p.charAt(startpos)
+ if ch != '{' {
+ return ch <= '{' && _category[ch] >= Q
+ }
+
+ //UGLY: this is ugly -- the original code was ugly too
+ pos := startpos
+ for {
+ nChars--
+ if nChars <= 0 {
+ break
+ }
+ pos++
+ ch = p.charAt(pos)
+ if ch < '0' || ch > '9' {
+ break
+ }
+ }
+
+ if nChars == 0 || pos-startpos == 1 {
+ return false
+ }
+ if ch == '}' {
+ return true
+ }
+ if ch != ',' {
+ return false
+ }
+ for {
+ nChars--
+ if nChars <= 0 {
+ break
+ }
+ pos++
+ ch = p.charAt(pos)
+ if ch < '0' || ch > '9' {
+ break
+ }
+ }
+
+ return nChars > 0 && ch == '}'
+}
diff --git a/vendor/github.com/dlclark/regexp2/syntax/prefix.go b/vendor/github.com/dlclark/regexp2/syntax/prefix.go
new file mode 100644
index 0000000..f671688
--- /dev/null
+++ b/vendor/github.com/dlclark/regexp2/syntax/prefix.go
@@ -0,0 +1,896 @@
+package syntax
+
+import (
+ "bytes"
+ "fmt"
+ "strconv"
+ "unicode"
+ "unicode/utf8"
+)
+
+type Prefix struct {
+ PrefixStr []rune
+ PrefixSet CharSet
+ CaseInsensitive bool
+}
+
+// It takes a RegexTree and computes the set of chars that can start it.
+func getFirstCharsPrefix(tree *RegexTree) *Prefix {
+ s := regexFcd{
+ fcStack: make([]regexFc, 32),
+ intStack: make([]int, 32),
+ }
+ fc := s.regexFCFromRegexTree(tree)
+
+ if fc == nil || fc.nullable || fc.cc.IsEmpty() {
+ return nil
+ }
+ fcSet := fc.getFirstChars()
+ return &Prefix{PrefixSet: fcSet, CaseInsensitive: fc.caseInsensitive}
+}
+
+type regexFcd struct {
+ intStack []int
+ intDepth int
+ fcStack []regexFc
+ fcDepth int
+ skipAllChildren bool // don't process any more children at the current level
+ skipchild bool // don't process the current child.
+ failed bool
+}
+
+/*
+ * The main FC computation. It does a shortcutted depth-first walk
+ * through the tree and calls CalculateFC to emits code before
+ * and after each child of an interior node, and at each leaf.
+ */
+func (s *regexFcd) regexFCFromRegexTree(tree *RegexTree) *regexFc {
+ curNode := tree.root
+ curChild := 0
+
+ for {
+ if len(curNode.children) == 0 {
+ // This is a leaf node
+ s.calculateFC(curNode.t, curNode, 0)
+ } else if curChild < len(curNode.children) && !s.skipAllChildren {
+ // This is an interior node, and we have more children to analyze
+ s.calculateFC(curNode.t|beforeChild, curNode, curChild)
+
+ if !s.skipchild {
+ curNode = curNode.children[curChild]
+ // this stack is how we get a depth first walk of the tree.
+ s.pushInt(curChild)
+ curChild = 0
+ } else {
+ curChild++
+ s.skipchild = false
+ }
+ continue
+ }
+
+ // This is an interior node where we've finished analyzing all the children, or
+ // the end of a leaf node.
+ s.skipAllChildren = false
+
+ if s.intIsEmpty() {
+ break
+ }
+
+ curChild = s.popInt()
+ curNode = curNode.next
+
+ s.calculateFC(curNode.t|afterChild, curNode, curChild)
+ if s.failed {
+ return nil
+ }
+
+ curChild++
+ }
+
+ if s.fcIsEmpty() {
+ return nil
+ }
+
+ return s.popFC()
+}
+
+// To avoid recursion, we use a simple integer stack.
+// This is the push.
+func (s *regexFcd) pushInt(I int) {
+ if s.intDepth >= len(s.intStack) {
+ expanded := make([]int, s.intDepth*2)
+ copy(expanded, s.intStack)
+ s.intStack = expanded
+ }
+
+ s.intStack[s.intDepth] = I
+ s.intDepth++
+}
+
+// True if the stack is empty.
+func (s *regexFcd) intIsEmpty() bool {
+ return s.intDepth == 0
+}
+
+// This is the pop.
+func (s *regexFcd) popInt() int {
+ s.intDepth--
+ return s.intStack[s.intDepth]
+}
+
+// We also use a stack of RegexFC objects.
+// This is the push.
+func (s *regexFcd) pushFC(fc regexFc) {
+ if s.fcDepth >= len(s.fcStack) {
+ expanded := make([]regexFc, s.fcDepth*2)
+ copy(expanded, s.fcStack)
+ s.fcStack = expanded
+ }
+
+ s.fcStack[s.fcDepth] = fc
+ s.fcDepth++
+}
+
+// True if the stack is empty.
+func (s *regexFcd) fcIsEmpty() bool {
+ return s.fcDepth == 0
+}
+
+// This is the pop.
+func (s *regexFcd) popFC() *regexFc {
+ s.fcDepth--
+ return &s.fcStack[s.fcDepth]
+}
+
+// This is the top.
+func (s *regexFcd) topFC() *regexFc {
+ return &s.fcStack[s.fcDepth-1]
+}
+
+// Called in Beforechild to prevent further processing of the current child
+func (s *regexFcd) skipChild() {
+ s.skipchild = true
+}
+
+// FC computation and shortcut cases for each node type
+func (s *regexFcd) calculateFC(nt nodeType, node *regexNode, CurIndex int) {
+ //fmt.Printf("NodeType: %v, CurIndex: %v, Desc: %v\n", nt, CurIndex, node.description())
+ ci := false
+ rtl := false
+
+ if nt <= ntRef {
+ if (node.options & IgnoreCase) != 0 {
+ ci = true
+ }
+ if (node.options & RightToLeft) != 0 {
+ rtl = true
+ }
+ }
+
+ switch nt {
+ case ntConcatenate | beforeChild, ntAlternate | beforeChild, ntTestref | beforeChild, ntLoop | beforeChild, ntLazyloop | beforeChild:
+ break
+
+ case ntTestgroup | beforeChild:
+ if CurIndex == 0 {
+ s.skipChild()
+ }
+ break
+
+ case ntEmpty:
+ s.pushFC(regexFc{nullable: true})
+ break
+
+ case ntConcatenate | afterChild:
+ if CurIndex != 0 {
+ child := s.popFC()
+ cumul := s.topFC()
+
+ s.failed = !cumul.addFC(*child, true)
+ }
+
+ fc := s.topFC()
+ if !fc.nullable {
+ s.skipAllChildren = true
+ }
+ break
+
+ case ntTestgroup | afterChild:
+ if CurIndex > 1 {
+ child := s.popFC()
+ cumul := s.topFC()
+
+ s.failed = !cumul.addFC(*child, false)
+ }
+ break
+
+ case ntAlternate | afterChild, ntTestref | afterChild:
+ if CurIndex != 0 {
+ child := s.popFC()
+ cumul := s.topFC()
+
+ s.failed = !cumul.addFC(*child, false)
+ }
+ break
+
+ case ntLoop | afterChild, ntLazyloop | afterChild:
+ if node.m == 0 {
+ fc := s.topFC()
+ fc.nullable = true
+ }
+ break
+
+ case ntGroup | beforeChild, ntGroup | afterChild, ntCapture | beforeChild, ntCapture | afterChild, ntGreedy | beforeChild, ntGreedy | afterChild:
+ break
+
+ case ntRequire | beforeChild, ntPrevent | beforeChild:
+ s.skipChild()
+ s.pushFC(regexFc{nullable: true})
+ break
+
+ case ntRequire | afterChild, ntPrevent | afterChild:
+ break
+
+ case ntOne, ntNotone:
+ s.pushFC(newRegexFc(node.ch, nt == ntNotone, false, ci))
+ break
+
+ case ntOneloop, ntOnelazy:
+ s.pushFC(newRegexFc(node.ch, false, node.m == 0, ci))
+ break
+
+ case ntNotoneloop, ntNotonelazy:
+ s.pushFC(newRegexFc(node.ch, true, node.m == 0, ci))
+ break
+
+ case ntMulti:
+ if len(node.str) == 0 {
+ s.pushFC(regexFc{nullable: true})
+ } else if !rtl {
+ s.pushFC(newRegexFc(node.str[0], false, false, ci))
+ } else {
+ s.pushFC(newRegexFc(node.str[len(node.str)-1], false, false, ci))
+ }
+ break
+
+ case ntSet:
+ s.pushFC(regexFc{cc: node.set.Copy(), nullable: false, caseInsensitive: ci})
+ break
+
+ case ntSetloop, ntSetlazy:
+ s.pushFC(regexFc{cc: node.set.Copy(), nullable: node.m == 0, caseInsensitive: ci})
+ break
+
+ case ntRef:
+ s.pushFC(regexFc{cc: *AnyClass(), nullable: true, caseInsensitive: false})
+ break
+
+ case ntNothing, ntBol, ntEol, ntBoundary, ntNonboundary, ntECMABoundary, ntNonECMABoundary, ntBeginning, ntStart, ntEndZ, ntEnd:
+ s.pushFC(regexFc{nullable: true})
+ break
+
+ default:
+ panic(fmt.Sprintf("unexpected op code: %v", nt))
+ }
+}
+
+type regexFc struct {
+ cc CharSet
+ nullable bool
+ caseInsensitive bool
+}
+
+func newRegexFc(ch rune, not, nullable, caseInsensitive bool) regexFc {
+ r := regexFc{
+ caseInsensitive: caseInsensitive,
+ nullable: nullable,
+ }
+ if not {
+ if ch > 0 {
+ r.cc.addRange('\x00', ch-1)
+ }
+ if ch < 0xFFFF {
+ r.cc.addRange(ch+1, utf8.MaxRune)
+ }
+ } else {
+ r.cc.addRange(ch, ch)
+ }
+ return r
+}
+
+func (r *regexFc) getFirstChars() CharSet {
+ if r.caseInsensitive {
+ r.cc.addLowercase()
+ }
+
+ return r.cc
+}
+
+func (r *regexFc) addFC(fc regexFc, concatenate bool) bool {
+ if !r.cc.IsMergeable() || !fc.cc.IsMergeable() {
+ return false
+ }
+
+ if concatenate {
+ if !r.nullable {
+ return true
+ }
+
+ if !fc.nullable {
+ r.nullable = false
+ }
+ } else {
+ if fc.nullable {
+ r.nullable = true
+ }
+ }
+
+ r.caseInsensitive = r.caseInsensitive || fc.caseInsensitive
+ r.cc.addSet(fc.cc)
+
+ return true
+}
+
+// This is a related computation: it takes a RegexTree and computes the
+// leading substring if it sees one. It's quite trivial and gives up easily.
+func getPrefix(tree *RegexTree) *Prefix {
+ var concatNode *regexNode
+ nextChild := 0
+
+ curNode := tree.root
+
+ for {
+ switch curNode.t {
+ case ntConcatenate:
+ if len(curNode.children) > 0 {
+ concatNode = curNode
+ nextChild = 0
+ }
+
+ case ntGreedy, ntCapture:
+ curNode = curNode.children[0]
+ concatNode = nil
+ continue
+
+ case ntOneloop, ntOnelazy:
+ if curNode.m > 0 {
+ return &Prefix{
+ PrefixStr: repeat(curNode.ch, curNode.m),
+ CaseInsensitive: (curNode.options & IgnoreCase) != 0,
+ }
+ }
+ return nil
+
+ case ntOne:
+ return &Prefix{
+ PrefixStr: []rune{curNode.ch},
+ CaseInsensitive: (curNode.options & IgnoreCase) != 0,
+ }
+
+ case ntMulti:
+ return &Prefix{
+ PrefixStr: curNode.str,
+ CaseInsensitive: (curNode.options & IgnoreCase) != 0,
+ }
+
+ case ntBol, ntEol, ntBoundary, ntECMABoundary, ntBeginning, ntStart,
+ ntEndZ, ntEnd, ntEmpty, ntRequire, ntPrevent:
+
+ default:
+ return nil
+ }
+
+ if concatNode == nil || nextChild >= len(concatNode.children) {
+ return nil
+ }
+
+ curNode = concatNode.children[nextChild]
+ nextChild++
+ }
+}
+
+// repeat the rune r, c times... up to the max of MaxPrefixSize
+func repeat(r rune, c int) []rune {
+ if c > MaxPrefixSize {
+ c = MaxPrefixSize
+ }
+
+ ret := make([]rune, c)
+
+ // binary growth using copy for speed
+ ret[0] = r
+ bp := 1
+ for bp < len(ret) {
+ copy(ret[bp:], ret[:bp])
+ bp *= 2
+ }
+
+ return ret
+}
+
+// BmPrefix precomputes the Boyer-Moore
+// tables for fast string scanning. These tables allow
+// you to scan for the first occurrence of a string within
+// a large body of text without examining every character.
+// The performance of the heuristic depends on the actual
+// string and the text being searched, but usually, the longer
+// the string that is being searched for, the fewer characters
+// need to be examined.
+type BmPrefix struct {
+ positive []int
+ negativeASCII []int
+ negativeUnicode [][]int
+ pattern []rune
+ lowASCII rune
+ highASCII rune
+ rightToLeft bool
+ caseInsensitive bool
+}
+
+func newBmPrefix(pattern []rune, caseInsensitive, rightToLeft bool) *BmPrefix {
+
+ b := &BmPrefix{
+ rightToLeft: rightToLeft,
+ caseInsensitive: caseInsensitive,
+ pattern: pattern,
+ }
+
+ if caseInsensitive {
+ for i := 0; i < len(b.pattern); i++ {
+ // We do the ToLower character by character for consistency. With surrogate chars, doing
+ // a ToLower on the entire string could actually change the surrogate pair. This is more correct
+ // linguistically, but since Regex doesn't support surrogates, it's more important to be
+ // consistent.
+
+ b.pattern[i] = unicode.ToLower(b.pattern[i])
+ }
+ }
+
+ var beforefirst, last, bump int
+ var scan, match int
+
+ if !rightToLeft {
+ beforefirst = -1
+ last = len(b.pattern) - 1
+ bump = 1
+ } else {
+ beforefirst = len(b.pattern)
+ last = 0
+ bump = -1
+ }
+
+ // PART I - the good-suffix shift table
+ //
+ // compute the positive requirement:
+ // if char "i" is the first one from the right that doesn't match,
+ // then we know the matcher can advance by _positive[i].
+ //
+ // This algorithm is a simplified variant of the standard
+ // Boyer-Moore good suffix calculation.
+
+ b.positive = make([]int, len(b.pattern))
+
+ examine := last
+ ch := b.pattern[examine]
+ b.positive[examine] = bump
+ examine -= bump
+
+Outerloop:
+ for {
+ // find an internal char (examine) that matches the tail
+
+ for {
+ if examine == beforefirst {
+ break Outerloop
+ }
+ if b.pattern[examine] == ch {
+ break
+ }
+ examine -= bump
+ }
+
+ match = last
+ scan = examine
+
+ // find the length of the match
+ for {
+ if scan == beforefirst || b.pattern[match] != b.pattern[scan] {
+ // at the end of the match, note the difference in _positive
+ // this is not the length of the match, but the distance from the internal match
+ // to the tail suffix.
+ if b.positive[match] == 0 {
+ b.positive[match] = match - scan
+ }
+
+ // System.Diagnostics.Debug.WriteLine("Set positive[" + match + "] to " + (match - scan));
+
+ break
+ }
+
+ scan -= bump
+ match -= bump
+ }
+
+ examine -= bump
+ }
+
+ match = last - bump
+
+ // scan for the chars for which there are no shifts that yield a different candidate
+
+ // The inside of the if statement used to say
+ // "_positive[match] = last - beforefirst;"
+ // This is slightly less aggressive in how much we skip, but at worst it
+ // should mean a little more work rather than skipping a potential match.
+ for match != beforefirst {
+ if b.positive[match] == 0 {
+ b.positive[match] = bump
+ }
+
+ match -= bump
+ }
+
+ // PART II - the bad-character shift table
+ //
+ // compute the negative requirement:
+ // if char "ch" is the reject character when testing position "i",
+ // we can slide up by _negative[ch];
+ // (_negative[ch] = str.Length - 1 - str.LastIndexOf(ch))
+ //
+ // the lookup table is divided into ASCII and Unicode portions;
+ // only those parts of the Unicode 16-bit code set that actually
+ // appear in the string are in the table. (Maximum size with
+ // Unicode is 65K; ASCII only case is 512 bytes.)
+
+ b.negativeASCII = make([]int, 128)
+
+ for i := 0; i < len(b.negativeASCII); i++ {
+ b.negativeASCII[i] = last - beforefirst
+ }
+
+ b.lowASCII = 127
+ b.highASCII = 0
+
+ for examine = last; examine != beforefirst; examine -= bump {
+ ch = b.pattern[examine]
+
+ switch {
+ case ch < 128:
+ if b.lowASCII > ch {
+ b.lowASCII = ch
+ }
+
+ if b.highASCII < ch {
+ b.highASCII = ch
+ }
+
+ if b.negativeASCII[ch] == last-beforefirst {
+ b.negativeASCII[ch] = last - examine
+ }
+ case ch <= 0xffff:
+ i, j := ch>>8, ch&0xFF
+
+ if b.negativeUnicode == nil {
+ b.negativeUnicode = make([][]int, 256)
+ }
+
+ if b.negativeUnicode[i] == nil {
+ newarray := make([]int, 256)
+
+ for k := 0; k < len(newarray); k++ {
+ newarray[k] = last - beforefirst
+ }
+
+ if i == 0 {
+ copy(newarray, b.negativeASCII)
+ //TODO: this line needed?
+ b.negativeASCII = newarray
+ }
+
+ b.negativeUnicode[i] = newarray
+ }
+
+ if b.negativeUnicode[i][j] == last-beforefirst {
+ b.negativeUnicode[i][j] = last - examine
+ }
+ default:
+ // we can't do the filter because this algo doesn't support
+ // unicode chars >0xffff
+ return nil
+ }
+ }
+
+ return b
+}
+
+func (b *BmPrefix) String() string {
+ return string(b.pattern)
+}
+
+// Dump returns the contents of the filter as a human readable string
+func (b *BmPrefix) Dump(indent string) string {
+ buf := &bytes.Buffer{}
+
+ fmt.Fprintf(buf, "%sBM Pattern: %s\n%sPositive: ", indent, string(b.pattern), indent)
+ for i := 0; i < len(b.positive); i++ {
+ buf.WriteString(strconv.Itoa(b.positive[i]))
+ buf.WriteRune(' ')
+ }
+ buf.WriteRune('\n')
+
+ if b.negativeASCII != nil {
+ buf.WriteString(indent)
+ buf.WriteString("Negative table\n")
+ for i := 0; i < len(b.negativeASCII); i++ {
+ if b.negativeASCII[i] != len(b.pattern) {
+ fmt.Fprintf(buf, "%s %s %s\n", indent, Escape(string(rune(i))), strconv.Itoa(b.negativeASCII[i]))
+ }
+ }
+ }
+
+ return buf.String()
+}
+
+// Scan uses the Boyer-Moore algorithm to find the first occurrence
+// of the specified string within text, beginning at index, and
+// constrained within beglimit and endlimit.
+//
+// The direction and case-sensitivity of the match is determined
+// by the arguments to the RegexBoyerMoore constructor.
+func (b *BmPrefix) Scan(text []rune, index, beglimit, endlimit int) int {
+ var (
+ defadv, test, test2 int
+ match, startmatch, endmatch int
+ bump, advance int
+ chTest rune
+ unicodeLookup []int
+ )
+
+ if !b.rightToLeft {
+ defadv = len(b.pattern)
+ startmatch = len(b.pattern) - 1
+ endmatch = 0
+ test = index + defadv - 1
+ bump = 1
+ } else {
+ defadv = -len(b.pattern)
+ startmatch = 0
+ endmatch = -defadv - 1
+ test = index + defadv
+ bump = -1
+ }
+
+ chMatch := b.pattern[startmatch]
+
+ for {
+ if test >= endlimit || test < beglimit {
+ return -1
+ }
+
+ chTest = text[test]
+
+ if b.caseInsensitive {
+ chTest = unicode.ToLower(chTest)
+ }
+
+ if chTest != chMatch {
+ if chTest < 128 {
+ advance = b.negativeASCII[chTest]
+ } else if chTest < 0xffff && len(b.negativeUnicode) > 0 {
+ unicodeLookup = b.negativeUnicode[chTest>>8]
+ if len(unicodeLookup) > 0 {
+ advance = unicodeLookup[chTest&0xFF]
+ } else {
+ advance = defadv
+ }
+ } else {
+ advance = defadv
+ }
+
+ test += advance
+ } else { // if (chTest == chMatch)
+ test2 = test
+ match = startmatch
+
+ for {
+ if match == endmatch {
+ if b.rightToLeft {
+ return test2 + 1
+ } else {
+ return test2
+ }
+ }
+
+ match -= bump
+ test2 -= bump
+
+ chTest = text[test2]
+
+ if b.caseInsensitive {
+ chTest = unicode.ToLower(chTest)
+ }
+
+ if chTest != b.pattern[match] {
+ advance = b.positive[match]
+ if chTest < 128 {
+ test2 = (match - startmatch) + b.negativeASCII[chTest]
+ } else if chTest < 0xffff && len(b.negativeUnicode) > 0 {
+ unicodeLookup = b.negativeUnicode[chTest>>8]
+ if len(unicodeLookup) > 0 {
+ test2 = (match - startmatch) + unicodeLookup[chTest&0xFF]
+ } else {
+ test += advance
+ break
+ }
+ } else {
+ test += advance
+ break
+ }
+
+ if b.rightToLeft {
+ if test2 < advance {
+ advance = test2
+ }
+ } else if test2 > advance {
+ advance = test2
+ }
+
+ test += advance
+ break
+ }
+ }
+ }
+ }
+}
+
+// When a regex is anchored, we can do a quick IsMatch test instead of a Scan
+func (b *BmPrefix) IsMatch(text []rune, index, beglimit, endlimit int) bool {
+ if !b.rightToLeft {
+ if index < beglimit || endlimit-index < len(b.pattern) {
+ return false
+ }
+
+ return b.matchPattern(text, index)
+ } else {
+ if index > endlimit || index-beglimit < len(b.pattern) {
+ return false
+ }
+
+ return b.matchPattern(text, index-len(b.pattern))
+ }
+}
+
+func (b *BmPrefix) matchPattern(text []rune, index int) bool {
+ if len(text)-index < len(b.pattern) {
+ return false
+ }
+
+ if b.caseInsensitive {
+ for i := 0; i < len(b.pattern); i++ {
+ //Debug.Assert(textinfo.ToLower(_pattern[i]) == _pattern[i], "pattern should be converted to lower case in constructor!");
+ if unicode.ToLower(text[index+i]) != b.pattern[i] {
+ return false
+ }
+ }
+ return true
+ } else {
+ for i := 0; i < len(b.pattern); i++ {
+ if text[index+i] != b.pattern[i] {
+ return false
+ }
+ }
+ return true
+ }
+}
+
+type AnchorLoc int16
+
+// where the regex can be pegged
+const (
+ AnchorBeginning AnchorLoc = 0x0001
+ AnchorBol = 0x0002
+ AnchorStart = 0x0004
+ AnchorEol = 0x0008
+ AnchorEndZ = 0x0010
+ AnchorEnd = 0x0020
+ AnchorBoundary = 0x0040
+ AnchorECMABoundary = 0x0080
+)
+
+func getAnchors(tree *RegexTree) AnchorLoc {
+
+ var concatNode *regexNode
+ nextChild, result := 0, AnchorLoc(0)
+
+ curNode := tree.root
+
+ for {
+ switch curNode.t {
+ case ntConcatenate:
+ if len(curNode.children) > 0 {
+ concatNode = curNode
+ nextChild = 0
+ }
+
+ case ntGreedy, ntCapture:
+ curNode = curNode.children[0]
+ concatNode = nil
+ continue
+
+ case ntBol, ntEol, ntBoundary, ntECMABoundary, ntBeginning,
+ ntStart, ntEndZ, ntEnd:
+ return result | anchorFromType(curNode.t)
+
+ case ntEmpty, ntRequire, ntPrevent:
+
+ default:
+ return result
+ }
+
+ if concatNode == nil || nextChild >= len(concatNode.children) {
+ return result
+ }
+
+ curNode = concatNode.children[nextChild]
+ nextChild++
+ }
+}
+
+func anchorFromType(t nodeType) AnchorLoc {
+ switch t {
+ case ntBol:
+ return AnchorBol
+ case ntEol:
+ return AnchorEol
+ case ntBoundary:
+ return AnchorBoundary
+ case ntECMABoundary:
+ return AnchorECMABoundary
+ case ntBeginning:
+ return AnchorBeginning
+ case ntStart:
+ return AnchorStart
+ case ntEndZ:
+ return AnchorEndZ
+ case ntEnd:
+ return AnchorEnd
+ default:
+ return 0
+ }
+}
+
+// anchorDescription returns a human-readable description of the anchors
+func (anchors AnchorLoc) String() string {
+ buf := &bytes.Buffer{}
+
+ if 0 != (anchors & AnchorBeginning) {
+ buf.WriteString(", Beginning")
+ }
+ if 0 != (anchors & AnchorStart) {
+ buf.WriteString(", Start")
+ }
+ if 0 != (anchors & AnchorBol) {
+ buf.WriteString(", Bol")
+ }
+ if 0 != (anchors & AnchorBoundary) {
+ buf.WriteString(", Boundary")
+ }
+ if 0 != (anchors & AnchorECMABoundary) {
+ buf.WriteString(", ECMABoundary")
+ }
+ if 0 != (anchors & AnchorEol) {
+ buf.WriteString(", Eol")
+ }
+ if 0 != (anchors & AnchorEnd) {
+ buf.WriteString(", End")
+ }
+ if 0 != (anchors & AnchorEndZ) {
+ buf.WriteString(", EndZ")
+ }
+
+ // trim off comma
+ if buf.Len() >= 2 {
+ return buf.String()[2:]
+ }
+ return "None"
+}
diff --git a/vendor/github.com/dlclark/regexp2/syntax/replacerdata.go b/vendor/github.com/dlclark/regexp2/syntax/replacerdata.go
new file mode 100644
index 0000000..bcf4d3f
--- /dev/null
+++ b/vendor/github.com/dlclark/regexp2/syntax/replacerdata.go
@@ -0,0 +1,87 @@
+package syntax
+
+import (
+ "bytes"
+ "errors"
+)
+
+type ReplacerData struct {
+ Rep string
+ Strings []string
+ Rules []int
+}
+
+const (
+ replaceSpecials = 4
+ replaceLeftPortion = -1
+ replaceRightPortion = -2
+ replaceLastGroup = -3
+ replaceWholeString = -4
+)
+
+//ErrReplacementError is a general error during parsing the replacement text
+var ErrReplacementError = errors.New("Replacement pattern error.")
+
+// NewReplacerData will populate a reusable replacer data struct based on the given replacement string
+// and the capture group data from a regexp
+func NewReplacerData(rep string, caps map[int]int, capsize int, capnames map[string]int, op RegexOptions) (*ReplacerData, error) {
+ p := parser{
+ options: op,
+ caps: caps,
+ capsize: capsize,
+ capnames: capnames,
+ }
+ p.setPattern(rep)
+ concat, err := p.scanReplacement()
+ if err != nil {
+ return nil, err
+ }
+
+ if concat.t != ntConcatenate {
+ panic(ErrReplacementError)
+ }
+
+ sb := &bytes.Buffer{}
+ var (
+ strings []string
+ rules []int
+ )
+
+ for _, child := range concat.children {
+ switch child.t {
+ case ntMulti:
+ child.writeStrToBuf(sb)
+
+ case ntOne:
+ sb.WriteRune(child.ch)
+
+ case ntRef:
+ if sb.Len() > 0 {
+ rules = append(rules, len(strings))
+ strings = append(strings, sb.String())
+ sb.Reset()
+ }
+ slot := child.m
+
+ if len(caps) > 0 && slot >= 0 {
+ slot = caps[slot]
+ }
+
+ rules = append(rules, -replaceSpecials-1-slot)
+
+ default:
+ panic(ErrReplacementError)
+ }
+ }
+
+ if sb.Len() > 0 {
+ rules = append(rules, len(strings))
+ strings = append(strings, sb.String())
+ }
+
+ return &ReplacerData{
+ Rep: rep,
+ Strings: strings,
+ Rules: rules,
+ }, nil
+}
diff --git a/vendor/github.com/dlclark/regexp2/syntax/tree.go b/vendor/github.com/dlclark/regexp2/syntax/tree.go
new file mode 100644
index 0000000..ea28829
--- /dev/null
+++ b/vendor/github.com/dlclark/regexp2/syntax/tree.go
@@ -0,0 +1,654 @@
+package syntax
+
+import (
+ "bytes"
+ "fmt"
+ "math"
+ "strconv"
+)
+
+type RegexTree struct {
+ root *regexNode
+ caps map[int]int
+ capnumlist []int
+ captop int
+ Capnames map[string]int
+ Caplist []string
+ options RegexOptions
+}
+
+// It is built into a parsed tree for a regular expression.
+
+// Implementation notes:
+//
+// Since the node tree is a temporary data structure only used
+// during compilation of the regexp to integer codes, it's
+// designed for clarity and convenience rather than
+// space efficiency.
+//
+// RegexNodes are built into a tree, linked by the n.children list.
+// Each node also has a n.parent and n.ichild member indicating
+// its parent and which child # it is in its parent's list.
+//
+// RegexNodes come in as many types as there are constructs in
+// a regular expression, for example, "concatenate", "alternate",
+// "one", "rept", "group". There are also node types for basic
+// peephole optimizations, e.g., "onerep", "notsetrep", etc.
+//
+// Because perl 5 allows "lookback" groups that scan backwards,
+// each node also gets a "direction". Normally the value of
+// boolean n.backward = false.
+//
+// During parsing, top-level nodes are also stacked onto a parse
+// stack (a stack of trees). For this purpose we have a n.next
+// pointer. [Note that to save a few bytes, we could overload the
+// n.parent pointer instead.]
+//
+// On the parse stack, each tree has a "role" - basically, the
+// nonterminal in the grammar that the parser has currently
+// assigned to the tree. That code is stored in n.role.
+//
+// Finally, some of the different kinds of nodes have data.
+// Two integers (for the looping constructs) are stored in
+// n.operands, an an object (either a string or a set)
+// is stored in n.data
+type regexNode struct {
+ t nodeType
+ children []*regexNode
+ str []rune
+ set *CharSet
+ ch rune
+ m int
+ n int
+ options RegexOptions
+ next *regexNode
+}
+
+type nodeType int32
+
+const (
+ // The following are leaves, and correspond to primitive operations
+
+ ntOnerep nodeType = 0 // lef,back char,min,max a {n}
+ ntNotonerep = 1 // lef,back char,min,max .{n}
+ ntSetrep = 2 // lef,back set,min,max [\d]{n}
+ ntOneloop = 3 // lef,back char,min,max a {,n}
+ ntNotoneloop = 4 // lef,back char,min,max .{,n}
+ ntSetloop = 5 // lef,back set,min,max [\d]{,n}
+ ntOnelazy = 6 // lef,back char,min,max a {,n}?
+ ntNotonelazy = 7 // lef,back char,min,max .{,n}?
+ ntSetlazy = 8 // lef,back set,min,max [\d]{,n}?
+ ntOne = 9 // lef char a
+ ntNotone = 10 // lef char [^a]
+ ntSet = 11 // lef set [a-z\s] \w \s \d
+ ntMulti = 12 // lef string abcd
+ ntRef = 13 // lef group \#
+ ntBol = 14 // ^
+ ntEol = 15 // $
+ ntBoundary = 16 // \b
+ ntNonboundary = 17 // \B
+ ntBeginning = 18 // \A
+ ntStart = 19 // \G
+ ntEndZ = 20 // \Z
+ ntEnd = 21 // \Z
+
+ // Interior nodes do not correspond to primitive operations, but
+ // control structures compositing other operations
+
+ // Concat and alternate take n children, and can run forward or backwards
+
+ ntNothing = 22 // []
+ ntEmpty = 23 // ()
+ ntAlternate = 24 // a|b
+ ntConcatenate = 25 // ab
+ ntLoop = 26 // m,x * + ? {,}
+ ntLazyloop = 27 // m,x *? +? ?? {,}?
+ ntCapture = 28 // n ()
+ ntGroup = 29 // (?:)
+ ntRequire = 30 // (?=) (?<=)
+ ntPrevent = 31 // (?!) (?<!)
+ ntGreedy = 32 // (?>) (?<)
+ ntTestref = 33 // (?(n) | )
+ ntTestgroup = 34 // (?(...) | )
+
+ ntECMABoundary = 41 // \b
+ ntNonECMABoundary = 42 // \B
+)
+
+func newRegexNode(t nodeType, opt RegexOptions) *regexNode {
+ return &regexNode{
+ t: t,
+ options: opt,
+ }
+}
+
+func newRegexNodeCh(t nodeType, opt RegexOptions, ch rune) *regexNode {
+ return &regexNode{
+ t: t,
+ options: opt,
+ ch: ch,
+ }
+}
+
+func newRegexNodeStr(t nodeType, opt RegexOptions, str []rune) *regexNode {
+ return &regexNode{
+ t: t,
+ options: opt,
+ str: str,
+ }
+}
+
+func newRegexNodeSet(t nodeType, opt RegexOptions, set *CharSet) *regexNode {
+ return &regexNode{
+ t: t,
+ options: opt,
+ set: set,
+ }
+}
+
+func newRegexNodeM(t nodeType, opt RegexOptions, m int) *regexNode {
+ return &regexNode{
+ t: t,
+ options: opt,
+ m: m,
+ }
+}
+func newRegexNodeMN(t nodeType, opt RegexOptions, m, n int) *regexNode {
+ return &regexNode{
+ t: t,
+ options: opt,
+ m: m,
+ n: n,
+ }
+}
+
+func (n *regexNode) writeStrToBuf(buf *bytes.Buffer) {
+ for i := 0; i < len(n.str); i++ {
+ buf.WriteRune(n.str[i])
+ }
+}
+
+func (n *regexNode) addChild(child *regexNode) {
+ reduced := child.reduce()
+ n.children = append(n.children, reduced)
+ reduced.next = n
+}
+
+func (n *regexNode) insertChildren(afterIndex int, nodes []*regexNode) {
+ newChildren := make([]*regexNode, 0, len(n.children)+len(nodes))
+ n.children = append(append(append(newChildren, n.children[:afterIndex]...), nodes...), n.children[afterIndex:]...)
+}
+
+// removes children including the start but not the end index
+func (n *regexNode) removeChildren(startIndex, endIndex int) {
+ n.children = append(n.children[:startIndex], n.children[endIndex:]...)
+}
+
+// Pass type as OneLazy or OneLoop
+func (n *regexNode) makeRep(t nodeType, min, max int) {
+ n.t += (t - ntOne)
+ n.m = min
+ n.n = max
+}
+
+func (n *regexNode) reduce() *regexNode {
+ switch n.t {
+ case ntAlternate:
+ return n.reduceAlternation()
+
+ case ntConcatenate:
+ return n.reduceConcatenation()
+
+ case ntLoop, ntLazyloop:
+ return n.reduceRep()
+
+ case ntGroup:
+ return n.reduceGroup()
+
+ case ntSet, ntSetloop:
+ return n.reduceSet()
+
+ default:
+ return n
+ }
+}
+
+// Basic optimization. Single-letter alternations can be replaced
+// by faster set specifications, and nested alternations with no
+// intervening operators can be flattened:
+//
+// a|b|c|def|g|h -> [a-c]|def|[gh]
+// apple|(?:orange|pear)|grape -> apple|orange|pear|grape
+func (n *regexNode) reduceAlternation() *regexNode {
+ if len(n.children) == 0 {
+ return newRegexNode(ntNothing, n.options)
+ }
+
+ wasLastSet := false
+ lastNodeCannotMerge := false
+ var optionsLast RegexOptions
+ var i, j int
+
+ for i, j = 0, 0; i < len(n.children); i, j = i+1, j+1 {
+ at := n.children[i]
+
+ if j < i {
+ n.children[j] = at
+ }
+
+ for {
+ if at.t == ntAlternate {
+ for k := 0; k < len(at.children); k++ {
+ at.children[k].next = n
+ }
+ n.insertChildren(i+1, at.children)
+
+ j--
+ } else if at.t == ntSet || at.t == ntOne {
+ // Cannot merge sets if L or I options differ, or if either are negated.
+ optionsAt := at.options & (RightToLeft | IgnoreCase)
+
+ if at.t == ntSet {
+ if !wasLastSet || optionsLast != optionsAt || lastNodeCannotMerge || !at.set.IsMergeable() {
+ wasLastSet = true
+ lastNodeCannotMerge = !at.set.IsMergeable()
+ optionsLast = optionsAt
+ break
+ }
+ } else if !wasLastSet || optionsLast != optionsAt || lastNodeCannotMerge {
+ wasLastSet = true
+ lastNodeCannotMerge = false
+ optionsLast = optionsAt
+ break
+ }
+
+ // The last node was a Set or a One, we're a Set or One and our options are the same.
+ // Merge the two nodes.
+ j--
+ prev := n.children[j]
+
+ var prevCharClass *CharSet
+ if prev.t == ntOne {
+ prevCharClass = &CharSet{}
+ prevCharClass.addChar(prev.ch)
+ } else {
+ prevCharClass = prev.set
+ }
+
+ if at.t == ntOne {
+ prevCharClass.addChar(at.ch)
+ } else {
+ prevCharClass.addSet(*at.set)
+ }
+
+ prev.t = ntSet
+ prev.set = prevCharClass
+ } else if at.t == ntNothing {
+ j--
+ } else {
+ wasLastSet = false
+ lastNodeCannotMerge = false
+ }
+ break
+ }
+ }
+
+ if j < i {
+ n.removeChildren(j, i)
+ }
+
+ return n.stripEnation(ntNothing)
+}
+
+// Basic optimization. Adjacent strings can be concatenated.
+//
+// (?:abc)(?:def) -> abcdef
+func (n *regexNode) reduceConcatenation() *regexNode {
+ // Eliminate empties and concat adjacent strings/chars
+
+ var optionsLast RegexOptions
+ var optionsAt RegexOptions
+ var i, j int
+
+ if len(n.children) == 0 {
+ return newRegexNode(ntEmpty, n.options)
+ }
+
+ wasLastString := false
+
+ for i, j = 0, 0; i < len(n.children); i, j = i+1, j+1 {
+ var at, prev *regexNode
+
+ at = n.children[i]
+
+ if j < i {
+ n.children[j] = at
+ }
+
+ if at.t == ntConcatenate &&
+ ((at.options & RightToLeft) == (n.options & RightToLeft)) {
+ for k := 0; k < len(at.children); k++ {
+ at.children[k].next = n
+ }
+
+ //insert at.children at i+1 index in n.children
+ n.insertChildren(i+1, at.children)
+
+ j--
+ } else if at.t == ntMulti || at.t == ntOne {
+ // Cannot merge strings if L or I options differ
+ optionsAt = at.options & (RightToLeft | IgnoreCase)
+
+ if !wasLastString || optionsLast != optionsAt {
+ wasLastString = true
+ optionsLast = optionsAt
+ continue
+ }
+
+ j--
+ prev = n.children[j]
+
+ if prev.t == ntOne {
+ prev.t = ntMulti
+ prev.str = []rune{prev.ch}
+ }
+
+ if (optionsAt & RightToLeft) == 0 {
+ if at.t == ntOne {
+ prev.str = append(prev.str, at.ch)
+ } else {
+ prev.str = append(prev.str, at.str...)
+ }
+ } else {
+ if at.t == ntOne {
+ // insert at the front by expanding our slice, copying the data over, and then setting the value
+ prev.str = append(prev.str, 0)
+ copy(prev.str[1:], prev.str)
+ prev.str[0] = at.ch
+ } else {
+ //insert at the front...this one we'll make a new slice and copy both into it
+ merge := make([]rune, len(prev.str)+len(at.str))
+ copy(merge, at.str)
+ copy(merge[len(at.str):], prev.str)
+ prev.str = merge
+ }
+ }
+ } else if at.t == ntEmpty {
+ j--
+ } else {
+ wasLastString = false
+ }
+ }
+
+ if j < i {
+ // remove indices j through i from the children
+ n.removeChildren(j, i)
+ }
+
+ return n.stripEnation(ntEmpty)
+}
+
+// Nested repeaters just get multiplied with each other if they're not
+// too lumpy
+func (n *regexNode) reduceRep() *regexNode {
+
+ u := n
+ t := n.t
+ min := n.m
+ max := n.n
+
+ for {
+ if len(u.children) == 0 {
+ break
+ }
+
+ child := u.children[0]
+
+ // multiply reps of the same type only
+ if child.t != t {
+ childType := child.t
+
+ if !(childType >= ntOneloop && childType <= ntSetloop && t == ntLoop ||
+ childType >= ntOnelazy && childType <= ntSetlazy && t == ntLazyloop) {
+ break
+ }
+ }
+
+ // child can be too lumpy to blur, e.g., (a {100,105}) {3} or (a {2,})?
+ // [but things like (a {2,})+ are not too lumpy...]
+ if u.m == 0 && child.m > 1 || child.n < child.m*2 {
+ break
+ }
+
+ u = child
+ if u.m > 0 {
+ if (math.MaxInt32-1)/u.m < min {
+ u.m = math.MaxInt32
+ } else {
+ u.m = u.m * min
+ }
+ }
+ if u.n > 0 {
+ if (math.MaxInt32-1)/u.n < max {
+ u.n = math.MaxInt32
+ } else {
+ u.n = u.n * max
+ }
+ }
+ }
+
+ if math.MaxInt32 == min {
+ return newRegexNode(ntNothing, n.options)
+ }
+ return u
+
+}
+
+// Simple optimization. If a concatenation or alternation has only
+// one child strip out the intermediate node. If it has zero children,
+// turn it into an empty.
+func (n *regexNode) stripEnation(emptyType nodeType) *regexNode {
+ switch len(n.children) {
+ case 0:
+ return newRegexNode(emptyType, n.options)
+ case 1:
+ return n.children[0]
+ default:
+ return n
+ }
+}
+
+func (n *regexNode) reduceGroup() *regexNode {
+ u := n
+
+ for u.t == ntGroup {
+ u = u.children[0]
+ }
+
+ return u
+}
+
+// Simple optimization. If a set is a singleton, an inverse singleton,
+// or empty, it's transformed accordingly.
+func (n *regexNode) reduceSet() *regexNode {
+ // Extract empty-set, one and not-one case as special
+
+ if n.set == nil {
+ n.t = ntNothing
+ } else if n.set.IsSingleton() {
+ n.ch = n.set.SingletonChar()
+ n.set = nil
+ n.t += (ntOne - ntSet)
+ } else if n.set.IsSingletonInverse() {
+ n.ch = n.set.SingletonChar()
+ n.set = nil
+ n.t += (ntNotone - ntSet)
+ }
+
+ return n
+}
+
+func (n *regexNode) reverseLeft() *regexNode {
+ if n.options&RightToLeft != 0 && n.t == ntConcatenate && len(n.children) > 0 {
+ //reverse children order
+ for left, right := 0, len(n.children)-1; left < right; left, right = left+1, right-1 {
+ n.children[left], n.children[right] = n.children[right], n.children[left]
+ }
+ }
+
+ return n
+}
+
+func (n *regexNode) makeQuantifier(lazy bool, min, max int) *regexNode {
+ if min == 0 && max == 0 {
+ return newRegexNode(ntEmpty, n.options)
+ }
+
+ if min == 1 && max == 1 {
+ return n
+ }
+
+ switch n.t {
+ case ntOne, ntNotone, ntSet:
+ if lazy {
+ n.makeRep(Onelazy, min, max)
+ } else {
+ n.makeRep(Oneloop, min, max)
+ }
+ return n
+
+ default:
+ var t nodeType
+ if lazy {
+ t = ntLazyloop
+ } else {
+ t = ntLoop
+ }
+ result := newRegexNodeMN(t, n.options, min, max)
+ result.addChild(n)
+ return result
+ }
+}
+
+// debug functions
+
+var typeStr = []string{
+ "Onerep", "Notonerep", "Setrep",
+ "Oneloop", "Notoneloop", "Setloop",
+ "Onelazy", "Notonelazy", "Setlazy",
+ "One", "Notone", "Set",
+ "Multi", "Ref",
+ "Bol", "Eol", "Boundary", "Nonboundary",
+ "Beginning", "Start", "EndZ", "End",
+ "Nothing", "Empty",
+ "Alternate", "Concatenate",
+ "Loop", "Lazyloop",
+ "Capture", "Group", "Require", "Prevent", "Greedy",
+ "Testref", "Testgroup",
+ "Unknown", "Unknown", "Unknown",
+ "Unknown", "Unknown", "Unknown",
+ "ECMABoundary", "NonECMABoundary",
+}
+
+func (n *regexNode) description() string {
+ buf := &bytes.Buffer{}
+
+ buf.WriteString(typeStr[n.t])
+
+ if (n.options & ExplicitCapture) != 0 {
+ buf.WriteString("-C")
+ }
+ if (n.options & IgnoreCase) != 0 {
+ buf.WriteString("-I")
+ }
+ if (n.options & RightToLeft) != 0 {
+ buf.WriteString("-L")
+ }
+ if (n.options & Multiline) != 0 {
+ buf.WriteString("-M")
+ }
+ if (n.options & Singleline) != 0 {
+ buf.WriteString("-S")
+ }
+ if (n.options & IgnorePatternWhitespace) != 0 {
+ buf.WriteString("-X")
+ }
+ if (n.options & ECMAScript) != 0 {
+ buf.WriteString("-E")
+ }
+
+ switch n.t {
+ case ntOneloop, ntNotoneloop, ntOnelazy, ntNotonelazy, ntOne, ntNotone:
+ buf.WriteString("(Ch = " + CharDescription(n.ch) + ")")
+ break
+ case ntCapture:
+ buf.WriteString("(index = " + strconv.Itoa(n.m) + ", unindex = " + strconv.Itoa(n.n) + ")")
+ break
+ case ntRef, ntTestref:
+ buf.WriteString("(index = " + strconv.Itoa(n.m) + ")")
+ break
+ case ntMulti:
+ fmt.Fprintf(buf, "(String = %s)", string(n.str))
+ break
+ case ntSet, ntSetloop, ntSetlazy:
+ buf.WriteString("(Set = " + n.set.String() + ")")
+ break
+ }
+
+ switch n.t {
+ case ntOneloop, ntNotoneloop, ntOnelazy, ntNotonelazy, ntSetloop, ntSetlazy, ntLoop, ntLazyloop:
+ buf.WriteString("(Min = ")
+ buf.WriteString(strconv.Itoa(n.m))
+ buf.WriteString(", Max = ")
+ if n.n == math.MaxInt32 {
+ buf.WriteString("inf")
+ } else {
+ buf.WriteString(strconv.Itoa(n.n))
+ }
+ buf.WriteString(")")
+
+ break
+ }
+
+ return buf.String()
+}
+
+var padSpace = []byte(" ")
+
+func (t *RegexTree) Dump() string {
+ return t.root.dump()
+}
+
+func (n *regexNode) dump() string {
+ var stack []int
+ CurNode := n
+ CurChild := 0
+
+ buf := bytes.NewBufferString(CurNode.description())
+ buf.WriteRune('\n')
+
+ for {
+ if CurNode.children != nil && CurChild < len(CurNode.children) {
+ stack = append(stack, CurChild+1)
+ CurNode = CurNode.children[CurChild]
+ CurChild = 0
+
+ Depth := len(stack)
+ if Depth > 32 {
+ Depth = 32
+ }
+ buf.Write(padSpace[:Depth])
+ buf.WriteString(CurNode.description())
+ buf.WriteRune('\n')
+ } else {
+ if len(stack) == 0 {
+ break
+ }
+
+ CurChild = stack[len(stack)-1]
+ stack = stack[:len(stack)-1]
+ CurNode = CurNode.next
+ }
+ }
+ return buf.String()
+}
diff --git a/vendor/github.com/dlclark/regexp2/syntax/writer.go b/vendor/github.com/dlclark/regexp2/syntax/writer.go
new file mode 100644
index 0000000..a5aa11c
--- /dev/null
+++ b/vendor/github.com/dlclark/regexp2/syntax/writer.go
@@ -0,0 +1,500 @@
+package syntax
+
+import (
+ "bytes"
+ "fmt"
+ "math"
+ "os"
+)
+
+func Write(tree *RegexTree) (*Code, error) {
+ w := writer{
+ intStack: make([]int, 0, 32),
+ emitted: make([]int, 2),
+ stringhash: make(map[string]int),
+ sethash: make(map[string]int),
+ }
+
+ code, err := w.codeFromTree(tree)
+
+ if tree.options&Debug > 0 && code != nil {
+ os.Stdout.WriteString(code.Dump())
+ os.Stdout.WriteString("\n")
+ }
+
+ return code, err
+}
+
+type writer struct {
+ emitted []int
+
+ intStack []int
+ curpos int
+ stringhash map[string]int
+ stringtable [][]rune
+ sethash map[string]int
+ settable []*CharSet
+ counting bool
+ count int
+ trackcount int
+ caps map[int]int
+}
+
+const (
+ beforeChild nodeType = 64
+ afterChild = 128
+ //MaxPrefixSize is the largest number of runes we'll use for a BoyerMoyer prefix
+ MaxPrefixSize = 50
+)
+
+// The top level RegexCode generator. It does a depth-first walk
+// through the tree and calls EmitFragment to emits code before
+// and after each child of an interior node, and at each leaf.
+//
+// It runs two passes, first to count the size of the generated
+// code, and second to generate the code.
+//
+// We should time it against the alternative, which is
+// to just generate the code and grow the array as we go.
+func (w *writer) codeFromTree(tree *RegexTree) (*Code, error) {
+ var (
+ curNode *regexNode
+ curChild int
+ capsize int
+ )
+ // construct sparse capnum mapping if some numbers are unused
+
+ if tree.capnumlist == nil || tree.captop == len(tree.capnumlist) {
+ capsize = tree.captop
+ w.caps = nil
+ } else {
+ capsize = len(tree.capnumlist)
+ w.caps = tree.caps
+ for i := 0; i < len(tree.capnumlist); i++ {
+ w.caps[tree.capnumlist[i]] = i
+ }
+ }
+
+ w.counting = true
+
+ for {
+ if !w.counting {
+ w.emitted = make([]int, w.count)
+ }
+
+ curNode = tree.root
+ curChild = 0
+
+ w.emit1(Lazybranch, 0)
+
+ for {
+ if len(curNode.children) == 0 {
+ w.emitFragment(curNode.t, curNode, 0)
+ } else if curChild < len(curNode.children) {
+ w.emitFragment(curNode.t|beforeChild, curNode, curChild)
+
+ curNode = curNode.children[curChild]
+
+ w.pushInt(curChild)
+ curChild = 0
+ continue
+ }
+
+ if w.emptyStack() {
+ break
+ }
+
+ curChild = w.popInt()
+ curNode = curNode.next
+
+ w.emitFragment(curNode.t|afterChild, curNode, curChild)
+ curChild++
+ }
+
+ w.patchJump(0, w.curPos())
+ w.emit(Stop)
+
+ if !w.counting {
+ break
+ }
+
+ w.counting = false
+ }
+
+ fcPrefix := getFirstCharsPrefix(tree)
+ prefix := getPrefix(tree)
+ rtl := (tree.options & RightToLeft) != 0
+
+ var bmPrefix *BmPrefix
+ //TODO: benchmark string prefixes
+ if prefix != nil && len(prefix.PrefixStr) > 0 && MaxPrefixSize > 0 {
+ if len(prefix.PrefixStr) > MaxPrefixSize {
+ // limit prefix changes to 10k
+ prefix.PrefixStr = prefix.PrefixStr[:MaxPrefixSize]
+ }
+ bmPrefix = newBmPrefix(prefix.PrefixStr, prefix.CaseInsensitive, rtl)
+ } else {
+ bmPrefix = nil
+ }
+
+ return &Code{
+ Codes: w.emitted,
+ Strings: w.stringtable,
+ Sets: w.settable,
+ TrackCount: w.trackcount,
+ Caps: w.caps,
+ Capsize: capsize,
+ FcPrefix: fcPrefix,
+ BmPrefix: bmPrefix,
+ Anchors: getAnchors(tree),
+ RightToLeft: rtl,
+ }, nil
+}
+
+// The main RegexCode generator. It does a depth-first walk
+// through the tree and calls EmitFragment to emits code before
+// and after each child of an interior node, and at each leaf.
+func (w *writer) emitFragment(nodetype nodeType, node *regexNode, curIndex int) error {
+ bits := InstOp(0)
+
+ if nodetype <= ntRef {
+ if (node.options & RightToLeft) != 0 {
+ bits |= Rtl
+ }
+ if (node.options & IgnoreCase) != 0 {
+ bits |= Ci
+ }
+ }
+ ntBits := nodeType(bits)
+
+ switch nodetype {
+ case ntConcatenate | beforeChild, ntConcatenate | afterChild, ntEmpty:
+ break
+
+ case ntAlternate | beforeChild:
+ if curIndex < len(node.children)-1 {
+ w.pushInt(w.curPos())
+ w.emit1(Lazybranch, 0)
+ }
+
+ case ntAlternate | afterChild:
+ if curIndex < len(node.children)-1 {
+ lbPos := w.popInt()
+ w.pushInt(w.curPos())
+ w.emit1(Goto, 0)
+ w.patchJump(lbPos, w.curPos())
+ } else {
+ for i := 0; i < curIndex; i++ {
+ w.patchJump(w.popInt(), w.curPos())
+ }
+ }
+ break
+
+ case ntTestref | beforeChild:
+ if curIndex == 0 {
+ w.emit(Setjump)
+ w.pushInt(w.curPos())
+ w.emit1(Lazybranch, 0)
+ w.emit1(Testref, w.mapCapnum(node.m))
+ w.emit(Forejump)
+ }
+
+ case ntTestref | afterChild:
+ if curIndex == 0 {
+ branchpos := w.popInt()
+ w.pushInt(w.curPos())
+ w.emit1(Goto, 0)
+ w.patchJump(branchpos, w.curPos())
+ w.emit(Forejump)
+ if len(node.children) <= 1 {
+ w.patchJump(w.popInt(), w.curPos())
+ }
+ } else if curIndex == 1 {
+ w.patchJump(w.popInt(), w.curPos())
+ }
+
+ case ntTestgroup | beforeChild:
+ if curIndex == 0 {
+ w.emit(Setjump)
+ w.emit(Setmark)
+ w.pushInt(w.curPos())
+ w.emit1(Lazybranch, 0)
+ }
+
+ case ntTestgroup | afterChild:
+ if curIndex == 0 {
+ w.emit(Getmark)
+ w.emit(Forejump)
+ } else if curIndex == 1 {
+ Branchpos := w.popInt()
+ w.pushInt(w.curPos())
+ w.emit1(Goto, 0)
+ w.patchJump(Branchpos, w.curPos())
+ w.emit(Getmark)
+ w.emit(Forejump)
+ if len(node.children) <= 2 {
+ w.patchJump(w.popInt(), w.curPos())
+ }
+ } else if curIndex == 2 {
+ w.patchJump(w.popInt(), w.curPos())
+ }
+
+ case ntLoop | beforeChild, ntLazyloop | beforeChild:
+
+ if node.n < math.MaxInt32 || node.m > 1 {
+ if node.m == 0 {
+ w.emit1(Nullcount, 0)
+ } else {
+ w.emit1(Setcount, 1-node.m)
+ }
+ } else if node.m == 0 {
+ w.emit(Nullmark)
+ } else {
+ w.emit(Setmark)
+ }
+
+ if node.m == 0 {
+ w.pushInt(w.curPos())
+ w.emit1(Goto, 0)
+ }
+ w.pushInt(w.curPos())
+
+ case ntLoop | afterChild, ntLazyloop | afterChild:
+
+ startJumpPos := w.curPos()
+ lazy := (nodetype - (ntLoop | afterChild))
+
+ if node.n < math.MaxInt32 || node.m > 1 {
+ if node.n == math.MaxInt32 {
+ w.emit2(InstOp(Branchcount+lazy), w.popInt(), math.MaxInt32)
+ } else {
+ w.emit2(InstOp(Branchcount+lazy), w.popInt(), node.n-node.m)
+ }
+ } else {
+ w.emit1(InstOp(Branchmark+lazy), w.popInt())
+ }
+
+ if node.m == 0 {
+ w.patchJump(w.popInt(), startJumpPos)
+ }
+
+ case ntGroup | beforeChild, ntGroup | afterChild:
+
+ case ntCapture | beforeChild:
+ w.emit(Setmark)
+
+ case ntCapture | afterChild:
+ w.emit2(Capturemark, w.mapCapnum(node.m), w.mapCapnum(node.n))
+
+ case ntRequire | beforeChild:
+ // NOTE: the following line causes lookahead/lookbehind to be
+ // NON-BACKTRACKING. It can be commented out with (*)
+ w.emit(Setjump)
+
+ w.emit(Setmark)
+
+ case ntRequire | afterChild:
+ w.emit(Getmark)
+
+ // NOTE: the following line causes lookahead/lookbehind to be
+ // NON-BACKTRACKING. It can be commented out with (*)
+ w.emit(Forejump)
+
+ case ntPrevent | beforeChild:
+ w.emit(Setjump)
+ w.pushInt(w.curPos())
+ w.emit1(Lazybranch, 0)
+
+ case ntPrevent | afterChild:
+ w.emit(Backjump)
+ w.patchJump(w.popInt(), w.curPos())
+ w.emit(Forejump)
+
+ case ntGreedy | beforeChild:
+ w.emit(Setjump)
+
+ case ntGreedy | afterChild:
+ w.emit(Forejump)
+
+ case ntOne, ntNotone:
+ w.emit1(InstOp(node.t|ntBits), int(node.ch))
+
+ case ntNotoneloop, ntNotonelazy, ntOneloop, ntOnelazy:
+ if node.m > 0 {
+ if node.t == ntOneloop || node.t == ntOnelazy {
+ w.emit2(Onerep|bits, int(node.ch), node.m)
+ } else {
+ w.emit2(Notonerep|bits, int(node.ch), node.m)
+ }
+ }
+ if node.n > node.m {
+ if node.n == math.MaxInt32 {
+ w.emit2(InstOp(node.t|ntBits), int(node.ch), math.MaxInt32)
+ } else {
+ w.emit2(InstOp(node.t|ntBits), int(node.ch), node.n-node.m)
+ }
+ }
+
+ case ntSetloop, ntSetlazy:
+ if node.m > 0 {
+ w.emit2(Setrep|bits, w.setCode(node.set), node.m)
+ }
+ if node.n > node.m {
+ if node.n == math.MaxInt32 {
+ w.emit2(InstOp(node.t|ntBits), w.setCode(node.set), math.MaxInt32)
+ } else {
+ w.emit2(InstOp(node.t|ntBits), w.setCode(node.set), node.n-node.m)
+ }
+ }
+
+ case ntMulti:
+ w.emit1(InstOp(node.t|ntBits), w.stringCode(node.str))
+
+ case ntSet:
+ w.emit1(InstOp(node.t|ntBits), w.setCode(node.set))
+
+ case ntRef:
+ w.emit1(InstOp(node.t|ntBits), w.mapCapnum(node.m))
+
+ case ntNothing, ntBol, ntEol, ntBoundary, ntNonboundary, ntECMABoundary, ntNonECMABoundary, ntBeginning, ntStart, ntEndZ, ntEnd:
+ w.emit(InstOp(node.t))
+
+ default:
+ return fmt.Errorf("unexpected opcode in regular expression generation: %v", nodetype)
+ }
+
+ return nil
+}
+
+// To avoid recursion, we use a simple integer stack.
+// This is the push.
+func (w *writer) pushInt(i int) {
+ w.intStack = append(w.intStack, i)
+}
+
+// Returns true if the stack is empty.
+func (w *writer) emptyStack() bool {
+ return len(w.intStack) == 0
+}
+
+// This is the pop.
+func (w *writer) popInt() int {
+ //get our item
+ idx := len(w.intStack) - 1
+ i := w.intStack[idx]
+ //trim our slice
+ w.intStack = w.intStack[:idx]
+ return i
+}
+
+// Returns the current position in the emitted code.
+func (w *writer) curPos() int {
+ return w.curpos
+}
+
+// Fixes up a jump instruction at the specified offset
+// so that it jumps to the specified jumpDest.
+func (w *writer) patchJump(offset, jumpDest int) {
+ w.emitted[offset+1] = jumpDest
+}
+
+// Returns an index in the set table for a charset
+// uses a map to eliminate duplicates.
+func (w *writer) setCode(set *CharSet) int {
+ if w.counting {
+ return 0
+ }
+
+ buf := &bytes.Buffer{}
+
+ set.mapHashFill(buf)
+ hash := buf.String()
+ i, ok := w.sethash[hash]
+ if !ok {
+ i = len(w.sethash)
+ w.sethash[hash] = i
+ w.settable = append(w.settable, set)
+ }
+ return i
+}
+
+// Returns an index in the string table for a string.
+// uses a map to eliminate duplicates.
+func (w *writer) stringCode(str []rune) int {
+ if w.counting {
+ return 0
+ }
+
+ hash := string(str)
+ i, ok := w.stringhash[hash]
+ if !ok {
+ i = len(w.stringhash)
+ w.stringhash[hash] = i
+ w.stringtable = append(w.stringtable, str)
+ }
+
+ return i
+}
+
+// When generating code on a regex that uses a sparse set
+// of capture slots, we hash them to a dense set of indices
+// for an array of capture slots. Instead of doing the hash
+// at match time, it's done at compile time, here.
+func (w *writer) mapCapnum(capnum int) int {
+ if capnum == -1 {
+ return -1
+ }
+
+ if w.caps != nil {
+ return w.caps[capnum]
+ }
+
+ return capnum
+}
+
+// Emits a zero-argument operation. Note that the emit
+// functions all run in two modes: they can emit code, or
+// they can just count the size of the code.
+func (w *writer) emit(op InstOp) {
+ if w.counting {
+ w.count++
+ if opcodeBacktracks(op) {
+ w.trackcount++
+ }
+ return
+ }
+ w.emitted[w.curpos] = int(op)
+ w.curpos++
+}
+
+// Emits a one-argument operation.
+func (w *writer) emit1(op InstOp, opd1 int) {
+ if w.counting {
+ w.count += 2
+ if opcodeBacktracks(op) {
+ w.trackcount++
+ }
+ return
+ }
+ w.emitted[w.curpos] = int(op)
+ w.curpos++
+ w.emitted[w.curpos] = opd1
+ w.curpos++
+}
+
+// Emits a two-argument operation.
+func (w *writer) emit2(op InstOp, opd1, opd2 int) {
+ if w.counting {
+ w.count += 3
+ if opcodeBacktracks(op) {
+ w.trackcount++
+ }
+ return
+ }
+ w.emitted[w.curpos] = int(op)
+ w.curpos++
+ w.emitted[w.curpos] = opd1
+ w.curpos++
+ w.emitted[w.curpos] = opd2
+ w.curpos++
+}