diff options
| author | Mitja Felicijan <mitja.felicijan@gmail.com> | 2024-10-25 00:47:47 +0200 |
|---|---|---|
| committer | Mitja Felicijan <mitja.felicijan@gmail.com> | 2024-10-25 00:47:47 +0200 |
| commit | c6cc0108ca7738023b45e0eeac0fa2390532dd93 (patch) | |
| tree | 36890e6cd3091bbab8efbe686cc56f467f645bfd /vendor/github.com/dlclark/regexp2/syntax | |
| parent | 0130404a1dc663d4aa68d780c9bcb23a4243e68d (diff) | |
| download | jbmafp-c6cc0108ca7738023b45e0eeac0fa2390532dd93.tar.gz | |
Diffstat (limited to 'vendor/github.com/dlclark/regexp2/syntax')
| -rw-r--r-- | vendor/github.com/dlclark/regexp2/syntax/charclass.go | 865 | ||||
| -rw-r--r-- | vendor/github.com/dlclark/regexp2/syntax/code.go | 274 | ||||
| -rw-r--r-- | vendor/github.com/dlclark/regexp2/syntax/escape.go | 94 | ||||
| -rw-r--r-- | vendor/github.com/dlclark/regexp2/syntax/fuzz.go | 20 | ||||
| -rw-r--r-- | vendor/github.com/dlclark/regexp2/syntax/parser.go | 2251 | ||||
| -rw-r--r-- | vendor/github.com/dlclark/regexp2/syntax/prefix.go | 896 | ||||
| -rw-r--r-- | vendor/github.com/dlclark/regexp2/syntax/replacerdata.go | 87 | ||||
| -rw-r--r-- | vendor/github.com/dlclark/regexp2/syntax/tree.go | 654 | ||||
| -rw-r--r-- | vendor/github.com/dlclark/regexp2/syntax/writer.go | 500 |
9 files changed, 5641 insertions, 0 deletions
diff --git a/vendor/github.com/dlclark/regexp2/syntax/charclass.go b/vendor/github.com/dlclark/regexp2/syntax/charclass.go new file mode 100644 index 0000000..6881a0e --- /dev/null +++ b/vendor/github.com/dlclark/regexp2/syntax/charclass.go @@ -0,0 +1,865 @@ +package syntax + +import ( + "bytes" + "encoding/binary" + "fmt" + "sort" + "unicode" + "unicode/utf8" +) + +// CharSet combines start-end rune ranges and unicode categories representing a set of characters +type CharSet struct { + ranges []singleRange + categories []category + sub *CharSet //optional subtractor + negate bool + anything bool +} + +type category struct { + negate bool + cat string +} + +type singleRange struct { + first rune + last rune +} + +const ( + spaceCategoryText = " " + wordCategoryText = "W" +) + +var ( + ecmaSpace = []rune{0x0009, 0x000e, 0x0020, 0x0021, 0x00a0, 0x00a1, 0x1680, 0x1681, 0x2000, 0x200b, 0x2028, 0x202a, 0x202f, 0x2030, 0x205f, 0x2060, 0x3000, 0x3001, 0xfeff, 0xff00} + ecmaWord = []rune{0x0030, 0x003a, 0x0041, 0x005b, 0x005f, 0x0060, 0x0061, 0x007b} + ecmaDigit = []rune{0x0030, 0x003a} + + re2Space = []rune{0x0009, 0x000b, 0x000c, 0x000e, 0x0020, 0x0021} +) + +var ( + AnyClass = getCharSetFromOldString([]rune{0}, false) + ECMAAnyClass = getCharSetFromOldString([]rune{0, 0x000a, 0x000b, 0x000d, 0x000e}, false) + NoneClass = getCharSetFromOldString(nil, false) + ECMAWordClass = getCharSetFromOldString(ecmaWord, false) + NotECMAWordClass = getCharSetFromOldString(ecmaWord, true) + ECMASpaceClass = getCharSetFromOldString(ecmaSpace, false) + NotECMASpaceClass = getCharSetFromOldString(ecmaSpace, true) + ECMADigitClass = getCharSetFromOldString(ecmaDigit, false) + NotECMADigitClass = getCharSetFromOldString(ecmaDigit, true) + + WordClass = getCharSetFromCategoryString(false, false, wordCategoryText) + NotWordClass = getCharSetFromCategoryString(true, false, wordCategoryText) + SpaceClass = getCharSetFromCategoryString(false, false, spaceCategoryText) + NotSpaceClass = getCharSetFromCategoryString(true, false, spaceCategoryText) + DigitClass = getCharSetFromCategoryString(false, false, "Nd") + NotDigitClass = getCharSetFromCategoryString(false, true, "Nd") + + RE2SpaceClass = getCharSetFromOldString(re2Space, false) + NotRE2SpaceClass = getCharSetFromOldString(re2Space, true) +) + +var unicodeCategories = func() map[string]*unicode.RangeTable { + retVal := make(map[string]*unicode.RangeTable) + for k, v := range unicode.Scripts { + retVal[k] = v + } + for k, v := range unicode.Categories { + retVal[k] = v + } + for k, v := range unicode.Properties { + retVal[k] = v + } + return retVal +}() + +func getCharSetFromCategoryString(negateSet bool, negateCat bool, cats ...string) func() *CharSet { + if negateCat && negateSet { + panic("BUG! You should only negate the set OR the category in a constant setup, but not both") + } + + c := CharSet{negate: negateSet} + + c.categories = make([]category, len(cats)) + for i, cat := range cats { + c.categories[i] = category{cat: cat, negate: negateCat} + } + return func() *CharSet { + //make a copy each time + local := c + //return that address + return &local + } +} + +func getCharSetFromOldString(setText []rune, negate bool) func() *CharSet { + c := CharSet{} + if len(setText) > 0 { + fillFirst := false + l := len(setText) + if negate { + if setText[0] == 0 { + setText = setText[1:] + } else { + l++ + fillFirst = true + } + } + + if l%2 == 0 { + c.ranges = make([]singleRange, l/2) + } else { + c.ranges = make([]singleRange, l/2+1) + } + + first := true + if fillFirst { + c.ranges[0] = singleRange{first: 0} + first = false + } + + i := 0 + for _, r := range setText { + if first { + // lower bound in a new range + c.ranges[i] = singleRange{first: r} + first = false + } else { + c.ranges[i].last = r - 1 + i++ + first = true + } + } + if !first { + c.ranges[i].last = utf8.MaxRune + } + } + + return func() *CharSet { + local := c + return &local + } +} + +// Copy makes a deep copy to prevent accidental mutation of a set +func (c CharSet) Copy() CharSet { + ret := CharSet{ + anything: c.anything, + negate: c.negate, + } + + ret.ranges = append(ret.ranges, c.ranges...) + ret.categories = append(ret.categories, c.categories...) + + if c.sub != nil { + sub := c.sub.Copy() + ret.sub = &sub + } + + return ret +} + +// gets a human-readable description for a set string +func (c CharSet) String() string { + buf := &bytes.Buffer{} + buf.WriteRune('[') + + if c.IsNegated() { + buf.WriteRune('^') + } + + for _, r := range c.ranges { + + buf.WriteString(CharDescription(r.first)) + if r.first != r.last { + if r.last-r.first != 1 { + //groups that are 1 char apart skip the dash + buf.WriteRune('-') + } + buf.WriteString(CharDescription(r.last)) + } + } + + for _, c := range c.categories { + buf.WriteString(c.String()) + } + + if c.sub != nil { + buf.WriteRune('-') + buf.WriteString(c.sub.String()) + } + + buf.WriteRune(']') + + return buf.String() +} + +// mapHashFill converts a charset into a buffer for use in maps +func (c CharSet) mapHashFill(buf *bytes.Buffer) { + if c.negate { + buf.WriteByte(0) + } else { + buf.WriteByte(1) + } + + binary.Write(buf, binary.LittleEndian, len(c.ranges)) + binary.Write(buf, binary.LittleEndian, len(c.categories)) + for _, r := range c.ranges { + buf.WriteRune(r.first) + buf.WriteRune(r.last) + } + for _, ct := range c.categories { + buf.WriteString(ct.cat) + if ct.negate { + buf.WriteByte(1) + } else { + buf.WriteByte(0) + } + } + + if c.sub != nil { + c.sub.mapHashFill(buf) + } +} + +// CharIn returns true if the rune is in our character set (either ranges or categories). +// It handles negations and subtracted sub-charsets. +func (c CharSet) CharIn(ch rune) bool { + val := false + // in s && !s.subtracted + + //check ranges + for _, r := range c.ranges { + if ch < r.first { + continue + } + if ch <= r.last { + val = true + break + } + } + + //check categories if we haven't already found a range + if !val && len(c.categories) > 0 { + for _, ct := range c.categories { + // special categories...then unicode + if ct.cat == spaceCategoryText { + if unicode.IsSpace(ch) { + // we found a space so we're done + // negate means this is a "bad" thing + val = !ct.negate + break + } else if ct.negate { + val = true + break + } + } else if ct.cat == wordCategoryText { + if IsWordChar(ch) { + val = !ct.negate + break + } else if ct.negate { + val = true + break + } + } else if unicode.Is(unicodeCategories[ct.cat], ch) { + // if we're in this unicode category then we're done + // if negate=true on this category then we "failed" our test + // otherwise we're good that we found it + val = !ct.negate + break + } else if ct.negate { + val = true + break + } + } + } + + // negate the whole char set + if c.negate { + val = !val + } + + // get subtracted recurse + if val && c.sub != nil { + val = !c.sub.CharIn(ch) + } + + //log.Printf("Char '%v' in %v == %v", string(ch), c.String(), val) + return val +} + +func (c category) String() string { + switch c.cat { + case spaceCategoryText: + if c.negate { + return "\\S" + } + return "\\s" + case wordCategoryText: + if c.negate { + return "\\W" + } + return "\\w" + } + if _, ok := unicodeCategories[c.cat]; ok { + + if c.negate { + return "\\P{" + c.cat + "}" + } + return "\\p{" + c.cat + "}" + } + return "Unknown category: " + c.cat +} + +// CharDescription Produces a human-readable description for a single character. +func CharDescription(ch rune) string { + /*if ch == '\\' { + return "\\\\" + } + + if ch > ' ' && ch <= '~' { + return string(ch) + } else if ch == '\n' { + return "\\n" + } else if ch == ' ' { + return "\\ " + }*/ + + b := &bytes.Buffer{} + escape(b, ch, false) //fmt.Sprintf("%U", ch) + return b.String() +} + +// According to UTS#18 Unicode Regular Expressions (http://www.unicode.org/reports/tr18/) +// RL 1.4 Simple Word Boundaries The class of <word_character> includes all Alphabetic +// values from the Unicode character database, from UnicodeData.txt [UData], plus the U+200C +// ZERO WIDTH NON-JOINER and U+200D ZERO WIDTH JOINER. +func IsWordChar(r rune) bool { + //"L", "Mn", "Nd", "Pc" + return unicode.In(r, + unicode.Categories["L"], unicode.Categories["Mn"], + unicode.Categories["Nd"], unicode.Categories["Pc"]) || r == '\u200D' || r == '\u200C' + //return 'A' <= r && r <= 'Z' || 'a' <= r && r <= 'z' || '0' <= r && r <= '9' || r == '_' +} + +func IsECMAWordChar(r rune) bool { + return unicode.In(r, + unicode.Categories["L"], unicode.Categories["Mn"], + unicode.Categories["Nd"], unicode.Categories["Pc"]) + + //return 'A' <= r && r <= 'Z' || 'a' <= r && r <= 'z' || '0' <= r && r <= '9' || r == '_' +} + +// SingletonChar will return the char from the first range without validation. +// It assumes you have checked for IsSingleton or IsSingletonInverse and will panic given bad input +func (c CharSet) SingletonChar() rune { + return c.ranges[0].first +} + +func (c CharSet) IsSingleton() bool { + return !c.negate && //negated is multiple chars + len(c.categories) == 0 && len(c.ranges) == 1 && // multiple ranges and unicode classes represent multiple chars + c.sub == nil && // subtraction means we've got multiple chars + c.ranges[0].first == c.ranges[0].last // first and last equal means we're just 1 char +} + +func (c CharSet) IsSingletonInverse() bool { + return c.negate && //same as above, but requires negated + len(c.categories) == 0 && len(c.ranges) == 1 && // multiple ranges and unicode classes represent multiple chars + c.sub == nil && // subtraction means we've got multiple chars + c.ranges[0].first == c.ranges[0].last // first and last equal means we're just 1 char +} + +func (c CharSet) IsMergeable() bool { + return !c.IsNegated() && !c.HasSubtraction() +} + +func (c CharSet) IsNegated() bool { + return c.negate +} + +func (c CharSet) HasSubtraction() bool { + return c.sub != nil +} + +func (c CharSet) IsEmpty() bool { + return len(c.ranges) == 0 && len(c.categories) == 0 && c.sub == nil +} + +func (c *CharSet) addDigit(ecma, negate bool, pattern string) { + if ecma { + if negate { + c.addRanges(NotECMADigitClass().ranges) + } else { + c.addRanges(ECMADigitClass().ranges) + } + } else { + c.addCategories(category{cat: "Nd", negate: negate}) + } +} + +func (c *CharSet) addChar(ch rune) { + c.addRange(ch, ch) +} + +func (c *CharSet) addSpace(ecma, re2, negate bool) { + if ecma { + if negate { + c.addRanges(NotECMASpaceClass().ranges) + } else { + c.addRanges(ECMASpaceClass().ranges) + } + } else if re2 { + if negate { + c.addRanges(NotRE2SpaceClass().ranges) + } else { + c.addRanges(RE2SpaceClass().ranges) + } + } else { + c.addCategories(category{cat: spaceCategoryText, negate: negate}) + } +} + +func (c *CharSet) addWord(ecma, negate bool) { + if ecma { + if negate { + c.addRanges(NotECMAWordClass().ranges) + } else { + c.addRanges(ECMAWordClass().ranges) + } + } else { + c.addCategories(category{cat: wordCategoryText, negate: negate}) + } +} + +// Add set ranges and categories into ours -- no deduping or anything +func (c *CharSet) addSet(set CharSet) { + if c.anything { + return + } + if set.anything { + c.makeAnything() + return + } + // just append here to prevent double-canon + c.ranges = append(c.ranges, set.ranges...) + c.addCategories(set.categories...) + c.canonicalize() +} + +func (c *CharSet) makeAnything() { + c.anything = true + c.categories = []category{} + c.ranges = AnyClass().ranges +} + +func (c *CharSet) addCategories(cats ...category) { + // don't add dupes and remove positive+negative + if c.anything { + // if we've had a previous positive+negative group then + // just return, we're as broad as we can get + return + } + + for _, ct := range cats { + found := false + for _, ct2 := range c.categories { + if ct.cat == ct2.cat { + if ct.negate != ct2.negate { + // oposite negations...this mean we just + // take us as anything and move on + c.makeAnything() + return + } + found = true + break + } + } + + if !found { + c.categories = append(c.categories, ct) + } + } +} + +// Merges new ranges to our own +func (c *CharSet) addRanges(ranges []singleRange) { + if c.anything { + return + } + c.ranges = append(c.ranges, ranges...) + c.canonicalize() +} + +// Merges everything but the new ranges into our own +func (c *CharSet) addNegativeRanges(ranges []singleRange) { + if c.anything { + return + } + + var hi rune + + // convert incoming ranges into opposites, assume they are in order + for _, r := range ranges { + if hi < r.first { + c.ranges = append(c.ranges, singleRange{hi, r.first - 1}) + } + hi = r.last + 1 + } + + if hi < utf8.MaxRune { + c.ranges = append(c.ranges, singleRange{hi, utf8.MaxRune}) + } + + c.canonicalize() +} + +func isValidUnicodeCat(catName string) bool { + _, ok := unicodeCategories[catName] + return ok +} + +func (c *CharSet) addCategory(categoryName string, negate, caseInsensitive bool, pattern string) { + if !isValidUnicodeCat(categoryName) { + // unknown unicode category, script, or property "blah" + panic(fmt.Errorf("Unknown unicode category, script, or property '%v'", categoryName)) + + } + + if caseInsensitive && (categoryName == "Ll" || categoryName == "Lu" || categoryName == "Lt") { + // when RegexOptions.IgnoreCase is specified then {Ll} {Lu} and {Lt} cases should all match + c.addCategories( + category{cat: "Ll", negate: negate}, + category{cat: "Lu", negate: negate}, + category{cat: "Lt", negate: negate}) + } + c.addCategories(category{cat: categoryName, negate: negate}) +} + +func (c *CharSet) addSubtraction(sub *CharSet) { + c.sub = sub +} + +func (c *CharSet) addRange(chMin, chMax rune) { + c.ranges = append(c.ranges, singleRange{first: chMin, last: chMax}) + c.canonicalize() +} + +func (c *CharSet) addNamedASCII(name string, negate bool) bool { + var rs []singleRange + + switch name { + case "alnum": + rs = []singleRange{singleRange{'0', '9'}, singleRange{'A', 'Z'}, singleRange{'a', 'z'}} + case "alpha": + rs = []singleRange{singleRange{'A', 'Z'}, singleRange{'a', 'z'}} + case "ascii": + rs = []singleRange{singleRange{0, 0x7f}} + case "blank": + rs = []singleRange{singleRange{'\t', '\t'}, singleRange{' ', ' '}} + case "cntrl": + rs = []singleRange{singleRange{0, 0x1f}, singleRange{0x7f, 0x7f}} + case "digit": + c.addDigit(false, negate, "") + case "graph": + rs = []singleRange{singleRange{'!', '~'}} + case "lower": + rs = []singleRange{singleRange{'a', 'z'}} + case "print": + rs = []singleRange{singleRange{' ', '~'}} + case "punct": //[!-/:-@[-`{-~] + rs = []singleRange{singleRange{'!', '/'}, singleRange{':', '@'}, singleRange{'[', '`'}, singleRange{'{', '~'}} + case "space": + c.addSpace(true, false, negate) + case "upper": + rs = []singleRange{singleRange{'A', 'Z'}} + case "word": + c.addWord(true, negate) + case "xdigit": + rs = []singleRange{singleRange{'0', '9'}, singleRange{'A', 'F'}, singleRange{'a', 'f'}} + default: + return false + } + + if len(rs) > 0 { + if negate { + c.addNegativeRanges(rs) + } else { + c.addRanges(rs) + } + } + + return true +} + +type singleRangeSorter []singleRange + +func (p singleRangeSorter) Len() int { return len(p) } +func (p singleRangeSorter) Less(i, j int) bool { return p[i].first < p[j].first } +func (p singleRangeSorter) Swap(i, j int) { p[i], p[j] = p[j], p[i] } + +// Logic to reduce a character class to a unique, sorted form. +func (c *CharSet) canonicalize() { + var i, j int + var last rune + + // + // Find and eliminate overlapping or abutting ranges + // + + if len(c.ranges) > 1 { + sort.Sort(singleRangeSorter(c.ranges)) + + done := false + + for i, j = 1, 0; ; i++ { + for last = c.ranges[j].last; ; i++ { + if i == len(c.ranges) || last == utf8.MaxRune { + done = true + break + } + + CurrentRange := c.ranges[i] + if CurrentRange.first > last+1 { + break + } + + if last < CurrentRange.last { + last = CurrentRange.last + } + } + + c.ranges[j] = singleRange{first: c.ranges[j].first, last: last} + + j++ + + if done { + break + } + + if j < i { + c.ranges[j] = c.ranges[i] + } + } + + c.ranges = append(c.ranges[:j], c.ranges[len(c.ranges):]...) + } +} + +// Adds to the class any lowercase versions of characters already +// in the class. Used for case-insensitivity. +func (c *CharSet) addLowercase() { + if c.anything { + return + } + toAdd := []singleRange{} + for i := 0; i < len(c.ranges); i++ { + r := c.ranges[i] + if r.first == r.last { + lower := unicode.ToLower(r.first) + c.ranges[i] = singleRange{first: lower, last: lower} + } else { + toAdd = append(toAdd, r) + } + } + + for _, r := range toAdd { + c.addLowercaseRange(r.first, r.last) + } + c.canonicalize() +} + +/************************************************************************** + Let U be the set of Unicode character values and let L be the lowercase + function, mapping from U to U. To perform case insensitive matching of + character sets, we need to be able to map an interval I in U, say + + I = [chMin, chMax] = { ch : chMin <= ch <= chMax } + + to a set A such that A contains L(I) and A is contained in the union of + I and L(I). + + The table below partitions U into intervals on which L is non-decreasing. + Thus, for any interval J = [a, b] contained in one of these intervals, + L(J) is contained in [L(a), L(b)]. + + It is also true that for any such J, [L(a), L(b)] is contained in the + union of J and L(J). This does not follow from L being non-decreasing on + these intervals. It follows from the nature of the L on each interval. + On each interval, L has one of the following forms: + + (1) L(ch) = constant (LowercaseSet) + (2) L(ch) = ch + offset (LowercaseAdd) + (3) L(ch) = ch | 1 (LowercaseBor) + (4) L(ch) = ch + (ch & 1) (LowercaseBad) + + It is easy to verify that for any of these forms [L(a), L(b)] is + contained in the union of [a, b] and L([a, b]). +***************************************************************************/ + +const ( + LowercaseSet = 0 // Set to arg. + LowercaseAdd = 1 // Add arg. + LowercaseBor = 2 // Bitwise or with 1. + LowercaseBad = 3 // Bitwise and with 1 and add original. +) + +type lcMap struct { + chMin, chMax rune + op, data int32 +} + +var lcTable = []lcMap{ + lcMap{'\u0041', '\u005A', LowercaseAdd, 32}, + lcMap{'\u00C0', '\u00DE', LowercaseAdd, 32}, + lcMap{'\u0100', '\u012E', LowercaseBor, 0}, + lcMap{'\u0130', '\u0130', LowercaseSet, 0x0069}, + lcMap{'\u0132', '\u0136', LowercaseBor, 0}, + lcMap{'\u0139', '\u0147', LowercaseBad, 0}, + lcMap{'\u014A', '\u0176', LowercaseBor, 0}, + lcMap{'\u0178', '\u0178', LowercaseSet, 0x00FF}, + lcMap{'\u0179', '\u017D', LowercaseBad, 0}, + lcMap{'\u0181', '\u0181', LowercaseSet, 0x0253}, + lcMap{'\u0182', '\u0184', LowercaseBor, 0}, + lcMap{'\u0186', '\u0186', LowercaseSet, 0x0254}, + lcMap{'\u0187', '\u0187', LowercaseSet, 0x0188}, + lcMap{'\u0189', '\u018A', LowercaseAdd, 205}, + lcMap{'\u018B', '\u018B', LowercaseSet, 0x018C}, + lcMap{'\u018E', '\u018E', LowercaseSet, 0x01DD}, + lcMap{'\u018F', '\u018F', LowercaseSet, 0x0259}, + lcMap{'\u0190', '\u0190', LowercaseSet, 0x025B}, + lcMap{'\u0191', '\u0191', LowercaseSet, 0x0192}, + lcMap{'\u0193', '\u0193', LowercaseSet, 0x0260}, + lcMap{'\u0194', '\u0194', LowercaseSet, 0x0263}, + lcMap{'\u0196', '\u0196', LowercaseSet, 0x0269}, + lcMap{'\u0197', '\u0197', LowercaseSet, 0x0268}, + lcMap{'\u0198', '\u0198', LowercaseSet, 0x0199}, + lcMap{'\u019C', '\u019C', LowercaseSet, 0x026F}, + lcMap{'\u019D', '\u019D', LowercaseSet, 0x0272}, + lcMap{'\u019F', '\u019F', LowercaseSet, 0x0275}, + lcMap{'\u01A0', '\u01A4', LowercaseBor, 0}, + lcMap{'\u01A7', '\u01A7', LowercaseSet, 0x01A8}, + lcMap{'\u01A9', '\u01A9', LowercaseSet, 0x0283}, + lcMap{'\u01AC', '\u01AC', LowercaseSet, 0x01AD}, + lcMap{'\u01AE', '\u01AE', LowercaseSet, 0x0288}, + lcMap{'\u01AF', '\u01AF', LowercaseSet, 0x01B0}, + lcMap{'\u01B1', '\u01B2', LowercaseAdd, 217}, + lcMap{'\u01B3', '\u01B5', LowercaseBad, 0}, + lcMap{'\u01B7', '\u01B7', LowercaseSet, 0x0292}, + lcMap{'\u01B8', '\u01B8', LowercaseSet, 0x01B9}, + lcMap{'\u01BC', '\u01BC', LowercaseSet, 0x01BD}, + lcMap{'\u01C4', '\u01C5', LowercaseSet, 0x01C6}, + lcMap{'\u01C7', '\u01C8', LowercaseSet, 0x01C9}, + lcMap{'\u01CA', '\u01CB', LowercaseSet, 0x01CC}, + lcMap{'\u01CD', '\u01DB', LowercaseBad, 0}, + lcMap{'\u01DE', '\u01EE', LowercaseBor, 0}, + lcMap{'\u01F1', '\u01F2', LowercaseSet, 0x01F3}, + lcMap{'\u01F4', '\u01F4', LowercaseSet, 0x01F5}, + lcMap{'\u01FA', '\u0216', LowercaseBor, 0}, + lcMap{'\u0386', '\u0386', LowercaseSet, 0x03AC}, + lcMap{'\u0388', '\u038A', LowercaseAdd, 37}, + lcMap{'\u038C', '\u038C', LowercaseSet, 0x03CC}, + lcMap{'\u038E', '\u038F', LowercaseAdd, 63}, + lcMap{'\u0391', '\u03AB', LowercaseAdd, 32}, + lcMap{'\u03E2', '\u03EE', LowercaseBor, 0}, + lcMap{'\u0401', '\u040F', LowercaseAdd, 80}, + lcMap{'\u0410', '\u042F', LowercaseAdd, 32}, + lcMap{'\u0460', '\u0480', LowercaseBor, 0}, + lcMap{'\u0490', '\u04BE', LowercaseBor, 0}, + lcMap{'\u04C1', '\u04C3', LowercaseBad, 0}, + lcMap{'\u04C7', '\u04C7', LowercaseSet, 0x04C8}, + lcMap{'\u04CB', '\u04CB', LowercaseSet, 0x04CC}, + lcMap{'\u04D0', '\u04EA', LowercaseBor, 0}, + lcMap{'\u04EE', '\u04F4', LowercaseBor, 0}, + lcMap{'\u04F8', '\u04F8', LowercaseSet, 0x04F9}, + lcMap{'\u0531', '\u0556', LowercaseAdd, 48}, + lcMap{'\u10A0', '\u10C5', LowercaseAdd, 48}, + lcMap{'\u1E00', '\u1EF8', LowercaseBor, 0}, + lcMap{'\u1F08', '\u1F0F', LowercaseAdd, -8}, + lcMap{'\u1F18', '\u1F1F', LowercaseAdd, -8}, + lcMap{'\u1F28', '\u1F2F', LowercaseAdd, -8}, + lcMap{'\u1F38', '\u1F3F', LowercaseAdd, -8}, + lcMap{'\u1F48', '\u1F4D', LowercaseAdd, -8}, + lcMap{'\u1F59', '\u1F59', LowercaseSet, 0x1F51}, + lcMap{'\u1F5B', '\u1F5B', LowercaseSet, 0x1F53}, + lcMap{'\u1F5D', '\u1F5D', LowercaseSet, 0x1F55}, + lcMap{'\u1F5F', '\u1F5F', LowercaseSet, 0x1F57}, + lcMap{'\u1F68', '\u1F6F', LowercaseAdd, -8}, + lcMap{'\u1F88', '\u1F8F', LowercaseAdd, -8}, + lcMap{'\u1F98', '\u1F9F', LowercaseAdd, -8}, + lcMap{'\u1FA8', '\u1FAF', LowercaseAdd, -8}, + lcMap{'\u1FB8', '\u1FB9', LowercaseAdd, -8}, + lcMap{'\u1FBA', '\u1FBB', LowercaseAdd, -74}, + lcMap{'\u1FBC', '\u1FBC', LowercaseSet, 0x1FB3}, + lcMap{'\u1FC8', '\u1FCB', LowercaseAdd, -86}, + lcMap{'\u1FCC', '\u1FCC', LowercaseSet, 0x1FC3}, + lcMap{'\u1FD8', '\u1FD9', LowercaseAdd, -8}, + lcMap{'\u1FDA', '\u1FDB', LowercaseAdd, -100}, + lcMap{'\u1FE8', '\u1FE9', LowercaseAdd, -8}, + lcMap{'\u1FEA', '\u1FEB', LowercaseAdd, -112}, + lcMap{'\u1FEC', '\u1FEC', LowercaseSet, 0x1FE5}, + lcMap{'\u1FF8', '\u1FF9', LowercaseAdd, -128}, + lcMap{'\u1FFA', '\u1FFB', LowercaseAdd, -126}, + lcMap{'\u1FFC', '\u1FFC', LowercaseSet, 0x1FF3}, + lcMap{'\u2160', '\u216F', LowercaseAdd, 16}, + lcMap{'\u24B6', '\u24D0', LowercaseAdd, 26}, + lcMap{'\uFF21', '\uFF3A', LowercaseAdd, 32}, +} + +func (c *CharSet) addLowercaseRange(chMin, chMax rune) { + var i, iMax, iMid int + var chMinT, chMaxT rune + var lc lcMap + + for i, iMax = 0, len(lcTable); i < iMax; { + iMid = (i + iMax) / 2 + if lcTable[iMid].chMax < chMin { + i = iMid + 1 + } else { + iMax = iMid + } + } + + for ; i < len(lcTable); i++ { + lc = lcTable[i] + if lc.chMin > chMax { + return + } + chMinT = lc.chMin + if chMinT < chMin { + chMinT = chMin + } + + chMaxT = lc.chMax + if chMaxT > chMax { + chMaxT = chMax + } + + switch lc.op { + case LowercaseSet: + chMinT = rune(lc.data) + chMaxT = rune(lc.data) + break + case LowercaseAdd: + chMinT += lc.data + chMaxT += lc.data + break + case LowercaseBor: + chMinT |= 1 + chMaxT |= 1 + break + case LowercaseBad: + chMinT += (chMinT & 1) + chMaxT += (chMaxT & 1) + break + } + + if chMinT < chMin || chMaxT > chMax { + c.addRange(chMinT, chMaxT) + } + } +} diff --git a/vendor/github.com/dlclark/regexp2/syntax/code.go b/vendor/github.com/dlclark/regexp2/syntax/code.go new file mode 100644 index 0000000..686e822 --- /dev/null +++ b/vendor/github.com/dlclark/regexp2/syntax/code.go @@ -0,0 +1,274 @@ +package syntax + +import ( + "bytes" + "fmt" + "math" +) + +// similar to prog.go in the go regex package...also with comment 'may not belong in this package' + +// File provides operator constants for use by the Builder and the Machine. + +// Implementation notes: +// +// Regexps are built into RegexCodes, which contain an operation array, +// a string table, and some constants. +// +// Each operation is one of the codes below, followed by the integer +// operands specified for each op. +// +// Strings and sets are indices into a string table. + +type InstOp int + +const ( + // lef/back operands description + + Onerep InstOp = 0 // lef,back char,min,max a {n} + Notonerep = 1 // lef,back char,min,max .{n} + Setrep = 2 // lef,back set,min,max [\d]{n} + + Oneloop = 3 // lef,back char,min,max a {,n} + Notoneloop = 4 // lef,back char,min,max .{,n} + Setloop = 5 // lef,back set,min,max [\d]{,n} + + Onelazy = 6 // lef,back char,min,max a {,n}? + Notonelazy = 7 // lef,back char,min,max .{,n}? + Setlazy = 8 // lef,back set,min,max [\d]{,n}? + + One = 9 // lef char a + Notone = 10 // lef char [^a] + Set = 11 // lef set [a-z\s] \w \s \d + + Multi = 12 // lef string abcd + Ref = 13 // lef group \# + + Bol = 14 // ^ + Eol = 15 // $ + Boundary = 16 // \b + Nonboundary = 17 // \B + Beginning = 18 // \A + Start = 19 // \G + EndZ = 20 // \Z + End = 21 // \Z + + Nothing = 22 // Reject! + + // Primitive control structures + + Lazybranch = 23 // back jump straight first + Branchmark = 24 // back jump branch first for loop + Lazybranchmark = 25 // back jump straight first for loop + Nullcount = 26 // back val set counter, null mark + Setcount = 27 // back val set counter, make mark + Branchcount = 28 // back jump,limit branch++ if zero<=c<limit + Lazybranchcount = 29 // back jump,limit same, but straight first + Nullmark = 30 // back save position + Setmark = 31 // back save position + Capturemark = 32 // back group define group + Getmark = 33 // back recall position + Setjump = 34 // back save backtrack state + Backjump = 35 // zap back to saved state + Forejump = 36 // zap backtracking state + Testref = 37 // backtrack if ref undefined + Goto = 38 // jump just go + + Prune = 39 // prune it baby + Stop = 40 // done! + + ECMABoundary = 41 // \b + NonECMABoundary = 42 // \B + + // Modifiers for alternate modes + + Mask = 63 // Mask to get unmodified ordinary operator + Rtl = 64 // bit to indicate that we're reverse scanning. + Back = 128 // bit to indicate that we're backtracking. + Back2 = 256 // bit to indicate that we're backtracking on a second branch. + Ci = 512 // bit to indicate that we're case-insensitive. +) + +type Code struct { + Codes []int // the code + Strings [][]rune // string table + Sets []*CharSet //character set table + TrackCount int // how many instructions use backtracking + Caps map[int]int // mapping of user group numbers -> impl group slots + Capsize int // number of impl group slots + FcPrefix *Prefix // the set of candidate first characters (may be null) + BmPrefix *BmPrefix // the fixed prefix string as a Boyer-Moore machine (may be null) + Anchors AnchorLoc // the set of zero-length start anchors (RegexFCD.Bol, etc) + RightToLeft bool // true if right to left +} + +func opcodeBacktracks(op InstOp) bool { + op &= Mask + + switch op { + case Oneloop, Notoneloop, Setloop, Onelazy, Notonelazy, Setlazy, Lazybranch, Branchmark, Lazybranchmark, + Nullcount, Setcount, Branchcount, Lazybranchcount, Setmark, Capturemark, Getmark, Setjump, Backjump, + Forejump, Goto: + return true + + default: + return false + } +} + +func opcodeSize(op InstOp) int { + op &= Mask + + switch op { + case Nothing, Bol, Eol, Boundary, Nonboundary, ECMABoundary, NonECMABoundary, Beginning, Start, EndZ, + End, Nullmark, Setmark, Getmark, Setjump, Backjump, Forejump, Stop: + return 1 + + case One, Notone, Multi, Ref, Testref, Goto, Nullcount, Setcount, Lazybranch, Branchmark, Lazybranchmark, + Prune, Set: + return 2 + + case Capturemark, Branchcount, Lazybranchcount, Onerep, Notonerep, Oneloop, Notoneloop, Onelazy, Notonelazy, + Setlazy, Setrep, Setloop: + return 3 + + default: + panic(fmt.Errorf("Unexpected op code: %v", op)) + } +} + +var codeStr = []string{ + "Onerep", "Notonerep", "Setrep", + "Oneloop", "Notoneloop", "Setloop", + "Onelazy", "Notonelazy", "Setlazy", + "One", "Notone", "Set", + "Multi", "Ref", + "Bol", "Eol", "Boundary", "Nonboundary", "Beginning", "Start", "EndZ", "End", + "Nothing", + "Lazybranch", "Branchmark", "Lazybranchmark", + "Nullcount", "Setcount", "Branchcount", "Lazybranchcount", + "Nullmark", "Setmark", "Capturemark", "Getmark", + "Setjump", "Backjump", "Forejump", "Testref", "Goto", + "Prune", "Stop", + "ECMABoundary", "NonECMABoundary", +} + +func operatorDescription(op InstOp) string { + desc := codeStr[op&Mask] + if (op & Ci) != 0 { + desc += "-Ci" + } + if (op & Rtl) != 0 { + desc += "-Rtl" + } + if (op & Back) != 0 { + desc += "-Back" + } + if (op & Back2) != 0 { + desc += "-Back2" + } + + return desc +} + +// OpcodeDescription is a humman readable string of the specific offset +func (c *Code) OpcodeDescription(offset int) string { + buf := &bytes.Buffer{} + + op := InstOp(c.Codes[offset]) + fmt.Fprintf(buf, "%06d ", offset) + + if opcodeBacktracks(op & Mask) { + buf.WriteString("*") + } else { + buf.WriteString(" ") + } + buf.WriteString(operatorDescription(op)) + buf.WriteString("(") + op &= Mask + + switch op { + case One, Notone, Onerep, Notonerep, Oneloop, Notoneloop, Onelazy, Notonelazy: + buf.WriteString("Ch = ") + buf.WriteString(CharDescription(rune(c.Codes[offset+1]))) + + case Set, Setrep, Setloop, Setlazy: + buf.WriteString("Set = ") + buf.WriteString(c.Sets[c.Codes[offset+1]].String()) + + case Multi: + fmt.Fprintf(buf, "String = %s", string(c.Strings[c.Codes[offset+1]])) + + case Ref, Testref: + fmt.Fprintf(buf, "Index = %d", c.Codes[offset+1]) + + case Capturemark: + fmt.Fprintf(buf, "Index = %d", c.Codes[offset+1]) + if c.Codes[offset+2] != -1 { + fmt.Fprintf(buf, ", Unindex = %d", c.Codes[offset+2]) + } + + case Nullcount, Setcount: + fmt.Fprintf(buf, "Value = %d", c.Codes[offset+1]) + + case Goto, Lazybranch, Branchmark, Lazybranchmark, Branchcount, Lazybranchcount: + fmt.Fprintf(buf, "Addr = %d", c.Codes[offset+1]) + } + + switch op { + case Onerep, Notonerep, Oneloop, Notoneloop, Onelazy, Notonelazy, Setrep, Setloop, Setlazy: + buf.WriteString(", Rep = ") + if c.Codes[offset+2] == math.MaxInt32 { + buf.WriteString("inf") + } else { + fmt.Fprintf(buf, "%d", c.Codes[offset+2]) + } + + case Branchcount, Lazybranchcount: + buf.WriteString(", Limit = ") + if c.Codes[offset+2] == math.MaxInt32 { + buf.WriteString("inf") + } else { + fmt.Fprintf(buf, "%d", c.Codes[offset+2]) + } + + } + + buf.WriteString(")") + + return buf.String() +} + +func (c *Code) Dump() string { + buf := &bytes.Buffer{} + + if c.RightToLeft { + fmt.Fprintln(buf, "Direction: right-to-left") + } else { + fmt.Fprintln(buf, "Direction: left-to-right") + } + if c.FcPrefix == nil { + fmt.Fprintln(buf, "Firstchars: n/a") + } else { + fmt.Fprintf(buf, "Firstchars: %v\n", c.FcPrefix.PrefixSet.String()) + } + + if c.BmPrefix == nil { + fmt.Fprintln(buf, "Prefix: n/a") + } else { + fmt.Fprintf(buf, "Prefix: %v\n", Escape(c.BmPrefix.String())) + } + + fmt.Fprintf(buf, "Anchors: %v\n", c.Anchors) + fmt.Fprintln(buf) + + if c.BmPrefix != nil { + fmt.Fprintln(buf, "BoyerMoore:") + fmt.Fprintln(buf, c.BmPrefix.Dump(" ")) + } + for i := 0; i < len(c.Codes); i += opcodeSize(InstOp(c.Codes[i])) { + fmt.Fprintln(buf, c.OpcodeDescription(i)) + } + + return buf.String() +} diff --git a/vendor/github.com/dlclark/regexp2/syntax/escape.go b/vendor/github.com/dlclark/regexp2/syntax/escape.go new file mode 100644 index 0000000..609df10 --- /dev/null +++ b/vendor/github.com/dlclark/regexp2/syntax/escape.go @@ -0,0 +1,94 @@ +package syntax + +import ( + "bytes" + "strconv" + "strings" + "unicode" +) + +func Escape(input string) string { + b := &bytes.Buffer{} + for _, r := range input { + escape(b, r, false) + } + return b.String() +} + +const meta = `\.+*?()|[]{}^$# ` + +func escape(b *bytes.Buffer, r rune, force bool) { + if unicode.IsPrint(r) { + if strings.IndexRune(meta, r) >= 0 || force { + b.WriteRune('\\') + } + b.WriteRune(r) + return + } + + switch r { + case '\a': + b.WriteString(`\a`) + case '\f': + b.WriteString(`\f`) + case '\n': + b.WriteString(`\n`) + case '\r': + b.WriteString(`\r`) + case '\t': + b.WriteString(`\t`) + case '\v': + b.WriteString(`\v`) + default: + if r < 0x100 { + b.WriteString(`\x`) + s := strconv.FormatInt(int64(r), 16) + if len(s) == 1 { + b.WriteRune('0') + } + b.WriteString(s) + break + } + b.WriteString(`\u`) + b.WriteString(strconv.FormatInt(int64(r), 16)) + } +} + +func Unescape(input string) (string, error) { + idx := strings.IndexRune(input, '\\') + // no slashes means no unescape needed + if idx == -1 { + return input, nil + } + + buf := bytes.NewBufferString(input[:idx]) + // get the runes for the rest of the string -- we're going full parser scan on this + + p := parser{} + p.setPattern(input[idx+1:]) + for { + if p.rightMost() { + return "", p.getErr(ErrIllegalEndEscape) + } + r, err := p.scanCharEscape() + if err != nil { + return "", err + } + buf.WriteRune(r) + // are we done? + if p.rightMost() { + return buf.String(), nil + } + + r = p.moveRightGetChar() + for r != '\\' { + buf.WriteRune(r) + if p.rightMost() { + // we're done, no more slashes + return buf.String(), nil + } + // keep scanning until we get another slash + r = p.moveRightGetChar() + } + } +} diff --git a/vendor/github.com/dlclark/regexp2/syntax/fuzz.go b/vendor/github.com/dlclark/regexp2/syntax/fuzz.go new file mode 100644 index 0000000..ee86386 --- /dev/null +++ b/vendor/github.com/dlclark/regexp2/syntax/fuzz.go @@ -0,0 +1,20 @@ +// +build gofuzz + +package syntax + +// Fuzz is the input point for go-fuzz +func Fuzz(data []byte) int { + sdata := string(data) + tree, err := Parse(sdata, RegexOptions(0)) + if err != nil { + return 0 + } + + // translate it to code + _, err = Write(tree) + if err != nil { + panic(err) + } + + return 1 +} diff --git a/vendor/github.com/dlclark/regexp2/syntax/parser.go b/vendor/github.com/dlclark/regexp2/syntax/parser.go new file mode 100644 index 0000000..9dc6e31 --- /dev/null +++ b/vendor/github.com/dlclark/regexp2/syntax/parser.go @@ -0,0 +1,2251 @@ +package syntax + +import ( + "fmt" + "math" + "os" + "sort" + "strconv" + "unicode" +) + +type RegexOptions int32 + +const ( + IgnoreCase RegexOptions = 0x0001 // "i" + Multiline = 0x0002 // "m" + ExplicitCapture = 0x0004 // "n" + Compiled = 0x0008 // "c" + Singleline = 0x0010 // "s" + IgnorePatternWhitespace = 0x0020 // "x" + RightToLeft = 0x0040 // "r" + Debug = 0x0080 // "d" + ECMAScript = 0x0100 // "e" + RE2 = 0x0200 // RE2 compat mode + Unicode = 0x0400 // "u" +) + +func optionFromCode(ch rune) RegexOptions { + // case-insensitive + switch ch { + case 'i', 'I': + return IgnoreCase + case 'r', 'R': + return RightToLeft + case 'm', 'M': + return Multiline + case 'n', 'N': + return ExplicitCapture + case 's', 'S': + return Singleline + case 'x', 'X': + return IgnorePatternWhitespace + case 'd', 'D': + return Debug + case 'e', 'E': + return ECMAScript + case 'u', 'U': + return Unicode + default: + return 0 + } +} + +// An Error describes a failure to parse a regular expression +// and gives the offending expression. +type Error struct { + Code ErrorCode + Expr string + Args []interface{} +} + +func (e *Error) Error() string { + if len(e.Args) == 0 { + return "error parsing regexp: " + e.Code.String() + " in `" + e.Expr + "`" + } + return "error parsing regexp: " + fmt.Sprintf(e.Code.String(), e.Args...) + " in `" + e.Expr + "`" +} + +// An ErrorCode describes a failure to parse a regular expression. +type ErrorCode string + +const ( + // internal issue + ErrInternalError ErrorCode = "regexp/syntax: internal error" + // Parser errors + ErrUnterminatedComment = "unterminated comment" + ErrInvalidCharRange = "invalid character class range" + ErrInvalidRepeatSize = "invalid repeat count" + ErrInvalidUTF8 = "invalid UTF-8" + ErrCaptureGroupOutOfRange = "capture group number out of range" + ErrUnexpectedParen = "unexpected )" + ErrMissingParen = "missing closing )" + ErrMissingBrace = "missing closing }" + ErrInvalidRepeatOp = "invalid nested repetition operator" + ErrMissingRepeatArgument = "missing argument to repetition operator" + ErrConditionalExpression = "illegal conditional (?(...)) expression" + ErrTooManyAlternates = "too many | in (?()|)" + ErrUnrecognizedGrouping = "unrecognized grouping construct: (%v" + ErrInvalidGroupName = "invalid group name: group names must begin with a word character and have a matching terminator" + ErrCapNumNotZero = "capture number cannot be zero" + ErrUndefinedBackRef = "reference to undefined group number %v" + ErrUndefinedNameRef = "reference to undefined group name %v" + ErrAlternationCantCapture = "alternation conditions do not capture and cannot be named" + ErrAlternationCantHaveComment = "alternation conditions cannot be comments" + ErrMalformedReference = "(?(%v) ) malformed" + ErrUndefinedReference = "(?(%v) ) reference to undefined group" + ErrIllegalEndEscape = "illegal \\ at end of pattern" + ErrMalformedSlashP = "malformed \\p{X} character escape" + ErrIncompleteSlashP = "incomplete \\p{X} character escape" + ErrUnknownSlashP = "unknown unicode category, script, or property '%v'" + ErrUnrecognizedEscape = "unrecognized escape sequence \\%v" + ErrMissingControl = "missing control character" + ErrUnrecognizedControl = "unrecognized control character" + ErrTooFewHex = "insufficient hexadecimal digits" + ErrInvalidHex = "hex values may not be larger than 0x10FFFF" + ErrMalformedNameRef = "malformed \\k<...> named back reference" + ErrBadClassInCharRange = "cannot include class \\%v in character range" + ErrUnterminatedBracket = "unterminated [] set" + ErrSubtractionMustBeLast = "a subtraction must be the last element in a character class" + ErrReversedCharRange = "[%c-%c] range in reverse order" +) + +func (e ErrorCode) String() string { + return string(e) +} + +type parser struct { + stack *regexNode + group *regexNode + alternation *regexNode + concatenation *regexNode + unit *regexNode + + patternRaw string + pattern []rune + + currentPos int + specialCase *unicode.SpecialCase + + autocap int + capcount int + captop int + capsize int + + caps map[int]int + capnames map[string]int + + capnumlist []int + capnamelist []string + + options RegexOptions + optionsStack []RegexOptions + ignoreNextParen bool +} + +const ( + maxValueDiv10 int = math.MaxInt32 / 10 + maxValueMod10 = math.MaxInt32 % 10 +) + +// Parse converts a regex string into a parse tree +func Parse(re string, op RegexOptions) (*RegexTree, error) { + p := parser{ + options: op, + caps: make(map[int]int), + } + p.setPattern(re) + + if err := p.countCaptures(); err != nil { + return nil, err + } + + p.reset(op) + root, err := p.scanRegex() + + if err != nil { + return nil, err + } + tree := &RegexTree{ + root: root, + caps: p.caps, + capnumlist: p.capnumlist, + captop: p.captop, + Capnames: p.capnames, + Caplist: p.capnamelist, + options: op, + } + + if tree.options&Debug > 0 { + os.Stdout.WriteString(tree.Dump()) + } + + return tree, nil +} + +func (p *parser) setPattern(pattern string) { + p.patternRaw = pattern + p.pattern = make([]rune, 0, len(pattern)) + + //populate our rune array to handle utf8 encoding + for _, r := range pattern { + p.pattern = append(p.pattern, r) + } +} +func (p *parser) getErr(code ErrorCode, args ...interface{}) error { + return &Error{Code: code, Expr: p.patternRaw, Args: args} +} + +func (p *parser) noteCaptureSlot(i, pos int) { + if _, ok := p.caps[i]; !ok { + // the rhs of the hashtable isn't used in the parser + p.caps[i] = pos + p.capcount++ + + if p.captop <= i { + if i == math.MaxInt32 { + p.captop = i + } else { + p.captop = i + 1 + } + } + } +} + +func (p *parser) noteCaptureName(name string, pos int) { + if p.capnames == nil { + p.capnames = make(map[string]int) + } + + if _, ok := p.capnames[name]; !ok { + p.capnames[name] = pos + p.capnamelist = append(p.capnamelist, name) + } +} + +func (p *parser) assignNameSlots() { + if p.capnames != nil { + for _, name := range p.capnamelist { + for p.isCaptureSlot(p.autocap) { + p.autocap++ + } + pos := p.capnames[name] + p.capnames[name] = p.autocap + p.noteCaptureSlot(p.autocap, pos) + + p.autocap++ + } + } + + // if the caps array has at least one gap, construct the list of used slots + if p.capcount < p.captop { + p.capnumlist = make([]int, p.capcount) + i := 0 + + for k := range p.caps { + p.capnumlist[i] = k + i++ + } + + sort.Ints(p.capnumlist) + } + + // merge capsnumlist into capnamelist + if p.capnames != nil || p.capnumlist != nil { + var oldcapnamelist []string + var next int + var k int + + if p.capnames == nil { + oldcapnamelist = nil + p.capnames = make(map[string]int) + p.capnamelist = []string{} + next = -1 + } else { + oldcapnamelist = p.capnamelist + p.capnamelist = []string{} + next = p.capnames[oldcapnamelist[0]] + } + + for i := 0; i < p.capcount; i++ { + j := i + if p.capnumlist != nil { + j = p.capnumlist[i] + } + + if next == j { + p.capnamelist = append(p.capnamelist, oldcapnamelist[k]) + k++ + + if k == len(oldcapnamelist) { + next = -1 + } else { + next = p.capnames[oldcapnamelist[k]] + } + + } else { + //feature: culture? + str := strconv.Itoa(j) + p.capnamelist = append(p.capnamelist, str) + p.capnames[str] = j + } + } + } +} + +func (p *parser) consumeAutocap() int { + r := p.autocap + p.autocap++ + return r +} + +// CountCaptures is a prescanner for deducing the slots used for +// captures by doing a partial tokenization of the pattern. +func (p *parser) countCaptures() error { + var ch rune + + p.noteCaptureSlot(0, 0) + + p.autocap = 1 + + for p.charsRight() > 0 { + pos := p.textpos() + ch = p.moveRightGetChar() + switch ch { + case '\\': + if p.charsRight() > 0 { + p.scanBackslash(true) + } + + case '#': + if p.useOptionX() { + p.moveLeft() + p.scanBlank() + } + + case '[': + p.scanCharSet(false, true) + + case ')': + if !p.emptyOptionsStack() { + p.popOptions() + } + + case '(': + if p.charsRight() >= 2 && p.rightChar(1) == '#' && p.rightChar(0) == '?' { + p.moveLeft() + p.scanBlank() + } else { + p.pushOptions() + if p.charsRight() > 0 && p.rightChar(0) == '?' { + // we have (?... + p.moveRight(1) + + if p.charsRight() > 1 && (p.rightChar(0) == '<' || p.rightChar(0) == '\'') { + // named group: (?<... or (?'... + + p.moveRight(1) + ch = p.rightChar(0) + + if ch != '0' && IsWordChar(ch) { + if ch >= '1' && ch <= '9' { + dec, err := p.scanDecimal() + if err != nil { + return err + } + p.noteCaptureSlot(dec, pos) + } else { + p.noteCaptureName(p.scanCapname(), pos) + } + } + } else if p.useRE2() && p.charsRight() > 2 && (p.rightChar(0) == 'P' && p.rightChar(1) == '<') { + // RE2-compat (?P<) + p.moveRight(2) + ch = p.rightChar(0) + if IsWordChar(ch) { + p.noteCaptureName(p.scanCapname(), pos) + } + + } else { + // (?... + + // get the options if it's an option construct (?cimsx-cimsx...) + p.scanOptions() + + if p.charsRight() > 0 { + if p.rightChar(0) == ')' { + // (?cimsx-cimsx) + p.moveRight(1) + p.popKeepOptions() + } else if p.rightChar(0) == '(' { + // alternation construct: (?(foo)yes|no) + // ignore the next paren so we don't capture the condition + p.ignoreNextParen = true + + // break from here so we don't reset ignoreNextParen + continue + } + } + } + } else { + if !p.useOptionN() && !p.ignoreNextParen { + p.noteCaptureSlot(p.consumeAutocap(), pos) + } + } + } + + p.ignoreNextParen = false + + } + } + + p.assignNameSlots() + return nil +} + +func (p *parser) reset(topopts RegexOptions) { + p.currentPos = 0 + p.autocap = 1 + p.ignoreNextParen = false + + if len(p.optionsStack) > 0 { + p.optionsStack = p.optionsStack[:0] + } + + p.options = topopts + p.stack = nil +} + +func (p *parser) scanRegex() (*regexNode, error) { + ch := '@' // nonspecial ch, means at beginning + isQuant := false + + p.startGroup(newRegexNodeMN(ntCapture, p.options, 0, -1)) + + for p.charsRight() > 0 { + wasPrevQuantifier := isQuant + isQuant = false + + if err := p.scanBlank(); err != nil { + return nil, err + } + + startpos := p.textpos() + + // move past all of the normal characters. We'll stop when we hit some kind of control character, + // or if IgnorePatternWhiteSpace is on, we'll stop when we see some whitespace. + if p.useOptionX() { + for p.charsRight() > 0 { + ch = p.rightChar(0) + //UGLY: clean up, this is ugly + if !(!isStopperX(ch) || (ch == '{' && !p.isTrueQuantifier())) { + break + } + p.moveRight(1) + } + } else { + for p.charsRight() > 0 { + ch = p.rightChar(0) + if !(!isSpecial(ch) || ch == '{' && !p.isTrueQuantifier()) { + break + } + p.moveRight(1) + } + } + + endpos := p.textpos() + + p.scanBlank() + + if p.charsRight() == 0 { + ch = '!' // nonspecial, means at end + } else if ch = p.rightChar(0); isSpecial(ch) { + isQuant = isQuantifier(ch) + p.moveRight(1) + } else { + ch = ' ' // nonspecial, means at ordinary char + } + + if startpos < endpos { + cchUnquantified := endpos - startpos + if isQuant { + cchUnquantified-- + } + wasPrevQuantifier = false + + if cchUnquantified > 0 { + p.addToConcatenate(startpos, cchUnquantified, false) + } + + if isQuant { + p.addUnitOne(p.charAt(endpos - 1)) + } + } + + switch ch { + case '!': + goto BreakOuterScan + + case ' ': + goto ContinueOuterScan + + case '[': + cc, err := p.scanCharSet(p.useOptionI(), false) + if err != nil { + return nil, err + } + p.addUnitSet(cc) + + case '(': + p.pushOptions() + + if grouper, err := p.scanGroupOpen(); err != nil { + return nil, err + } else if grouper == nil { + p.popKeepOptions() + } else { + p.pushGroup() + p.startGroup(grouper) + } + + continue + + case '|': + p.addAlternate() + goto ContinueOuterScan + + case ')': + if p.emptyStack() { + return nil, p.getErr(ErrUnexpectedParen) + } + + if err := p.addGroup(); err != nil { + return nil, err + } + if err := p.popGroup(); err != nil { + return nil, err + } + p.popOptions() + + if p.unit == nil { + goto ContinueOuterScan + } + + case '\\': + n, err := p.scanBackslash(false) + if err != nil { + return nil, err + } + p.addUnitNode(n) + + case '^': + if p.useOptionM() { + p.addUnitType(ntBol) + } else { + p.addUnitType(ntBeginning) + } + + case '$': + if p.useOptionM() { + p.addUnitType(ntEol) + } else { + p.addUnitType(ntEndZ) + } + + case '.': + if p.useOptionE() { + p.addUnitSet(ECMAAnyClass()) + } else if p.useOptionS() { + p.addUnitSet(AnyClass()) + } else { + p.addUnitNotone('\n') + } + + case '{', '*', '+', '?': + if p.unit == nil { + if wasPrevQuantifier { + return nil, p.getErr(ErrInvalidRepeatOp) + } else { + return nil, p.getErr(ErrMissingRepeatArgument) + } + } + p.moveLeft() + + default: + return nil, p.getErr(ErrInternalError) + } + + if err := p.scanBlank(); err != nil { + return nil, err + } + + if p.charsRight() > 0 { + isQuant = p.isTrueQuantifier() + } + if p.charsRight() == 0 || !isQuant { + //maintain odd C# assignment order -- not sure if required, could clean up? + p.addConcatenate() + goto ContinueOuterScan + } + + ch = p.moveRightGetChar() + + // Handle quantifiers + for p.unit != nil { + var min, max int + var lazy bool + + switch ch { + case '*': + min = 0 + max = math.MaxInt32 + + case '?': + min = 0 + max = 1 + + case '+': + min = 1 + max = math.MaxInt32 + + case '{': + { + var err error + startpos = p.textpos() + if min, err = p.scanDecimal(); err != nil { + return nil, err + } + max = min + if startpos < p.textpos() { + if p.charsRight() > 0 && p.rightChar(0) == ',' { + p.moveRight(1) + if p.charsRight() == 0 || p.rightChar(0) == '}' { + max = math.MaxInt32 + } else { + if max, err = p.scanDecimal(); err != nil { + return nil, err + } + } + } + } + + if startpos == p.textpos() || p.charsRight() == 0 || p.moveRightGetChar() != '}' { + p.addConcatenate() + p.textto(startpos - 1) + goto ContinueOuterScan + } + } + + default: + return nil, p.getErr(ErrInternalError) + } + + if err := p.scanBlank(); err != nil { + return nil, err + } + + if p.charsRight() == 0 || p.rightChar(0) != '?' { + lazy = false + } else { + p.moveRight(1) + lazy = true + } + + if min > max { + return nil, p.getErr(ErrInvalidRepeatSize) + } + + p.addConcatenate3(lazy, min, max) + } + + ContinueOuterScan: + } + +BreakOuterScan: + ; + + if !p.emptyStack() { + return nil, p.getErr(ErrMissingParen) + } + + if err := p.addGroup(); err != nil { + return nil, err + } + + return p.unit, nil + +} + +/* + * Simple parsing for replacement patterns + */ +func (p *parser) scanReplacement() (*regexNode, error) { + var c, startpos int + + p.concatenation = newRegexNode(ntConcatenate, p.options) + + for { + c = p.charsRight() + if c == 0 { + break + } + + startpos = p.textpos() + + for c > 0 && p.rightChar(0) != '$' { + p.moveRight(1) + c-- + } + + p.addToConcatenate(startpos, p.textpos()-startpos, true) + + if c > 0 { + if p.moveRightGetChar() == '$' { + n, err := p.scanDollar() + if err != nil { + return nil, err + } + p.addUnitNode(n) + } + p.addConcatenate() + } + } + + return p.concatenation, nil +} + +/* + * Scans $ patterns recognized within replacement patterns + */ +func (p *parser) scanDollar() (*regexNode, error) { + if p.charsRight() == 0 { + return newRegexNodeCh(ntOne, p.options, '$'), nil + } + + ch := p.rightChar(0) + angled := false + backpos := p.textpos() + lastEndPos := backpos + + // Note angle + + if ch == '{' && p.charsRight() > 1 { + angled = true + p.moveRight(1) + ch = p.rightChar(0) + } + + // Try to parse backreference: \1 or \{1} or \{cap} + + if ch >= '0' && ch <= '9' { + if !angled && p.useOptionE() { + capnum := -1 + newcapnum := int(ch - '0') + p.moveRight(1) + if p.isCaptureSlot(newcapnum) { + capnum = newcapnum + lastEndPos = p.textpos() + } + + for p.charsRight() > 0 { + ch = p.rightChar(0) + if ch < '0' || ch > '9' { + break + } + digit := int(ch - '0') + if newcapnum > maxValueDiv10 || (newcapnum == maxValueDiv10 && digit > maxValueMod10) { + return nil, p.getErr(ErrCaptureGroupOutOfRange) + } + + newcapnum = newcapnum*10 + digit + + p.moveRight(1) + if p.isCaptureSlot(newcapnum) { + capnum = newcapnum + lastEndPos = p.textpos() + } + } + p.textto(lastEndPos) + if capnum >= 0 { + return newRegexNodeM(ntRef, p.options, capnum), nil + } + } else { + capnum, err := p.scanDecimal() + if err != nil { + return nil, err + } + if !angled || p.charsRight() > 0 && p.moveRightGetChar() == '}' { + if p.isCaptureSlot(capnum) { + return newRegexNodeM(ntRef, p.options, capnum), nil + } + } + } + } else if angled && IsWordChar(ch) { + capname := p.scanCapname() + + if p.charsRight() > 0 && p.moveRightGetChar() == '}' { + if p.isCaptureName(capname) { + return newRegexNodeM(ntRef, p.options, p.captureSlotFromName(capname)), nil + } + } + } else if !angled { + capnum := 1 + + switch ch { + case '$': + p.moveRight(1) + return newRegexNodeCh(ntOne, p.options, '$'), nil + case '&': + capnum = 0 + case '`': + capnum = replaceLeftPortion + case '\'': + capnum = replaceRightPortion + case '+': + capnum = replaceLastGroup + case '_': + capnum = replaceWholeString + } + + if capnum != 1 { + p.moveRight(1) + return newRegexNodeM(ntRef, p.options, capnum), nil + } + } + + // unrecognized $: literalize + + p.textto(backpos) + return newRegexNodeCh(ntOne, p.options, '$'), nil +} + +// scanGroupOpen scans chars following a '(' (not counting the '('), and returns +// a RegexNode for the type of group scanned, or nil if the group +// simply changed options (?cimsx-cimsx) or was a comment (#...). +func (p *parser) scanGroupOpen() (*regexNode, error) { + var ch rune + var nt nodeType + var err error + close := '>' + start := p.textpos() + + // just return a RegexNode if we have: + // 1. "(" followed by nothing + // 2. "(x" where x != ? + // 3. "(?)" + if p.charsRight() == 0 || p.rightChar(0) != '?' || (p.rightChar(0) == '?' && (p.charsRight() > 1 && p.rightChar(1) == ')')) { + if p.useOptionN() || p.ignoreNextParen { + p.ignoreNextParen = false + return newRegexNode(ntGroup, p.options), nil + } + return newRegexNodeMN(ntCapture, p.options, p.consumeAutocap(), -1), nil + } + + p.moveRight(1) + + for { + if p.charsRight() == 0 { + break + } + + switch ch = p.moveRightGetChar(); ch { + case ':': + nt = ntGroup + + case '=': + p.options &= ^RightToLeft + nt = ntRequire + + case '!': + p.options &= ^RightToLeft + nt = ntPrevent + + case '>': + nt = ntGreedy + + case '\'': + close = '\'' + fallthrough + + case '<': + if p.charsRight() == 0 { + goto BreakRecognize + } + + switch ch = p.moveRightGetChar(); ch { + case '=': + if close == '\'' { + goto BreakRecognize + } + + p.options |= RightToLeft + nt = ntRequire + + case '!': + if close == '\'' { + goto BreakRecognize + } + + p.options |= RightToLeft + nt = ntPrevent + + default: + p.moveLeft() + capnum := -1 + uncapnum := -1 + proceed := false + + // grab part before - + + if ch >= '0' && ch <= '9' { + if capnum, err = p.scanDecimal(); err != nil { + return nil, err + } + + if !p.isCaptureSlot(capnum) { + capnum = -1 + } + + // check if we have bogus characters after the number + if p.charsRight() > 0 && !(p.rightChar(0) == close || p.rightChar(0) == '-') { + return nil, p.getErr(ErrInvalidGroupName) + } + if capnum == 0 { + return nil, p.getErr(ErrCapNumNotZero) + } + } else if IsWordChar(ch) { + capname := p.scanCapname() + + if p.isCaptureName(capname) { + capnum = p.captureSlotFromName(capname) + } + + // check if we have bogus character after the name + if p.charsRight() > 0 && !(p.rightChar(0) == close || p.rightChar(0) == '-') { + return nil, p.getErr(ErrInvalidGroupName) + } + } else if ch == '-' { + proceed = true + } else { + // bad group name - starts with something other than a word character and isn't a number + return nil, p.getErr(ErrInvalidGroupName) + } + + // grab part after - if any + + if (capnum != -1 || proceed == true) && p.charsRight() > 0 && p.rightChar(0) == '-' { + p.moveRight(1) + + //no more chars left, no closing char, etc + if p.charsRight() == 0 { + return nil, p.getErr(ErrInvalidGroupName) + } + + ch = p.rightChar(0) + if ch >= '0' && ch <= '9' { + if uncapnum, err = p.scanDecimal(); err != nil { + return nil, err + } + + if !p.isCaptureSlot(uncapnum) { + return nil, p.getErr(ErrUndefinedBackRef, uncapnum) + } + + // check if we have bogus characters after the number + if p.charsRight() > 0 && p.rightChar(0) != close { + return nil, p.getErr(ErrInvalidGroupName) + } + } else if IsWordChar(ch) { + uncapname := p.scanCapname() + + if !p.isCaptureName(uncapname) { + return nil, p.getErr(ErrUndefinedNameRef, uncapname) + } + uncapnum = p.captureSlotFromName(uncapname) + + // check if we have bogus character after the name + if p.charsRight() > 0 && p.rightChar(0) != close { + return nil, p.getErr(ErrInvalidGroupName) + } + } else { + // bad group name - starts with something other than a word character and isn't a number + return nil, p.getErr(ErrInvalidGroupName) + } + } + + // actually make the node + + if (capnum != -1 || uncapnum != -1) && p.charsRight() > 0 && p.moveRightGetChar() == close { + return newRegexNodeMN(ntCapture, p.options, capnum, uncapnum), nil + } + goto BreakRecognize + } + + case '(': + // alternation construct (?(...) | ) + + parenPos := p.textpos() + if p.charsRight() > 0 { + ch = p.rightChar(0) + + // check if the alternation condition is a backref + if ch >= '0' && ch <= '9' { + var capnum int + if capnum, err = p.scanDecimal(); err != nil { + return nil, err + } + if p.charsRight() > 0 && p.moveRightGetChar() == ')' { + if p.isCaptureSlot(capnum) { + return newRegexNodeM(ntTestref, p.options, capnum), nil + } + return nil, p.getErr(ErrUndefinedReference, capnum) + } + + return nil, p.getErr(ErrMalformedReference, capnum) + + } else if IsWordChar(ch) { + capname := p.scanCapname() + + if p.isCaptureName(capname) && p.charsRight() > 0 && p.moveRightGetChar() == ')' { + return newRegexNodeM(ntTestref, p.options, p.captureSlotFromName(capname)), nil + } + } + } + // not a backref + nt = ntTestgroup + p.textto(parenPos - 1) // jump to the start of the parentheses + p.ignoreNextParen = true // but make sure we don't try to capture the insides + + charsRight := p.charsRight() + if charsRight >= 3 && p.rightChar(1) == '?' { + rightchar2 := p.rightChar(2) + // disallow comments in the condition + if rightchar2 == '#' { + return nil, p.getErr(ErrAlternationCantHaveComment) + } + + // disallow named capture group (?<..>..) in the condition + if rightchar2 == '\'' { + return nil, p.getErr(ErrAlternationCantCapture) + } + + if charsRight >= 4 && (rightchar2 == '<' && p.rightChar(3) != '!' && p.rightChar(3) != '=') { + return nil, p.getErr(ErrAlternationCantCapture) + } + } + + case 'P': + if p.useRE2() { + // support for P<name> syntax + if p.charsRight() < 3 { + goto BreakRecognize + } + + ch = p.moveRightGetChar() + if ch != '<' { + goto BreakRecognize + } + + ch = p.moveRightGetChar() + p.moveLeft() + + if IsWordChar(ch) { + capnum := -1 + capname := p.scanCapname() + + if p.isCaptureName(capname) { + capnum = p.captureSlotFromName(capname) + } + + // check if we have bogus character after the name + if p.charsRight() > 0 && p.rightChar(0) != '>' { + return nil, p.getErr(ErrInvalidGroupName) + } + + // actually make the node + + if capnum != -1 && p.charsRight() > 0 && p.moveRightGetChar() == '>' { + return newRegexNodeMN(ntCapture, p.options, capnum, -1), nil + } + goto BreakRecognize + + } else { + // bad group name - starts with something other than a word character and isn't a number + return nil, p.getErr(ErrInvalidGroupName) + } + } + // if we're not using RE2 compat mode then + // we just behave like normal + fallthrough + + default: + p.moveLeft() + + nt = ntGroup + // disallow options in the children of a testgroup node + if p.group.t != ntTestgroup { + p.scanOptions() + } + if p.charsRight() == 0 { + goto BreakRecognize + } + + if ch = p.moveRightGetChar(); ch == ')' { + return nil, nil + } + + if ch != ':' { + goto BreakRecognize + } + + } + + return newRegexNode(nt, p.options), nil + } + +BreakRecognize: + + // break Recognize comes here + + return nil, p.getErr(ErrUnrecognizedGrouping, string(p.pattern[start:p.textpos()])) +} + +// scans backslash specials and basics +func (p *parser) scanBackslash(scanOnly bool) (*regexNode, error) { + + if p.charsRight() == 0 { + return nil, p.getErr(ErrIllegalEndEscape) + } + + switch ch := p.rightChar(0); ch { + case 'b', 'B', 'A', 'G', 'Z', 'z': + p.moveRight(1) + return newRegexNode(p.typeFromCode(ch), p.options), nil + + case 'w': + p.moveRight(1) + if p.useOptionE() || p.useRE2() { + return newRegexNodeSet(ntSet, p.options, ECMAWordClass()), nil + } + return newRegexNodeSet(ntSet, p.options, WordClass()), nil + + case 'W': + p.moveRight(1) + if p.useOptionE() || p.useRE2() { + return newRegexNodeSet(ntSet, p.options, NotECMAWordClass()), nil + } + return newRegexNodeSet(ntSet, p.options, NotWordClass()), nil + + case 's': + p.moveRight(1) + if p.useOptionE() { + return newRegexNodeSet(ntSet, p.options, ECMASpaceClass()), nil + } else if p.useRE2() { + return newRegexNodeSet(ntSet, p.options, RE2SpaceClass()), nil + } + return newRegexNodeSet(ntSet, p.options, SpaceClass()), nil + + case 'S': + p.moveRight(1) + if p.useOptionE() { + return newRegexNodeSet(ntSet, p.options, NotECMASpaceClass()), nil + } else if p.useRE2() { + return newRegexNodeSet(ntSet, p.options, NotRE2SpaceClass()), nil + } + return newRegexNodeSet(ntSet, p.options, NotSpaceClass()), nil + + case 'd': + p.moveRight(1) + if p.useOptionE() || p.useRE2() { + return newRegexNodeSet(ntSet, p.options, ECMADigitClass()), nil + } + return newRegexNodeSet(ntSet, p.options, DigitClass()), nil + + case 'D': + p.moveRight(1) + if p.useOptionE() || p.useRE2() { + return newRegexNodeSet(ntSet, p.options, NotECMADigitClass()), nil + } + return newRegexNodeSet(ntSet, p.options, NotDigitClass()), nil + + case 'p', 'P': + p.moveRight(1) + prop, err := p.parseProperty() + if err != nil { + return nil, err + } + cc := &CharSet{} + cc.addCategory(prop, (ch != 'p'), p.useOptionI(), p.patternRaw) + if p.useOptionI() { + cc.addLowercase() + } + + return newRegexNodeSet(ntSet, p.options, cc), nil + + default: + return p.scanBasicBackslash(scanOnly) + } +} + +// Scans \-style backreferences and character escapes +func (p *parser) scanBasicBackslash(scanOnly bool) (*regexNode, error) { + if p.charsRight() == 0 { + return nil, p.getErr(ErrIllegalEndEscape) + } + angled := false + k := false + close := '\x00' + + backpos := p.textpos() + ch := p.rightChar(0) + + // Allow \k<foo> instead of \<foo>, which is now deprecated. + + // According to ECMAScript specification, \k<name> is only parsed as a named group reference if + // there is at least one group name in the regexp. + // See https://www.ecma-international.org/ecma-262/#sec-isvalidregularexpressionliteral, step 7. + // Note, during the first (scanOnly) run we may not have all group names scanned, but that's ok. + if ch == 'k' && (!p.useOptionE() || len(p.capnames) > 0) { + if p.charsRight() >= 2 { + p.moveRight(1) + ch = p.moveRightGetChar() + + if ch == '<' || (!p.useOptionE() && ch == '\'') { // No support for \k'name' in ECMAScript + angled = true + if ch == '\'' { + close = '\'' + } else { + close = '>' + } + } + } + + if !angled || p.charsRight() <= 0 { + return nil, p.getErr(ErrMalformedNameRef) + } + + ch = p.rightChar(0) + k = true + + } else if !p.useOptionE() && (ch == '<' || ch == '\'') && p.charsRight() > 1 { // Note angle without \g + angled = true + if ch == '\'' { + close = '\'' + } else { + close = '>' + } + + p.moveRight(1) + ch = p.rightChar(0) + } + + // Try to parse backreference: \<1> or \<cap> + + if angled && ch >= '0' && ch <= '9' { + capnum, err := p.scanDecimal() + if err != nil { + return nil, err + } + + if p.charsRight() > 0 && p.moveRightGetChar() == close { + if p.isCaptureSlot(capnum) { + return newRegexNodeM(ntRef, p.options, capnum), nil + } + return nil, p.getErr(ErrUndefinedBackRef, capnum) + } + } else if !angled && ch >= '1' && ch <= '9' { // Try to parse backreference or octal: \1 + capnum, err := p.scanDecimal() + if err != nil { + return nil, err + } + + if scanOnly { + return nil, nil + } + + if p.isCaptureSlot(capnum) { + return newRegexNodeM(ntRef, p.options, capnum), nil + } + if capnum <= 9 && !p.useOptionE() { + return nil, p.getErr(ErrUndefinedBackRef, capnum) + } + + } else if angled { + capname := p.scanCapname() + + if capname != "" && p.charsRight() > 0 && p.moveRightGetChar() == close { + + if scanOnly { + return nil, nil + } + + if p.isCaptureName(capname) { + return newRegexNodeM(ntRef, p.options, p.captureSlotFromName(capname)), nil + } + return nil, p.getErr(ErrUndefinedNameRef, capname) + } else { + if k { + return nil, p.getErr(ErrMalformedNameRef) + } + } + } + + // Not backreference: must be char code + + p.textto(backpos) + ch, err := p.scanCharEscape() + if err != nil { + return nil, err + } + + if scanOnly { + return nil, nil + } + + if p.useOptionI() { + ch = unicode.ToLower(ch) + } + + return newRegexNodeCh(ntOne, p.options, ch), nil +} + +// Scans X for \p{X} or \P{X} +func (p *parser) parseProperty() (string, error) { + if p.charsRight() < 3 { + return "", p.getErr(ErrIncompleteSlashP) + } + ch := p.moveRightGetChar() + if ch != '{' { + return "", p.getErr(ErrMalformedSlashP) + } + + startpos := p.textpos() + for p.charsRight() > 0 { + ch = p.moveRightGetChar() + if !(IsWordChar(ch) || ch == '-') { + p.moveLeft() + break + } + } + capname := string(p.pattern[startpos:p.textpos()]) + + if p.charsRight() == 0 || p.moveRightGetChar() != '}' { + return "", p.getErr(ErrIncompleteSlashP) + } + + if !isValidUnicodeCat(capname) { + return "", p.getErr(ErrUnknownSlashP, capname) + } + + return capname, nil +} + +// Returns ReNode type for zero-length assertions with a \ code. +func (p *parser) typeFromCode(ch rune) nodeType { + switch ch { + case 'b': + if p.useOptionE() { + return ntECMABoundary + } + return ntBoundary + case 'B': + if p.useOptionE() { + return ntNonECMABoundary + } + return ntNonboundary + case 'A': + return ntBeginning + case 'G': + return ntStart + case 'Z': + return ntEndZ + case 'z': + return ntEnd + default: + return ntNothing + } +} + +// Scans whitespace or x-mode comments. +func (p *parser) scanBlank() error { + if p.useOptionX() { + for { + for p.charsRight() > 0 && isSpace(p.rightChar(0)) { + p.moveRight(1) + } + + if p.charsRight() == 0 { + break + } + + if p.rightChar(0) == '#' { + for p.charsRight() > 0 && p.rightChar(0) != '\n' { + p.moveRight(1) + } + } else if p.charsRight() >= 3 && p.rightChar(2) == '#' && + p.rightChar(1) == '?' && p.rightChar(0) == '(' { + for p.charsRight() > 0 && p.rightChar(0) != ')' { + p.moveRight(1) + } + if p.charsRight() == 0 { + return p.getErr(ErrUnterminatedComment) + } + p.moveRight(1) + } else { + break + } + } + } else { + for { + if p.charsRight() < 3 || p.rightChar(2) != '#' || + p.rightChar(1) != '?' || p.rightChar(0) != '(' { + return nil + } + + for p.charsRight() > 0 && p.rightChar(0) != ')' { + p.moveRight(1) + } + if p.charsRight() == 0 { + return p.getErr(ErrUnterminatedComment) + } + p.moveRight(1) + } + } + return nil +} + +func (p *parser) scanCapname() string { + startpos := p.textpos() + + for p.charsRight() > 0 { + if !IsWordChar(p.moveRightGetChar()) { + p.moveLeft() + break + } + } + + return string(p.pattern[startpos:p.textpos()]) +} + +//Scans contents of [] (not including []'s), and converts to a set. +func (p *parser) scanCharSet(caseInsensitive, scanOnly bool) (*CharSet, error) { + ch := '\x00' + chPrev := '\x00' + inRange := false + firstChar := true + closed := false + + var cc *CharSet + if !scanOnly { + cc = &CharSet{} + } + + if p.charsRight() > 0 && p.rightChar(0) == '^' { + p.moveRight(1) + if !scanOnly { + cc.negate = true + } + } + + for ; p.charsRight() > 0; firstChar = false { + fTranslatedChar := false + ch = p.moveRightGetChar() + if ch == ']' { + if !firstChar { + closed = true + break + } else if p.useOptionE() { + if !scanOnly { + cc.addRanges(NoneClass().ranges) + } + closed = true + break + } + + } else if ch == '\\' && p.charsRight() > 0 { + switch ch = p.moveRightGetChar(); ch { + case 'D', 'd': + if !scanOnly { + if inRange { + return nil, p.getErr(ErrBadClassInCharRange, ch) + } + cc.addDigit(p.useOptionE() || p.useRE2(), ch == 'D', p.patternRaw) + } + continue + + case 'S', 's': + if !scanOnly { + if inRange { + return nil, p.getErr(ErrBadClassInCharRange, ch) + } + cc.addSpace(p.useOptionE(), p.useRE2(), ch == 'S') + } + continue + + case 'W', 'w': + if !scanOnly { + if inRange { + return nil, p.getErr(ErrBadClassInCharRange, ch) + } + + cc.addWord(p.useOptionE() || p.useRE2(), ch == 'W') + } + continue + + case 'p', 'P': + if !scanOnly { + if inRange { + return nil, p.getErr(ErrBadClassInCharRange, ch) + } + prop, err := p.parseProperty() + if err != nil { + return nil, err + } + cc.addCategory(prop, (ch != 'p'), caseInsensitive, p.patternRaw) + } else { + p.parseProperty() + } + + continue + + case '-': + if !scanOnly { + cc.addRange(ch, ch) + } + continue + + default: + p.moveLeft() + var err error + ch, err = p.scanCharEscape() // non-literal character + if err != nil { + return nil, err + } + fTranslatedChar = true + break // this break will only break out of the switch + } + } else if ch == '[' { + // This is code for Posix style properties - [:Ll:] or [:IsTibetan:]. + // It currently doesn't do anything other than skip the whole thing! + if p.charsRight() > 0 && p.rightChar(0) == ':' && !inRange { + savePos := p.textpos() + + p.moveRight(1) + negate := false + if p.charsRight() > 1 && p.rightChar(0) == '^' { + negate = true + p.moveRight(1) + } + + nm := p.scanCapname() // snag the name + if !scanOnly && p.useRE2() { + // look up the name since these are valid for RE2 + // add the group based on the name + if ok := cc.addNamedASCII(nm, negate); !ok { + return nil, p.getErr(ErrInvalidCharRange) + } + } + if p.charsRight() < 2 || p.moveRightGetChar() != ':' || p.moveRightGetChar() != ']' { + p.textto(savePos) + } else if p.useRE2() { + // move on + continue + } + } + } + + if inRange { + inRange = false + if !scanOnly { + if ch == '[' && !fTranslatedChar && !firstChar { + // We thought we were in a range, but we're actually starting a subtraction. + // In that case, we'll add chPrev to our char class, skip the opening [, and + // scan the new character class recursively. + cc.addChar(chPrev) + sub, err := p.scanCharSet(caseInsensitive, false) + if err != nil { + return nil, err + } + cc.addSubtraction(sub) + + if p.charsRight() > 0 && p.rightChar(0) != ']' { + return nil, p.getErr(ErrSubtractionMustBeLast) + } + } else { + // a regular range, like a-z + if chPrev > ch { + return nil, p.getErr(ErrReversedCharRange, chPrev, ch) + } + cc.addRange(chPrev, ch) + } + } + } else if p.charsRight() >= 2 && p.rightChar(0) == '-' && p.rightChar(1) != ']' { + // this could be the start of a range + chPrev = ch + inRange = true + p.moveRight(1) + } else if p.charsRight() >= 1 && ch == '-' && !fTranslatedChar && p.rightChar(0) == '[' && !firstChar { + // we aren't in a range, and now there is a subtraction. Usually this happens + // only when a subtraction follows a range, like [a-z-[b]] + if !scanOnly { + p.moveRight(1) + sub, err := p.scanCharSet(caseInsensitive, false) + if err != nil { + return nil, err + } + cc.addSubtraction(sub) + + if p.charsRight() > 0 && p.rightChar(0) != ']' { + return nil, p.getErr(ErrSubtractionMustBeLast) + } + } else { + p.moveRight(1) + p.scanCharSet(caseInsensitive, true) + } + } else { + if !scanOnly { + cc.addRange(ch, ch) + } + } + } + + if !closed { + return nil, p.getErr(ErrUnterminatedBracket) + } + + if !scanOnly && caseInsensitive { + cc.addLowercase() + } + + return cc, nil +} + +// Scans any number of decimal digits (pegs value at 2^31-1 if too large) +func (p *parser) scanDecimal() (int, error) { + i := 0 + var d int + + for p.charsRight() > 0 { + d = int(p.rightChar(0) - '0') + if d < 0 || d > 9 { + break + } + p.moveRight(1) + + if i > maxValueDiv10 || (i == maxValueDiv10 && d > maxValueMod10) { + return 0, p.getErr(ErrCaptureGroupOutOfRange) + } + + i *= 10 + i += d + } + + return int(i), nil +} + +// Returns true for options allowed only at the top level +func isOnlyTopOption(option RegexOptions) bool { + return option == RightToLeft || option == ECMAScript || option == RE2 +} + +// Scans cimsx-cimsx option string, stops at the first unrecognized char. +func (p *parser) scanOptions() { + + for off := false; p.charsRight() > 0; p.moveRight(1) { + ch := p.rightChar(0) + + if ch == '-' { + off = true + } else if ch == '+' { + off = false + } else { + option := optionFromCode(ch) + if option == 0 || isOnlyTopOption(option) { + return + } + + if off { + p.options &= ^option + } else { + p.options |= option + } + } + } +} + +// Scans \ code for escape codes that map to single unicode chars. +func (p *parser) scanCharEscape() (r rune, err error) { + + ch := p.moveRightGetChar() + + if ch >= '0' && ch <= '7' { + p.moveLeft() + return p.scanOctal(), nil + } + + pos := p.textpos() + + switch ch { + case 'x': + // support for \x{HEX} syntax from Perl and PCRE + if p.charsRight() > 0 && p.rightChar(0) == '{' { + if p.useOptionE() { + return ch, nil + } + p.moveRight(1) + return p.scanHexUntilBrace() + } else { + r, err = p.scanHex(2) + } + case 'u': + // ECMAscript suppot \u{HEX} only if `u` is also set + if p.useOptionE() && p.useOptionU() && p.charsRight() > 0 && p.rightChar(0) == '{' { + p.moveRight(1) + return p.scanHexUntilBrace() + } else { + r, err = p.scanHex(4) + } + case 'a': + return '\u0007', nil + case 'b': + return '\b', nil + case 'e': + return '\u001B', nil + case 'f': + return '\f', nil + case 'n': + return '\n', nil + case 'r': + return '\r', nil + case 't': + return '\t', nil + case 'v': + return '\u000B', nil + case 'c': + r, err = p.scanControl() + default: + if !p.useOptionE() && !p.useRE2() && IsWordChar(ch) { + return 0, p.getErr(ErrUnrecognizedEscape, string(ch)) + } + return ch, nil + } + if err != nil && p.useOptionE() { + p.textto(pos) + return ch, nil + } + return +} + +// Grabs and converts an ascii control character +func (p *parser) scanControl() (rune, error) { + if p.charsRight() <= 0 { + return 0, p.getErr(ErrMissingControl) + } + + ch := p.moveRightGetChar() + + // \ca interpreted as \cA + + if ch >= 'a' && ch <= 'z' { + ch = (ch - ('a' - 'A')) + } + ch = (ch - '@') + if ch >= 0 && ch < ' ' { + return ch, nil + } + + return 0, p.getErr(ErrUnrecognizedControl) + +} + +// Scan hex digits until we hit a closing brace. +// Non-hex digits, hex value too large for UTF-8, or running out of chars are errors +func (p *parser) scanHexUntilBrace() (rune, error) { + // PCRE spec reads like unlimited hex digits are allowed, but unicode has a limit + // so we can enforce that + i := 0 + hasContent := false + + for p.charsRight() > 0 { + ch := p.moveRightGetChar() + if ch == '}' { + // hit our close brace, we're done here + // prevent \x{} + if !hasContent { + return 0, p.getErr(ErrTooFewHex) + } + return rune(i), nil + } + hasContent = true + // no brace needs to be hex digit + d := hexDigit(ch) + if d < 0 { + return 0, p.getErr(ErrMissingBrace) + } + + i *= 0x10 + i += d + + if i > unicode.MaxRune { + return 0, p.getErr(ErrInvalidHex) + } + } + + // we only make it here if we run out of digits without finding the brace + return 0, p.getErr(ErrMissingBrace) +} + +// Scans exactly c hex digits (c=2 for \xFF, c=4 for \uFFFF) +func (p *parser) scanHex(c int) (rune, error) { + + i := 0 + + if p.charsRight() >= c { + for c > 0 { + d := hexDigit(p.moveRightGetChar()) + if d < 0 { + break + } + i *= 0x10 + i += d + c-- + } + } + + if c > 0 { + return 0, p.getErr(ErrTooFewHex) + } + + return rune(i), nil +} + +// Returns n <= 0xF for a hex digit. +func hexDigit(ch rune) int { + + if d := uint(ch - '0'); d <= 9 { + return int(d) + } + + if d := uint(ch - 'a'); d <= 5 { + return int(d + 0xa) + } + + if d := uint(ch - 'A'); d <= 5 { + return int(d + 0xa) + } + + return -1 +} + +// Scans up to three octal digits (stops before exceeding 0377). +func (p *parser) scanOctal() rune { + // Consume octal chars only up to 3 digits and value 0377 + + c := 3 + + if c > p.charsRight() { + c = p.charsRight() + } + + //we know the first char is good because the caller had to check + i := 0 + d := int(p.rightChar(0) - '0') + for c > 0 && d <= 7 && d >= 0 { + if i >= 0x20 && p.useOptionE() { + break + } + i *= 8 + i += d + c-- + + p.moveRight(1) + if !p.rightMost() { + d = int(p.rightChar(0) - '0') + } + } + + // Octal codes only go up to 255. Any larger and the behavior that Perl follows + // is simply to truncate the high bits. + i &= 0xFF + + return rune(i) +} + +// Returns the current parsing position. +func (p *parser) textpos() int { + return p.currentPos +} + +// Zaps to a specific parsing position. +func (p *parser) textto(pos int) { + p.currentPos = pos +} + +// Returns the char at the right of the current parsing position and advances to the right. +func (p *parser) moveRightGetChar() rune { + ch := p.pattern[p.currentPos] + p.currentPos++ + return ch +} + +// Moves the current position to the right. +func (p *parser) moveRight(i int) { + // default would be 1 + p.currentPos += i +} + +// Moves the current parsing position one to the left. +func (p *parser) moveLeft() { + p.currentPos-- +} + +// Returns the char left of the current parsing position. +func (p *parser) charAt(i int) rune { + return p.pattern[i] +} + +// Returns the char i chars right of the current parsing position. +func (p *parser) rightChar(i int) rune { + // default would be 0 + return p.pattern[p.currentPos+i] +} + +// Number of characters to the right of the current parsing position. +func (p *parser) charsRight() int { + return len(p.pattern) - p.currentPos +} + +func (p *parser) rightMost() bool { + return p.currentPos == len(p.pattern) +} + +// Looks up the slot number for a given name +func (p *parser) captureSlotFromName(capname string) int { + return p.capnames[capname] +} + +// True if the capture slot was noted +func (p *parser) isCaptureSlot(i int) bool { + if p.caps != nil { + _, ok := p.caps[i] + return ok + } + + return (i >= 0 && i < p.capsize) +} + +// Looks up the slot number for a given name +func (p *parser) isCaptureName(capname string) bool { + if p.capnames == nil { + return false + } + + _, ok := p.capnames[capname] + return ok +} + +// option shortcuts + +// True if N option disabling '(' autocapture is on. +func (p *parser) useOptionN() bool { + return (p.options & ExplicitCapture) != 0 +} + +// True if I option enabling case-insensitivity is on. +func (p *parser) useOptionI() bool { + return (p.options & IgnoreCase) != 0 +} + +// True if M option altering meaning of $ and ^ is on. +func (p *parser) useOptionM() bool { + return (p.options & Multiline) != 0 +} + +// True if S option altering meaning of . is on. +func (p *parser) useOptionS() bool { + return (p.options & Singleline) != 0 +} + +// True if X option enabling whitespace/comment mode is on. +func (p *parser) useOptionX() bool { + return (p.options & IgnorePatternWhitespace) != 0 +} + +// True if E option enabling ECMAScript behavior on. +func (p *parser) useOptionE() bool { + return (p.options & ECMAScript) != 0 +} + +// true to use RE2 compatibility parsing behavior. +func (p *parser) useRE2() bool { + return (p.options & RE2) != 0 +} + +// True if U option enabling ECMAScript's Unicode behavior on. +func (p *parser) useOptionU() bool { + return (p.options & Unicode) != 0 +} + +// True if options stack is empty. +func (p *parser) emptyOptionsStack() bool { + return len(p.optionsStack) == 0 +} + +// Finish the current quantifiable (when a quantifier is not found or is not possible) +func (p *parser) addConcatenate() { + // The first (| inside a Testgroup group goes directly to the group + p.concatenation.addChild(p.unit) + p.unit = nil +} + +// Finish the current quantifiable (when a quantifier is found) +func (p *parser) addConcatenate3(lazy bool, min, max int) { + p.concatenation.addChild(p.unit.makeQuantifier(lazy, min, max)) + p.unit = nil +} + +// Sets the current unit to a single char node +func (p *parser) addUnitOne(ch rune) { + if p.useOptionI() { + ch = unicode.ToLower(ch) + } + + p.unit = newRegexNodeCh(ntOne, p.options, ch) +} + +// Sets the current unit to a single inverse-char node +func (p *parser) addUnitNotone(ch rune) { + if p.useOptionI() { + ch = unicode.ToLower(ch) + } + + p.unit = newRegexNodeCh(ntNotone, p.options, ch) +} + +// Sets the current unit to a single set node +func (p *parser) addUnitSet(set *CharSet) { + p.unit = newRegexNodeSet(ntSet, p.options, set) +} + +// Sets the current unit to a subtree +func (p *parser) addUnitNode(node *regexNode) { + p.unit = node +} + +// Sets the current unit to an assertion of the specified type +func (p *parser) addUnitType(t nodeType) { + p.unit = newRegexNode(t, p.options) +} + +// Finish the current group (in response to a ')' or end) +func (p *parser) addGroup() error { + if p.group.t == ntTestgroup || p.group.t == ntTestref { + p.group.addChild(p.concatenation.reverseLeft()) + if (p.group.t == ntTestref && len(p.group.children) > 2) || len(p.group.children) > 3 { + return p.getErr(ErrTooManyAlternates) + } + } else { + p.alternation.addChild(p.concatenation.reverseLeft()) + p.group.addChild(p.alternation) + } + + p.unit = p.group + return nil +} + +// Pops the option stack, but keeps the current options unchanged. +func (p *parser) popKeepOptions() { + lastIdx := len(p.optionsStack) - 1 + p.optionsStack = p.optionsStack[:lastIdx] +} + +// Recalls options from the stack. +func (p *parser) popOptions() { + lastIdx := len(p.optionsStack) - 1 + // get the last item on the stack and then remove it by reslicing + p.options = p.optionsStack[lastIdx] + p.optionsStack = p.optionsStack[:lastIdx] +} + +// Saves options on a stack. +func (p *parser) pushOptions() { + p.optionsStack = append(p.optionsStack, p.options) +} + +// Add a string to the last concatenate. +func (p *parser) addToConcatenate(pos, cch int, isReplacement bool) { + var node *regexNode + + if cch == 0 { + return + } + + if cch > 1 { + str := make([]rune, cch) + copy(str, p.pattern[pos:pos+cch]) + + if p.useOptionI() && !isReplacement { + // We do the ToLower character by character for consistency. With surrogate chars, doing + // a ToLower on the entire string could actually change the surrogate pair. This is more correct + // linguistically, but since Regex doesn't support surrogates, it's more important to be + // consistent. + for i := 0; i < len(str); i++ { + str[i] = unicode.ToLower(str[i]) + } + } + + node = newRegexNodeStr(ntMulti, p.options, str) + } else { + ch := p.charAt(pos) + + if p.useOptionI() && !isReplacement { + ch = unicode.ToLower(ch) + } + + node = newRegexNodeCh(ntOne, p.options, ch) + } + + p.concatenation.addChild(node) +} + +// Push the parser state (in response to an open paren) +func (p *parser) pushGroup() { + p.group.next = p.stack + p.alternation.next = p.group + p.concatenation.next = p.alternation + p.stack = p.concatenation +} + +// Remember the pushed state (in response to a ')') +func (p *parser) popGroup() error { + p.concatenation = p.stack + p.alternation = p.concatenation.next + p.group = p.alternation.next + p.stack = p.group.next + + // The first () inside a Testgroup group goes directly to the group + if p.group.t == ntTestgroup && len(p.group.children) == 0 { + if p.unit == nil { + return p.getErr(ErrConditionalExpression) + } + + p.group.addChild(p.unit) + p.unit = nil + } + return nil +} + +// True if the group stack is empty. +func (p *parser) emptyStack() bool { + return p.stack == nil +} + +// Start a new round for the parser state (in response to an open paren or string start) +func (p *parser) startGroup(openGroup *regexNode) { + p.group = openGroup + p.alternation = newRegexNode(ntAlternate, p.options) + p.concatenation = newRegexNode(ntConcatenate, p.options) +} + +// Finish the current concatenation (in response to a |) +func (p *parser) addAlternate() { + // The | parts inside a Testgroup group go directly to the group + + if p.group.t == ntTestgroup || p.group.t == ntTestref { + p.group.addChild(p.concatenation.reverseLeft()) + } else { + p.alternation.addChild(p.concatenation.reverseLeft()) + } + + p.concatenation = newRegexNode(ntConcatenate, p.options) +} + +// For categorizing ascii characters. + +const ( + Q byte = 5 // quantifier + S = 4 // ordinary stopper + Z = 3 // ScanBlank stopper + X = 2 // whitespace + E = 1 // should be escaped +) + +var _category = []byte{ + //01 2 3 4 5 6 7 8 9 A B C D E F 0 1 2 3 4 5 6 7 8 9 A B C D E F + 0, 0, 0, 0, 0, 0, 0, 0, 0, X, X, X, X, X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + // ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 5 6 7 8 9 : ; < = > ? + X, 0, 0, Z, S, 0, 0, 0, S, S, Q, Q, 0, 0, S, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, Q, + //@A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^ _ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, S, S, 0, S, 0, + //'a b c d e f g h i j k l m n o p q r s t u v w x y z { | } ~ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, Q, S, 0, 0, 0, +} + +func isSpace(ch rune) bool { + return (ch <= ' ' && _category[ch] == X) +} + +// Returns true for those characters that terminate a string of ordinary chars. +func isSpecial(ch rune) bool { + return (ch <= '|' && _category[ch] >= S) +} + +// Returns true for those characters that terminate a string of ordinary chars. +func isStopperX(ch rune) bool { + return (ch <= '|' && _category[ch] >= X) +} + +// Returns true for those characters that begin a quantifier. +func isQuantifier(ch rune) bool { + return (ch <= '{' && _category[ch] >= Q) +} + +func (p *parser) isTrueQuantifier() bool { + nChars := p.charsRight() + if nChars == 0 { + return false + } + + startpos := p.textpos() + ch := p.charAt(startpos) + if ch != '{' { + return ch <= '{' && _category[ch] >= Q + } + + //UGLY: this is ugly -- the original code was ugly too + pos := startpos + for { + nChars-- + if nChars <= 0 { + break + } + pos++ + ch = p.charAt(pos) + if ch < '0' || ch > '9' { + break + } + } + + if nChars == 0 || pos-startpos == 1 { + return false + } + if ch == '}' { + return true + } + if ch != ',' { + return false + } + for { + nChars-- + if nChars <= 0 { + break + } + pos++ + ch = p.charAt(pos) + if ch < '0' || ch > '9' { + break + } + } + + return nChars > 0 && ch == '}' +} diff --git a/vendor/github.com/dlclark/regexp2/syntax/prefix.go b/vendor/github.com/dlclark/regexp2/syntax/prefix.go new file mode 100644 index 0000000..f671688 --- /dev/null +++ b/vendor/github.com/dlclark/regexp2/syntax/prefix.go @@ -0,0 +1,896 @@ +package syntax + +import ( + "bytes" + "fmt" + "strconv" + "unicode" + "unicode/utf8" +) + +type Prefix struct { + PrefixStr []rune + PrefixSet CharSet + CaseInsensitive bool +} + +// It takes a RegexTree and computes the set of chars that can start it. +func getFirstCharsPrefix(tree *RegexTree) *Prefix { + s := regexFcd{ + fcStack: make([]regexFc, 32), + intStack: make([]int, 32), + } + fc := s.regexFCFromRegexTree(tree) + + if fc == nil || fc.nullable || fc.cc.IsEmpty() { + return nil + } + fcSet := fc.getFirstChars() + return &Prefix{PrefixSet: fcSet, CaseInsensitive: fc.caseInsensitive} +} + +type regexFcd struct { + intStack []int + intDepth int + fcStack []regexFc + fcDepth int + skipAllChildren bool // don't process any more children at the current level + skipchild bool // don't process the current child. + failed bool +} + +/* + * The main FC computation. It does a shortcutted depth-first walk + * through the tree and calls CalculateFC to emits code before + * and after each child of an interior node, and at each leaf. + */ +func (s *regexFcd) regexFCFromRegexTree(tree *RegexTree) *regexFc { + curNode := tree.root + curChild := 0 + + for { + if len(curNode.children) == 0 { + // This is a leaf node + s.calculateFC(curNode.t, curNode, 0) + } else if curChild < len(curNode.children) && !s.skipAllChildren { + // This is an interior node, and we have more children to analyze + s.calculateFC(curNode.t|beforeChild, curNode, curChild) + + if !s.skipchild { + curNode = curNode.children[curChild] + // this stack is how we get a depth first walk of the tree. + s.pushInt(curChild) + curChild = 0 + } else { + curChild++ + s.skipchild = false + } + continue + } + + // This is an interior node where we've finished analyzing all the children, or + // the end of a leaf node. + s.skipAllChildren = false + + if s.intIsEmpty() { + break + } + + curChild = s.popInt() + curNode = curNode.next + + s.calculateFC(curNode.t|afterChild, curNode, curChild) + if s.failed { + return nil + } + + curChild++ + } + + if s.fcIsEmpty() { + return nil + } + + return s.popFC() +} + +// To avoid recursion, we use a simple integer stack. +// This is the push. +func (s *regexFcd) pushInt(I int) { + if s.intDepth >= len(s.intStack) { + expanded := make([]int, s.intDepth*2) + copy(expanded, s.intStack) + s.intStack = expanded + } + + s.intStack[s.intDepth] = I + s.intDepth++ +} + +// True if the stack is empty. +func (s *regexFcd) intIsEmpty() bool { + return s.intDepth == 0 +} + +// This is the pop. +func (s *regexFcd) popInt() int { + s.intDepth-- + return s.intStack[s.intDepth] +} + +// We also use a stack of RegexFC objects. +// This is the push. +func (s *regexFcd) pushFC(fc regexFc) { + if s.fcDepth >= len(s.fcStack) { + expanded := make([]regexFc, s.fcDepth*2) + copy(expanded, s.fcStack) + s.fcStack = expanded + } + + s.fcStack[s.fcDepth] = fc + s.fcDepth++ +} + +// True if the stack is empty. +func (s *regexFcd) fcIsEmpty() bool { + return s.fcDepth == 0 +} + +// This is the pop. +func (s *regexFcd) popFC() *regexFc { + s.fcDepth-- + return &s.fcStack[s.fcDepth] +} + +// This is the top. +func (s *regexFcd) topFC() *regexFc { + return &s.fcStack[s.fcDepth-1] +} + +// Called in Beforechild to prevent further processing of the current child +func (s *regexFcd) skipChild() { + s.skipchild = true +} + +// FC computation and shortcut cases for each node type +func (s *regexFcd) calculateFC(nt nodeType, node *regexNode, CurIndex int) { + //fmt.Printf("NodeType: %v, CurIndex: %v, Desc: %v\n", nt, CurIndex, node.description()) + ci := false + rtl := false + + if nt <= ntRef { + if (node.options & IgnoreCase) != 0 { + ci = true + } + if (node.options & RightToLeft) != 0 { + rtl = true + } + } + + switch nt { + case ntConcatenate | beforeChild, ntAlternate | beforeChild, ntTestref | beforeChild, ntLoop | beforeChild, ntLazyloop | beforeChild: + break + + case ntTestgroup | beforeChild: + if CurIndex == 0 { + s.skipChild() + } + break + + case ntEmpty: + s.pushFC(regexFc{nullable: true}) + break + + case ntConcatenate | afterChild: + if CurIndex != 0 { + child := s.popFC() + cumul := s.topFC() + + s.failed = !cumul.addFC(*child, true) + } + + fc := s.topFC() + if !fc.nullable { + s.skipAllChildren = true + } + break + + case ntTestgroup | afterChild: + if CurIndex > 1 { + child := s.popFC() + cumul := s.topFC() + + s.failed = !cumul.addFC(*child, false) + } + break + + case ntAlternate | afterChild, ntTestref | afterChild: + if CurIndex != 0 { + child := s.popFC() + cumul := s.topFC() + + s.failed = !cumul.addFC(*child, false) + } + break + + case ntLoop | afterChild, ntLazyloop | afterChild: + if node.m == 0 { + fc := s.topFC() + fc.nullable = true + } + break + + case ntGroup | beforeChild, ntGroup | afterChild, ntCapture | beforeChild, ntCapture | afterChild, ntGreedy | beforeChild, ntGreedy | afterChild: + break + + case ntRequire | beforeChild, ntPrevent | beforeChild: + s.skipChild() + s.pushFC(regexFc{nullable: true}) + break + + case ntRequire | afterChild, ntPrevent | afterChild: + break + + case ntOne, ntNotone: + s.pushFC(newRegexFc(node.ch, nt == ntNotone, false, ci)) + break + + case ntOneloop, ntOnelazy: + s.pushFC(newRegexFc(node.ch, false, node.m == 0, ci)) + break + + case ntNotoneloop, ntNotonelazy: + s.pushFC(newRegexFc(node.ch, true, node.m == 0, ci)) + break + + case ntMulti: + if len(node.str) == 0 { + s.pushFC(regexFc{nullable: true}) + } else if !rtl { + s.pushFC(newRegexFc(node.str[0], false, false, ci)) + } else { + s.pushFC(newRegexFc(node.str[len(node.str)-1], false, false, ci)) + } + break + + case ntSet: + s.pushFC(regexFc{cc: node.set.Copy(), nullable: false, caseInsensitive: ci}) + break + + case ntSetloop, ntSetlazy: + s.pushFC(regexFc{cc: node.set.Copy(), nullable: node.m == 0, caseInsensitive: ci}) + break + + case ntRef: + s.pushFC(regexFc{cc: *AnyClass(), nullable: true, caseInsensitive: false}) + break + + case ntNothing, ntBol, ntEol, ntBoundary, ntNonboundary, ntECMABoundary, ntNonECMABoundary, ntBeginning, ntStart, ntEndZ, ntEnd: + s.pushFC(regexFc{nullable: true}) + break + + default: + panic(fmt.Sprintf("unexpected op code: %v", nt)) + } +} + +type regexFc struct { + cc CharSet + nullable bool + caseInsensitive bool +} + +func newRegexFc(ch rune, not, nullable, caseInsensitive bool) regexFc { + r := regexFc{ + caseInsensitive: caseInsensitive, + nullable: nullable, + } + if not { + if ch > 0 { + r.cc.addRange('\x00', ch-1) + } + if ch < 0xFFFF { + r.cc.addRange(ch+1, utf8.MaxRune) + } + } else { + r.cc.addRange(ch, ch) + } + return r +} + +func (r *regexFc) getFirstChars() CharSet { + if r.caseInsensitive { + r.cc.addLowercase() + } + + return r.cc +} + +func (r *regexFc) addFC(fc regexFc, concatenate bool) bool { + if !r.cc.IsMergeable() || !fc.cc.IsMergeable() { + return false + } + + if concatenate { + if !r.nullable { + return true + } + + if !fc.nullable { + r.nullable = false + } + } else { + if fc.nullable { + r.nullable = true + } + } + + r.caseInsensitive = r.caseInsensitive || fc.caseInsensitive + r.cc.addSet(fc.cc) + + return true +} + +// This is a related computation: it takes a RegexTree and computes the +// leading substring if it sees one. It's quite trivial and gives up easily. +func getPrefix(tree *RegexTree) *Prefix { + var concatNode *regexNode + nextChild := 0 + + curNode := tree.root + + for { + switch curNode.t { + case ntConcatenate: + if len(curNode.children) > 0 { + concatNode = curNode + nextChild = 0 + } + + case ntGreedy, ntCapture: + curNode = curNode.children[0] + concatNode = nil + continue + + case ntOneloop, ntOnelazy: + if curNode.m > 0 { + return &Prefix{ + PrefixStr: repeat(curNode.ch, curNode.m), + CaseInsensitive: (curNode.options & IgnoreCase) != 0, + } + } + return nil + + case ntOne: + return &Prefix{ + PrefixStr: []rune{curNode.ch}, + CaseInsensitive: (curNode.options & IgnoreCase) != 0, + } + + case ntMulti: + return &Prefix{ + PrefixStr: curNode.str, + CaseInsensitive: (curNode.options & IgnoreCase) != 0, + } + + case ntBol, ntEol, ntBoundary, ntECMABoundary, ntBeginning, ntStart, + ntEndZ, ntEnd, ntEmpty, ntRequire, ntPrevent: + + default: + return nil + } + + if concatNode == nil || nextChild >= len(concatNode.children) { + return nil + } + + curNode = concatNode.children[nextChild] + nextChild++ + } +} + +// repeat the rune r, c times... up to the max of MaxPrefixSize +func repeat(r rune, c int) []rune { + if c > MaxPrefixSize { + c = MaxPrefixSize + } + + ret := make([]rune, c) + + // binary growth using copy for speed + ret[0] = r + bp := 1 + for bp < len(ret) { + copy(ret[bp:], ret[:bp]) + bp *= 2 + } + + return ret +} + +// BmPrefix precomputes the Boyer-Moore +// tables for fast string scanning. These tables allow +// you to scan for the first occurrence of a string within +// a large body of text without examining every character. +// The performance of the heuristic depends on the actual +// string and the text being searched, but usually, the longer +// the string that is being searched for, the fewer characters +// need to be examined. +type BmPrefix struct { + positive []int + negativeASCII []int + negativeUnicode [][]int + pattern []rune + lowASCII rune + highASCII rune + rightToLeft bool + caseInsensitive bool +} + +func newBmPrefix(pattern []rune, caseInsensitive, rightToLeft bool) *BmPrefix { + + b := &BmPrefix{ + rightToLeft: rightToLeft, + caseInsensitive: caseInsensitive, + pattern: pattern, + } + + if caseInsensitive { + for i := 0; i < len(b.pattern); i++ { + // We do the ToLower character by character for consistency. With surrogate chars, doing + // a ToLower on the entire string could actually change the surrogate pair. This is more correct + // linguistically, but since Regex doesn't support surrogates, it's more important to be + // consistent. + + b.pattern[i] = unicode.ToLower(b.pattern[i]) + } + } + + var beforefirst, last, bump int + var scan, match int + + if !rightToLeft { + beforefirst = -1 + last = len(b.pattern) - 1 + bump = 1 + } else { + beforefirst = len(b.pattern) + last = 0 + bump = -1 + } + + // PART I - the good-suffix shift table + // + // compute the positive requirement: + // if char "i" is the first one from the right that doesn't match, + // then we know the matcher can advance by _positive[i]. + // + // This algorithm is a simplified variant of the standard + // Boyer-Moore good suffix calculation. + + b.positive = make([]int, len(b.pattern)) + + examine := last + ch := b.pattern[examine] + b.positive[examine] = bump + examine -= bump + +Outerloop: + for { + // find an internal char (examine) that matches the tail + + for { + if examine == beforefirst { + break Outerloop + } + if b.pattern[examine] == ch { + break + } + examine -= bump + } + + match = last + scan = examine + + // find the length of the match + for { + if scan == beforefirst || b.pattern[match] != b.pattern[scan] { + // at the end of the match, note the difference in _positive + // this is not the length of the match, but the distance from the internal match + // to the tail suffix. + if b.positive[match] == 0 { + b.positive[match] = match - scan + } + + // System.Diagnostics.Debug.WriteLine("Set positive[" + match + "] to " + (match - scan)); + + break + } + + scan -= bump + match -= bump + } + + examine -= bump + } + + match = last - bump + + // scan for the chars for which there are no shifts that yield a different candidate + + // The inside of the if statement used to say + // "_positive[match] = last - beforefirst;" + // This is slightly less aggressive in how much we skip, but at worst it + // should mean a little more work rather than skipping a potential match. + for match != beforefirst { + if b.positive[match] == 0 { + b.positive[match] = bump + } + + match -= bump + } + + // PART II - the bad-character shift table + // + // compute the negative requirement: + // if char "ch" is the reject character when testing position "i", + // we can slide up by _negative[ch]; + // (_negative[ch] = str.Length - 1 - str.LastIndexOf(ch)) + // + // the lookup table is divided into ASCII and Unicode portions; + // only those parts of the Unicode 16-bit code set that actually + // appear in the string are in the table. (Maximum size with + // Unicode is 65K; ASCII only case is 512 bytes.) + + b.negativeASCII = make([]int, 128) + + for i := 0; i < len(b.negativeASCII); i++ { + b.negativeASCII[i] = last - beforefirst + } + + b.lowASCII = 127 + b.highASCII = 0 + + for examine = last; examine != beforefirst; examine -= bump { + ch = b.pattern[examine] + + switch { + case ch < 128: + if b.lowASCII > ch { + b.lowASCII = ch + } + + if b.highASCII < ch { + b.highASCII = ch + } + + if b.negativeASCII[ch] == last-beforefirst { + b.negativeASCII[ch] = last - examine + } + case ch <= 0xffff: + i, j := ch>>8, ch&0xFF + + if b.negativeUnicode == nil { + b.negativeUnicode = make([][]int, 256) + } + + if b.negativeUnicode[i] == nil { + newarray := make([]int, 256) + + for k := 0; k < len(newarray); k++ { + newarray[k] = last - beforefirst + } + + if i == 0 { + copy(newarray, b.negativeASCII) + //TODO: this line needed? + b.negativeASCII = newarray + } + + b.negativeUnicode[i] = newarray + } + + if b.negativeUnicode[i][j] == last-beforefirst { + b.negativeUnicode[i][j] = last - examine + } + default: + // we can't do the filter because this algo doesn't support + // unicode chars >0xffff + return nil + } + } + + return b +} + +func (b *BmPrefix) String() string { + return string(b.pattern) +} + +// Dump returns the contents of the filter as a human readable string +func (b *BmPrefix) Dump(indent string) string { + buf := &bytes.Buffer{} + + fmt.Fprintf(buf, "%sBM Pattern: %s\n%sPositive: ", indent, string(b.pattern), indent) + for i := 0; i < len(b.positive); i++ { + buf.WriteString(strconv.Itoa(b.positive[i])) + buf.WriteRune(' ') + } + buf.WriteRune('\n') + + if b.negativeASCII != nil { + buf.WriteString(indent) + buf.WriteString("Negative table\n") + for i := 0; i < len(b.negativeASCII); i++ { + if b.negativeASCII[i] != len(b.pattern) { + fmt.Fprintf(buf, "%s %s %s\n", indent, Escape(string(rune(i))), strconv.Itoa(b.negativeASCII[i])) + } + } + } + + return buf.String() +} + +// Scan uses the Boyer-Moore algorithm to find the first occurrence +// of the specified string within text, beginning at index, and +// constrained within beglimit and endlimit. +// +// The direction and case-sensitivity of the match is determined +// by the arguments to the RegexBoyerMoore constructor. +func (b *BmPrefix) Scan(text []rune, index, beglimit, endlimit int) int { + var ( + defadv, test, test2 int + match, startmatch, endmatch int + bump, advance int + chTest rune + unicodeLookup []int + ) + + if !b.rightToLeft { + defadv = len(b.pattern) + startmatch = len(b.pattern) - 1 + endmatch = 0 + test = index + defadv - 1 + bump = 1 + } else { + defadv = -len(b.pattern) + startmatch = 0 + endmatch = -defadv - 1 + test = index + defadv + bump = -1 + } + + chMatch := b.pattern[startmatch] + + for { + if test >= endlimit || test < beglimit { + return -1 + } + + chTest = text[test] + + if b.caseInsensitive { + chTest = unicode.ToLower(chTest) + } + + if chTest != chMatch { + if chTest < 128 { + advance = b.negativeASCII[chTest] + } else if chTest < 0xffff && len(b.negativeUnicode) > 0 { + unicodeLookup = b.negativeUnicode[chTest>>8] + if len(unicodeLookup) > 0 { + advance = unicodeLookup[chTest&0xFF] + } else { + advance = defadv + } + } else { + advance = defadv + } + + test += advance + } else { // if (chTest == chMatch) + test2 = test + match = startmatch + + for { + if match == endmatch { + if b.rightToLeft { + return test2 + 1 + } else { + return test2 + } + } + + match -= bump + test2 -= bump + + chTest = text[test2] + + if b.caseInsensitive { + chTest = unicode.ToLower(chTest) + } + + if chTest != b.pattern[match] { + advance = b.positive[match] + if chTest < 128 { + test2 = (match - startmatch) + b.negativeASCII[chTest] + } else if chTest < 0xffff && len(b.negativeUnicode) > 0 { + unicodeLookup = b.negativeUnicode[chTest>>8] + if len(unicodeLookup) > 0 { + test2 = (match - startmatch) + unicodeLookup[chTest&0xFF] + } else { + test += advance + break + } + } else { + test += advance + break + } + + if b.rightToLeft { + if test2 < advance { + advance = test2 + } + } else if test2 > advance { + advance = test2 + } + + test += advance + break + } + } + } + } +} + +// When a regex is anchored, we can do a quick IsMatch test instead of a Scan +func (b *BmPrefix) IsMatch(text []rune, index, beglimit, endlimit int) bool { + if !b.rightToLeft { + if index < beglimit || endlimit-index < len(b.pattern) { + return false + } + + return b.matchPattern(text, index) + } else { + if index > endlimit || index-beglimit < len(b.pattern) { + return false + } + + return b.matchPattern(text, index-len(b.pattern)) + } +} + +func (b *BmPrefix) matchPattern(text []rune, index int) bool { + if len(text)-index < len(b.pattern) { + return false + } + + if b.caseInsensitive { + for i := 0; i < len(b.pattern); i++ { + //Debug.Assert(textinfo.ToLower(_pattern[i]) == _pattern[i], "pattern should be converted to lower case in constructor!"); + if unicode.ToLower(text[index+i]) != b.pattern[i] { + return false + } + } + return true + } else { + for i := 0; i < len(b.pattern); i++ { + if text[index+i] != b.pattern[i] { + return false + } + } + return true + } +} + +type AnchorLoc int16 + +// where the regex can be pegged +const ( + AnchorBeginning AnchorLoc = 0x0001 + AnchorBol = 0x0002 + AnchorStart = 0x0004 + AnchorEol = 0x0008 + AnchorEndZ = 0x0010 + AnchorEnd = 0x0020 + AnchorBoundary = 0x0040 + AnchorECMABoundary = 0x0080 +) + +func getAnchors(tree *RegexTree) AnchorLoc { + + var concatNode *regexNode + nextChild, result := 0, AnchorLoc(0) + + curNode := tree.root + + for { + switch curNode.t { + case ntConcatenate: + if len(curNode.children) > 0 { + concatNode = curNode + nextChild = 0 + } + + case ntGreedy, ntCapture: + curNode = curNode.children[0] + concatNode = nil + continue + + case ntBol, ntEol, ntBoundary, ntECMABoundary, ntBeginning, + ntStart, ntEndZ, ntEnd: + return result | anchorFromType(curNode.t) + + case ntEmpty, ntRequire, ntPrevent: + + default: + return result + } + + if concatNode == nil || nextChild >= len(concatNode.children) { + return result + } + + curNode = concatNode.children[nextChild] + nextChild++ + } +} + +func anchorFromType(t nodeType) AnchorLoc { + switch t { + case ntBol: + return AnchorBol + case ntEol: + return AnchorEol + case ntBoundary: + return AnchorBoundary + case ntECMABoundary: + return AnchorECMABoundary + case ntBeginning: + return AnchorBeginning + case ntStart: + return AnchorStart + case ntEndZ: + return AnchorEndZ + case ntEnd: + return AnchorEnd + default: + return 0 + } +} + +// anchorDescription returns a human-readable description of the anchors +func (anchors AnchorLoc) String() string { + buf := &bytes.Buffer{} + + if 0 != (anchors & AnchorBeginning) { + buf.WriteString(", Beginning") + } + if 0 != (anchors & AnchorStart) { + buf.WriteString(", Start") + } + if 0 != (anchors & AnchorBol) { + buf.WriteString(", Bol") + } + if 0 != (anchors & AnchorBoundary) { + buf.WriteString(", Boundary") + } + if 0 != (anchors & AnchorECMABoundary) { + buf.WriteString(", ECMABoundary") + } + if 0 != (anchors & AnchorEol) { + buf.WriteString(", Eol") + } + if 0 != (anchors & AnchorEnd) { + buf.WriteString(", End") + } + if 0 != (anchors & AnchorEndZ) { + buf.WriteString(", EndZ") + } + + // trim off comma + if buf.Len() >= 2 { + return buf.String()[2:] + } + return "None" +} diff --git a/vendor/github.com/dlclark/regexp2/syntax/replacerdata.go b/vendor/github.com/dlclark/regexp2/syntax/replacerdata.go new file mode 100644 index 0000000..bcf4d3f --- /dev/null +++ b/vendor/github.com/dlclark/regexp2/syntax/replacerdata.go @@ -0,0 +1,87 @@ +package syntax + +import ( + "bytes" + "errors" +) + +type ReplacerData struct { + Rep string + Strings []string + Rules []int +} + +const ( + replaceSpecials = 4 + replaceLeftPortion = -1 + replaceRightPortion = -2 + replaceLastGroup = -3 + replaceWholeString = -4 +) + +//ErrReplacementError is a general error during parsing the replacement text +var ErrReplacementError = errors.New("Replacement pattern error.") + +// NewReplacerData will populate a reusable replacer data struct based on the given replacement string +// and the capture group data from a regexp +func NewReplacerData(rep string, caps map[int]int, capsize int, capnames map[string]int, op RegexOptions) (*ReplacerData, error) { + p := parser{ + options: op, + caps: caps, + capsize: capsize, + capnames: capnames, + } + p.setPattern(rep) + concat, err := p.scanReplacement() + if err != nil { + return nil, err + } + + if concat.t != ntConcatenate { + panic(ErrReplacementError) + } + + sb := &bytes.Buffer{} + var ( + strings []string + rules []int + ) + + for _, child := range concat.children { + switch child.t { + case ntMulti: + child.writeStrToBuf(sb) + + case ntOne: + sb.WriteRune(child.ch) + + case ntRef: + if sb.Len() > 0 { + rules = append(rules, len(strings)) + strings = append(strings, sb.String()) + sb.Reset() + } + slot := child.m + + if len(caps) > 0 && slot >= 0 { + slot = caps[slot] + } + + rules = append(rules, -replaceSpecials-1-slot) + + default: + panic(ErrReplacementError) + } + } + + if sb.Len() > 0 { + rules = append(rules, len(strings)) + strings = append(strings, sb.String()) + } + + return &ReplacerData{ + Rep: rep, + Strings: strings, + Rules: rules, + }, nil +} diff --git a/vendor/github.com/dlclark/regexp2/syntax/tree.go b/vendor/github.com/dlclark/regexp2/syntax/tree.go new file mode 100644 index 0000000..ea28829 --- /dev/null +++ b/vendor/github.com/dlclark/regexp2/syntax/tree.go @@ -0,0 +1,654 @@ +package syntax + +import ( + "bytes" + "fmt" + "math" + "strconv" +) + +type RegexTree struct { + root *regexNode + caps map[int]int + capnumlist []int + captop int + Capnames map[string]int + Caplist []string + options RegexOptions +} + +// It is built into a parsed tree for a regular expression. + +// Implementation notes: +// +// Since the node tree is a temporary data structure only used +// during compilation of the regexp to integer codes, it's +// designed for clarity and convenience rather than +// space efficiency. +// +// RegexNodes are built into a tree, linked by the n.children list. +// Each node also has a n.parent and n.ichild member indicating +// its parent and which child # it is in its parent's list. +// +// RegexNodes come in as many types as there are constructs in +// a regular expression, for example, "concatenate", "alternate", +// "one", "rept", "group". There are also node types for basic +// peephole optimizations, e.g., "onerep", "notsetrep", etc. +// +// Because perl 5 allows "lookback" groups that scan backwards, +// each node also gets a "direction". Normally the value of +// boolean n.backward = false. +// +// During parsing, top-level nodes are also stacked onto a parse +// stack (a stack of trees). For this purpose we have a n.next +// pointer. [Note that to save a few bytes, we could overload the +// n.parent pointer instead.] +// +// On the parse stack, each tree has a "role" - basically, the +// nonterminal in the grammar that the parser has currently +// assigned to the tree. That code is stored in n.role. +// +// Finally, some of the different kinds of nodes have data. +// Two integers (for the looping constructs) are stored in +// n.operands, an an object (either a string or a set) +// is stored in n.data +type regexNode struct { + t nodeType + children []*regexNode + str []rune + set *CharSet + ch rune + m int + n int + options RegexOptions + next *regexNode +} + +type nodeType int32 + +const ( + // The following are leaves, and correspond to primitive operations + + ntOnerep nodeType = 0 // lef,back char,min,max a {n} + ntNotonerep = 1 // lef,back char,min,max .{n} + ntSetrep = 2 // lef,back set,min,max [\d]{n} + ntOneloop = 3 // lef,back char,min,max a {,n} + ntNotoneloop = 4 // lef,back char,min,max .{,n} + ntSetloop = 5 // lef,back set,min,max [\d]{,n} + ntOnelazy = 6 // lef,back char,min,max a {,n}? + ntNotonelazy = 7 // lef,back char,min,max .{,n}? + ntSetlazy = 8 // lef,back set,min,max [\d]{,n}? + ntOne = 9 // lef char a + ntNotone = 10 // lef char [^a] + ntSet = 11 // lef set [a-z\s] \w \s \d + ntMulti = 12 // lef string abcd + ntRef = 13 // lef group \# + ntBol = 14 // ^ + ntEol = 15 // $ + ntBoundary = 16 // \b + ntNonboundary = 17 // \B + ntBeginning = 18 // \A + ntStart = 19 // \G + ntEndZ = 20 // \Z + ntEnd = 21 // \Z + + // Interior nodes do not correspond to primitive operations, but + // control structures compositing other operations + + // Concat and alternate take n children, and can run forward or backwards + + ntNothing = 22 // [] + ntEmpty = 23 // () + ntAlternate = 24 // a|b + ntConcatenate = 25 // ab + ntLoop = 26 // m,x * + ? {,} + ntLazyloop = 27 // m,x *? +? ?? {,}? + ntCapture = 28 // n () + ntGroup = 29 // (?:) + ntRequire = 30 // (?=) (?<=) + ntPrevent = 31 // (?!) (?<!) + ntGreedy = 32 // (?>) (?<) + ntTestref = 33 // (?(n) | ) + ntTestgroup = 34 // (?(...) | ) + + ntECMABoundary = 41 // \b + ntNonECMABoundary = 42 // \B +) + +func newRegexNode(t nodeType, opt RegexOptions) *regexNode { + return ®exNode{ + t: t, + options: opt, + } +} + +func newRegexNodeCh(t nodeType, opt RegexOptions, ch rune) *regexNode { + return ®exNode{ + t: t, + options: opt, + ch: ch, + } +} + +func newRegexNodeStr(t nodeType, opt RegexOptions, str []rune) *regexNode { + return ®exNode{ + t: t, + options: opt, + str: str, + } +} + +func newRegexNodeSet(t nodeType, opt RegexOptions, set *CharSet) *regexNode { + return ®exNode{ + t: t, + options: opt, + set: set, + } +} + +func newRegexNodeM(t nodeType, opt RegexOptions, m int) *regexNode { + return ®exNode{ + t: t, + options: opt, + m: m, + } +} +func newRegexNodeMN(t nodeType, opt RegexOptions, m, n int) *regexNode { + return ®exNode{ + t: t, + options: opt, + m: m, + n: n, + } +} + +func (n *regexNode) writeStrToBuf(buf *bytes.Buffer) { + for i := 0; i < len(n.str); i++ { + buf.WriteRune(n.str[i]) + } +} + +func (n *regexNode) addChild(child *regexNode) { + reduced := child.reduce() + n.children = append(n.children, reduced) + reduced.next = n +} + +func (n *regexNode) insertChildren(afterIndex int, nodes []*regexNode) { + newChildren := make([]*regexNode, 0, len(n.children)+len(nodes)) + n.children = append(append(append(newChildren, n.children[:afterIndex]...), nodes...), n.children[afterIndex:]...) +} + +// removes children including the start but not the end index +func (n *regexNode) removeChildren(startIndex, endIndex int) { + n.children = append(n.children[:startIndex], n.children[endIndex:]...) +} + +// Pass type as OneLazy or OneLoop +func (n *regexNode) makeRep(t nodeType, min, max int) { + n.t += (t - ntOne) + n.m = min + n.n = max +} + +func (n *regexNode) reduce() *regexNode { + switch n.t { + case ntAlternate: + return n.reduceAlternation() + + case ntConcatenate: + return n.reduceConcatenation() + + case ntLoop, ntLazyloop: + return n.reduceRep() + + case ntGroup: + return n.reduceGroup() + + case ntSet, ntSetloop: + return n.reduceSet() + + default: + return n + } +} + +// Basic optimization. Single-letter alternations can be replaced +// by faster set specifications, and nested alternations with no +// intervening operators can be flattened: +// +// a|b|c|def|g|h -> [a-c]|def|[gh] +// apple|(?:orange|pear)|grape -> apple|orange|pear|grape +func (n *regexNode) reduceAlternation() *regexNode { + if len(n.children) == 0 { + return newRegexNode(ntNothing, n.options) + } + + wasLastSet := false + lastNodeCannotMerge := false + var optionsLast RegexOptions + var i, j int + + for i, j = 0, 0; i < len(n.children); i, j = i+1, j+1 { + at := n.children[i] + + if j < i { + n.children[j] = at + } + + for { + if at.t == ntAlternate { + for k := 0; k < len(at.children); k++ { + at.children[k].next = n + } + n.insertChildren(i+1, at.children) + + j-- + } else if at.t == ntSet || at.t == ntOne { + // Cannot merge sets if L or I options differ, or if either are negated. + optionsAt := at.options & (RightToLeft | IgnoreCase) + + if at.t == ntSet { + if !wasLastSet || optionsLast != optionsAt || lastNodeCannotMerge || !at.set.IsMergeable() { + wasLastSet = true + lastNodeCannotMerge = !at.set.IsMergeable() + optionsLast = optionsAt + break + } + } else if !wasLastSet || optionsLast != optionsAt || lastNodeCannotMerge { + wasLastSet = true + lastNodeCannotMerge = false + optionsLast = optionsAt + break + } + + // The last node was a Set or a One, we're a Set or One and our options are the same. + // Merge the two nodes. + j-- + prev := n.children[j] + + var prevCharClass *CharSet + if prev.t == ntOne { + prevCharClass = &CharSet{} + prevCharClass.addChar(prev.ch) + } else { + prevCharClass = prev.set + } + + if at.t == ntOne { + prevCharClass.addChar(at.ch) + } else { + prevCharClass.addSet(*at.set) + } + + prev.t = ntSet + prev.set = prevCharClass + } else if at.t == ntNothing { + j-- + } else { + wasLastSet = false + lastNodeCannotMerge = false + } + break + } + } + + if j < i { + n.removeChildren(j, i) + } + + return n.stripEnation(ntNothing) +} + +// Basic optimization. Adjacent strings can be concatenated. +// +// (?:abc)(?:def) -> abcdef +func (n *regexNode) reduceConcatenation() *regexNode { + // Eliminate empties and concat adjacent strings/chars + + var optionsLast RegexOptions + var optionsAt RegexOptions + var i, j int + + if len(n.children) == 0 { + return newRegexNode(ntEmpty, n.options) + } + + wasLastString := false + + for i, j = 0, 0; i < len(n.children); i, j = i+1, j+1 { + var at, prev *regexNode + + at = n.children[i] + + if j < i { + n.children[j] = at + } + + if at.t == ntConcatenate && + ((at.options & RightToLeft) == (n.options & RightToLeft)) { + for k := 0; k < len(at.children); k++ { + at.children[k].next = n + } + + //insert at.children at i+1 index in n.children + n.insertChildren(i+1, at.children) + + j-- + } else if at.t == ntMulti || at.t == ntOne { + // Cannot merge strings if L or I options differ + optionsAt = at.options & (RightToLeft | IgnoreCase) + + if !wasLastString || optionsLast != optionsAt { + wasLastString = true + optionsLast = optionsAt + continue + } + + j-- + prev = n.children[j] + + if prev.t == ntOne { + prev.t = ntMulti + prev.str = []rune{prev.ch} + } + + if (optionsAt & RightToLeft) == 0 { + if at.t == ntOne { + prev.str = append(prev.str, at.ch) + } else { + prev.str = append(prev.str, at.str...) + } + } else { + if at.t == ntOne { + // insert at the front by expanding our slice, copying the data over, and then setting the value + prev.str = append(prev.str, 0) + copy(prev.str[1:], prev.str) + prev.str[0] = at.ch + } else { + //insert at the front...this one we'll make a new slice and copy both into it + merge := make([]rune, len(prev.str)+len(at.str)) + copy(merge, at.str) + copy(merge[len(at.str):], prev.str) + prev.str = merge + } + } + } else if at.t == ntEmpty { + j-- + } else { + wasLastString = false + } + } + + if j < i { + // remove indices j through i from the children + n.removeChildren(j, i) + } + + return n.stripEnation(ntEmpty) +} + +// Nested repeaters just get multiplied with each other if they're not +// too lumpy +func (n *regexNode) reduceRep() *regexNode { + + u := n + t := n.t + min := n.m + max := n.n + + for { + if len(u.children) == 0 { + break + } + + child := u.children[0] + + // multiply reps of the same type only + if child.t != t { + childType := child.t + + if !(childType >= ntOneloop && childType <= ntSetloop && t == ntLoop || + childType >= ntOnelazy && childType <= ntSetlazy && t == ntLazyloop) { + break + } + } + + // child can be too lumpy to blur, e.g., (a {100,105}) {3} or (a {2,})? + // [but things like (a {2,})+ are not too lumpy...] + if u.m == 0 && child.m > 1 || child.n < child.m*2 { + break + } + + u = child + if u.m > 0 { + if (math.MaxInt32-1)/u.m < min { + u.m = math.MaxInt32 + } else { + u.m = u.m * min + } + } + if u.n > 0 { + if (math.MaxInt32-1)/u.n < max { + u.n = math.MaxInt32 + } else { + u.n = u.n * max + } + } + } + + if math.MaxInt32 == min { + return newRegexNode(ntNothing, n.options) + } + return u + +} + +// Simple optimization. If a concatenation or alternation has only +// one child strip out the intermediate node. If it has zero children, +// turn it into an empty. +func (n *regexNode) stripEnation(emptyType nodeType) *regexNode { + switch len(n.children) { + case 0: + return newRegexNode(emptyType, n.options) + case 1: + return n.children[0] + default: + return n + } +} + +func (n *regexNode) reduceGroup() *regexNode { + u := n + + for u.t == ntGroup { + u = u.children[0] + } + + return u +} + +// Simple optimization. If a set is a singleton, an inverse singleton, +// or empty, it's transformed accordingly. +func (n *regexNode) reduceSet() *regexNode { + // Extract empty-set, one and not-one case as special + + if n.set == nil { + n.t = ntNothing + } else if n.set.IsSingleton() { + n.ch = n.set.SingletonChar() + n.set = nil + n.t += (ntOne - ntSet) + } else if n.set.IsSingletonInverse() { + n.ch = n.set.SingletonChar() + n.set = nil + n.t += (ntNotone - ntSet) + } + + return n +} + +func (n *regexNode) reverseLeft() *regexNode { + if n.options&RightToLeft != 0 && n.t == ntConcatenate && len(n.children) > 0 { + //reverse children order + for left, right := 0, len(n.children)-1; left < right; left, right = left+1, right-1 { + n.children[left], n.children[right] = n.children[right], n.children[left] + } + } + + return n +} + +func (n *regexNode) makeQuantifier(lazy bool, min, max int) *regexNode { + if min == 0 && max == 0 { + return newRegexNode(ntEmpty, n.options) + } + + if min == 1 && max == 1 { + return n + } + + switch n.t { + case ntOne, ntNotone, ntSet: + if lazy { + n.makeRep(Onelazy, min, max) + } else { + n.makeRep(Oneloop, min, max) + } + return n + + default: + var t nodeType + if lazy { + t = ntLazyloop + } else { + t = ntLoop + } + result := newRegexNodeMN(t, n.options, min, max) + result.addChild(n) + return result + } +} + +// debug functions + +var typeStr = []string{ + "Onerep", "Notonerep", "Setrep", + "Oneloop", "Notoneloop", "Setloop", + "Onelazy", "Notonelazy", "Setlazy", + "One", "Notone", "Set", + "Multi", "Ref", + "Bol", "Eol", "Boundary", "Nonboundary", + "Beginning", "Start", "EndZ", "End", + "Nothing", "Empty", + "Alternate", "Concatenate", + "Loop", "Lazyloop", + "Capture", "Group", "Require", "Prevent", "Greedy", + "Testref", "Testgroup", + "Unknown", "Unknown", "Unknown", + "Unknown", "Unknown", "Unknown", + "ECMABoundary", "NonECMABoundary", +} + +func (n *regexNode) description() string { + buf := &bytes.Buffer{} + + buf.WriteString(typeStr[n.t]) + + if (n.options & ExplicitCapture) != 0 { + buf.WriteString("-C") + } + if (n.options & IgnoreCase) != 0 { + buf.WriteString("-I") + } + if (n.options & RightToLeft) != 0 { + buf.WriteString("-L") + } + if (n.options & Multiline) != 0 { + buf.WriteString("-M") + } + if (n.options & Singleline) != 0 { + buf.WriteString("-S") + } + if (n.options & IgnorePatternWhitespace) != 0 { + buf.WriteString("-X") + } + if (n.options & ECMAScript) != 0 { + buf.WriteString("-E") + } + + switch n.t { + case ntOneloop, ntNotoneloop, ntOnelazy, ntNotonelazy, ntOne, ntNotone: + buf.WriteString("(Ch = " + CharDescription(n.ch) + ")") + break + case ntCapture: + buf.WriteString("(index = " + strconv.Itoa(n.m) + ", unindex = " + strconv.Itoa(n.n) + ")") + break + case ntRef, ntTestref: + buf.WriteString("(index = " + strconv.Itoa(n.m) + ")") + break + case ntMulti: + fmt.Fprintf(buf, "(String = %s)", string(n.str)) + break + case ntSet, ntSetloop, ntSetlazy: + buf.WriteString("(Set = " + n.set.String() + ")") + break + } + + switch n.t { + case ntOneloop, ntNotoneloop, ntOnelazy, ntNotonelazy, ntSetloop, ntSetlazy, ntLoop, ntLazyloop: + buf.WriteString("(Min = ") + buf.WriteString(strconv.Itoa(n.m)) + buf.WriteString(", Max = ") + if n.n == math.MaxInt32 { + buf.WriteString("inf") + } else { + buf.WriteString(strconv.Itoa(n.n)) + } + buf.WriteString(")") + + break + } + + return buf.String() +} + +var padSpace = []byte(" ") + +func (t *RegexTree) Dump() string { + return t.root.dump() +} + +func (n *regexNode) dump() string { + var stack []int + CurNode := n + CurChild := 0 + + buf := bytes.NewBufferString(CurNode.description()) + buf.WriteRune('\n') + + for { + if CurNode.children != nil && CurChild < len(CurNode.children) { + stack = append(stack, CurChild+1) + CurNode = CurNode.children[CurChild] + CurChild = 0 + + Depth := len(stack) + if Depth > 32 { + Depth = 32 + } + buf.Write(padSpace[:Depth]) + buf.WriteString(CurNode.description()) + buf.WriteRune('\n') + } else { + if len(stack) == 0 { + break + } + + CurChild = stack[len(stack)-1] + stack = stack[:len(stack)-1] + CurNode = CurNode.next + } + } + return buf.String() +} diff --git a/vendor/github.com/dlclark/regexp2/syntax/writer.go b/vendor/github.com/dlclark/regexp2/syntax/writer.go new file mode 100644 index 0000000..a5aa11c --- /dev/null +++ b/vendor/github.com/dlclark/regexp2/syntax/writer.go @@ -0,0 +1,500 @@ +package syntax + +import ( + "bytes" + "fmt" + "math" + "os" +) + +func Write(tree *RegexTree) (*Code, error) { + w := writer{ + intStack: make([]int, 0, 32), + emitted: make([]int, 2), + stringhash: make(map[string]int), + sethash: make(map[string]int), + } + + code, err := w.codeFromTree(tree) + + if tree.options&Debug > 0 && code != nil { + os.Stdout.WriteString(code.Dump()) + os.Stdout.WriteString("\n") + } + + return code, err +} + +type writer struct { + emitted []int + + intStack []int + curpos int + stringhash map[string]int + stringtable [][]rune + sethash map[string]int + settable []*CharSet + counting bool + count int + trackcount int + caps map[int]int +} + +const ( + beforeChild nodeType = 64 + afterChild = 128 + //MaxPrefixSize is the largest number of runes we'll use for a BoyerMoyer prefix + MaxPrefixSize = 50 +) + +// The top level RegexCode generator. It does a depth-first walk +// through the tree and calls EmitFragment to emits code before +// and after each child of an interior node, and at each leaf. +// +// It runs two passes, first to count the size of the generated +// code, and second to generate the code. +// +// We should time it against the alternative, which is +// to just generate the code and grow the array as we go. +func (w *writer) codeFromTree(tree *RegexTree) (*Code, error) { + var ( + curNode *regexNode + curChild int + capsize int + ) + // construct sparse capnum mapping if some numbers are unused + + if tree.capnumlist == nil || tree.captop == len(tree.capnumlist) { + capsize = tree.captop + w.caps = nil + } else { + capsize = len(tree.capnumlist) + w.caps = tree.caps + for i := 0; i < len(tree.capnumlist); i++ { + w.caps[tree.capnumlist[i]] = i + } + } + + w.counting = true + + for { + if !w.counting { + w.emitted = make([]int, w.count) + } + + curNode = tree.root + curChild = 0 + + w.emit1(Lazybranch, 0) + + for { + if len(curNode.children) == 0 { + w.emitFragment(curNode.t, curNode, 0) + } else if curChild < len(curNode.children) { + w.emitFragment(curNode.t|beforeChild, curNode, curChild) + + curNode = curNode.children[curChild] + + w.pushInt(curChild) + curChild = 0 + continue + } + + if w.emptyStack() { + break + } + + curChild = w.popInt() + curNode = curNode.next + + w.emitFragment(curNode.t|afterChild, curNode, curChild) + curChild++ + } + + w.patchJump(0, w.curPos()) + w.emit(Stop) + + if !w.counting { + break + } + + w.counting = false + } + + fcPrefix := getFirstCharsPrefix(tree) + prefix := getPrefix(tree) + rtl := (tree.options & RightToLeft) != 0 + + var bmPrefix *BmPrefix + //TODO: benchmark string prefixes + if prefix != nil && len(prefix.PrefixStr) > 0 && MaxPrefixSize > 0 { + if len(prefix.PrefixStr) > MaxPrefixSize { + // limit prefix changes to 10k + prefix.PrefixStr = prefix.PrefixStr[:MaxPrefixSize] + } + bmPrefix = newBmPrefix(prefix.PrefixStr, prefix.CaseInsensitive, rtl) + } else { + bmPrefix = nil + } + + return &Code{ + Codes: w.emitted, + Strings: w.stringtable, + Sets: w.settable, + TrackCount: w.trackcount, + Caps: w.caps, + Capsize: capsize, + FcPrefix: fcPrefix, + BmPrefix: bmPrefix, + Anchors: getAnchors(tree), + RightToLeft: rtl, + }, nil +} + +// The main RegexCode generator. It does a depth-first walk +// through the tree and calls EmitFragment to emits code before +// and after each child of an interior node, and at each leaf. +func (w *writer) emitFragment(nodetype nodeType, node *regexNode, curIndex int) error { + bits := InstOp(0) + + if nodetype <= ntRef { + if (node.options & RightToLeft) != 0 { + bits |= Rtl + } + if (node.options & IgnoreCase) != 0 { + bits |= Ci + } + } + ntBits := nodeType(bits) + + switch nodetype { + case ntConcatenate | beforeChild, ntConcatenate | afterChild, ntEmpty: + break + + case ntAlternate | beforeChild: + if curIndex < len(node.children)-1 { + w.pushInt(w.curPos()) + w.emit1(Lazybranch, 0) + } + + case ntAlternate | afterChild: + if curIndex < len(node.children)-1 { + lbPos := w.popInt() + w.pushInt(w.curPos()) + w.emit1(Goto, 0) + w.patchJump(lbPos, w.curPos()) + } else { + for i := 0; i < curIndex; i++ { + w.patchJump(w.popInt(), w.curPos()) + } + } + break + + case ntTestref | beforeChild: + if curIndex == 0 { + w.emit(Setjump) + w.pushInt(w.curPos()) + w.emit1(Lazybranch, 0) + w.emit1(Testref, w.mapCapnum(node.m)) + w.emit(Forejump) + } + + case ntTestref | afterChild: + if curIndex == 0 { + branchpos := w.popInt() + w.pushInt(w.curPos()) + w.emit1(Goto, 0) + w.patchJump(branchpos, w.curPos()) + w.emit(Forejump) + if len(node.children) <= 1 { + w.patchJump(w.popInt(), w.curPos()) + } + } else if curIndex == 1 { + w.patchJump(w.popInt(), w.curPos()) + } + + case ntTestgroup | beforeChild: + if curIndex == 0 { + w.emit(Setjump) + w.emit(Setmark) + w.pushInt(w.curPos()) + w.emit1(Lazybranch, 0) + } + + case ntTestgroup | afterChild: + if curIndex == 0 { + w.emit(Getmark) + w.emit(Forejump) + } else if curIndex == 1 { + Branchpos := w.popInt() + w.pushInt(w.curPos()) + w.emit1(Goto, 0) + w.patchJump(Branchpos, w.curPos()) + w.emit(Getmark) + w.emit(Forejump) + if len(node.children) <= 2 { + w.patchJump(w.popInt(), w.curPos()) + } + } else if curIndex == 2 { + w.patchJump(w.popInt(), w.curPos()) + } + + case ntLoop | beforeChild, ntLazyloop | beforeChild: + + if node.n < math.MaxInt32 || node.m > 1 { + if node.m == 0 { + w.emit1(Nullcount, 0) + } else { + w.emit1(Setcount, 1-node.m) + } + } else if node.m == 0 { + w.emit(Nullmark) + } else { + w.emit(Setmark) + } + + if node.m == 0 { + w.pushInt(w.curPos()) + w.emit1(Goto, 0) + } + w.pushInt(w.curPos()) + + case ntLoop | afterChild, ntLazyloop | afterChild: + + startJumpPos := w.curPos() + lazy := (nodetype - (ntLoop | afterChild)) + + if node.n < math.MaxInt32 || node.m > 1 { + if node.n == math.MaxInt32 { + w.emit2(InstOp(Branchcount+lazy), w.popInt(), math.MaxInt32) + } else { + w.emit2(InstOp(Branchcount+lazy), w.popInt(), node.n-node.m) + } + } else { + w.emit1(InstOp(Branchmark+lazy), w.popInt()) + } + + if node.m == 0 { + w.patchJump(w.popInt(), startJumpPos) + } + + case ntGroup | beforeChild, ntGroup | afterChild: + + case ntCapture | beforeChild: + w.emit(Setmark) + + case ntCapture | afterChild: + w.emit2(Capturemark, w.mapCapnum(node.m), w.mapCapnum(node.n)) + + case ntRequire | beforeChild: + // NOTE: the following line causes lookahead/lookbehind to be + // NON-BACKTRACKING. It can be commented out with (*) + w.emit(Setjump) + + w.emit(Setmark) + + case ntRequire | afterChild: + w.emit(Getmark) + + // NOTE: the following line causes lookahead/lookbehind to be + // NON-BACKTRACKING. It can be commented out with (*) + w.emit(Forejump) + + case ntPrevent | beforeChild: + w.emit(Setjump) + w.pushInt(w.curPos()) + w.emit1(Lazybranch, 0) + + case ntPrevent | afterChild: + w.emit(Backjump) + w.patchJump(w.popInt(), w.curPos()) + w.emit(Forejump) + + case ntGreedy | beforeChild: + w.emit(Setjump) + + case ntGreedy | afterChild: + w.emit(Forejump) + + case ntOne, ntNotone: + w.emit1(InstOp(node.t|ntBits), int(node.ch)) + + case ntNotoneloop, ntNotonelazy, ntOneloop, ntOnelazy: + if node.m > 0 { + if node.t == ntOneloop || node.t == ntOnelazy { + w.emit2(Onerep|bits, int(node.ch), node.m) + } else { + w.emit2(Notonerep|bits, int(node.ch), node.m) + } + } + if node.n > node.m { + if node.n == math.MaxInt32 { + w.emit2(InstOp(node.t|ntBits), int(node.ch), math.MaxInt32) + } else { + w.emit2(InstOp(node.t|ntBits), int(node.ch), node.n-node.m) + } + } + + case ntSetloop, ntSetlazy: + if node.m > 0 { + w.emit2(Setrep|bits, w.setCode(node.set), node.m) + } + if node.n > node.m { + if node.n == math.MaxInt32 { + w.emit2(InstOp(node.t|ntBits), w.setCode(node.set), math.MaxInt32) + } else { + w.emit2(InstOp(node.t|ntBits), w.setCode(node.set), node.n-node.m) + } + } + + case ntMulti: + w.emit1(InstOp(node.t|ntBits), w.stringCode(node.str)) + + case ntSet: + w.emit1(InstOp(node.t|ntBits), w.setCode(node.set)) + + case ntRef: + w.emit1(InstOp(node.t|ntBits), w.mapCapnum(node.m)) + + case ntNothing, ntBol, ntEol, ntBoundary, ntNonboundary, ntECMABoundary, ntNonECMABoundary, ntBeginning, ntStart, ntEndZ, ntEnd: + w.emit(InstOp(node.t)) + + default: + return fmt.Errorf("unexpected opcode in regular expression generation: %v", nodetype) + } + + return nil +} + +// To avoid recursion, we use a simple integer stack. +// This is the push. +func (w *writer) pushInt(i int) { + w.intStack = append(w.intStack, i) +} + +// Returns true if the stack is empty. +func (w *writer) emptyStack() bool { + return len(w.intStack) == 0 +} + +// This is the pop. +func (w *writer) popInt() int { + //get our item + idx := len(w.intStack) - 1 + i := w.intStack[idx] + //trim our slice + w.intStack = w.intStack[:idx] + return i +} + +// Returns the current position in the emitted code. +func (w *writer) curPos() int { + return w.curpos +} + +// Fixes up a jump instruction at the specified offset +// so that it jumps to the specified jumpDest. +func (w *writer) patchJump(offset, jumpDest int) { + w.emitted[offset+1] = jumpDest +} + +// Returns an index in the set table for a charset +// uses a map to eliminate duplicates. +func (w *writer) setCode(set *CharSet) int { + if w.counting { + return 0 + } + + buf := &bytes.Buffer{} + + set.mapHashFill(buf) + hash := buf.String() + i, ok := w.sethash[hash] + if !ok { + i = len(w.sethash) + w.sethash[hash] = i + w.settable = append(w.settable, set) + } + return i +} + +// Returns an index in the string table for a string. +// uses a map to eliminate duplicates. +func (w *writer) stringCode(str []rune) int { + if w.counting { + return 0 + } + + hash := string(str) + i, ok := w.stringhash[hash] + if !ok { + i = len(w.stringhash) + w.stringhash[hash] = i + w.stringtable = append(w.stringtable, str) + } + + return i +} + +// When generating code on a regex that uses a sparse set +// of capture slots, we hash them to a dense set of indices +// for an array of capture slots. Instead of doing the hash +// at match time, it's done at compile time, here. +func (w *writer) mapCapnum(capnum int) int { + if capnum == -1 { + return -1 + } + + if w.caps != nil { + return w.caps[capnum] + } + + return capnum +} + +// Emits a zero-argument operation. Note that the emit +// functions all run in two modes: they can emit code, or +// they can just count the size of the code. +func (w *writer) emit(op InstOp) { + if w.counting { + w.count++ + if opcodeBacktracks(op) { + w.trackcount++ + } + return + } + w.emitted[w.curpos] = int(op) + w.curpos++ +} + +// Emits a one-argument operation. +func (w *writer) emit1(op InstOp, opd1 int) { + if w.counting { + w.count += 2 + if opcodeBacktracks(op) { + w.trackcount++ + } + return + } + w.emitted[w.curpos] = int(op) + w.curpos++ + w.emitted[w.curpos] = opd1 + w.curpos++ +} + +// Emits a two-argument operation. +func (w *writer) emit2(op InstOp, opd1, opd2 int) { + if w.counting { + w.count += 3 + if opcodeBacktracks(op) { + w.trackcount++ + } + return + } + w.emitted[w.curpos] = int(op) + w.curpos++ + w.emitted[w.curpos] = opd1 + w.curpos++ + w.emitted[w.curpos] = opd2 + w.curpos++ +} |
