aboutsummaryrefslogtreecommitdiff
path: root/vendor/github.com/dlclark/regexp2/syntax
diff options
context:
space:
mode:
Diffstat (limited to 'vendor/github.com/dlclark/regexp2/syntax')
-rw-r--r--vendor/github.com/dlclark/regexp2/syntax/charclass.go865
-rw-r--r--vendor/github.com/dlclark/regexp2/syntax/code.go274
-rw-r--r--vendor/github.com/dlclark/regexp2/syntax/escape.go94
-rw-r--r--vendor/github.com/dlclark/regexp2/syntax/fuzz.go20
-rw-r--r--vendor/github.com/dlclark/regexp2/syntax/parser.go2251
-rw-r--r--vendor/github.com/dlclark/regexp2/syntax/prefix.go896
-rw-r--r--vendor/github.com/dlclark/regexp2/syntax/replacerdata.go87
-rw-r--r--vendor/github.com/dlclark/regexp2/syntax/tree.go654
-rw-r--r--vendor/github.com/dlclark/regexp2/syntax/writer.go500
9 files changed, 5641 insertions, 0 deletions
diff --git a/vendor/github.com/dlclark/regexp2/syntax/charclass.go b/vendor/github.com/dlclark/regexp2/syntax/charclass.go
new file mode 100644
index 0000000..6881a0e
--- /dev/null
+++ b/vendor/github.com/dlclark/regexp2/syntax/charclass.go
@@ -0,0 +1,865 @@
1package syntax
2
3import (
4 "bytes"
5 "encoding/binary"
6 "fmt"
7 "sort"
8 "unicode"
9 "unicode/utf8"
10)
11
12// CharSet combines start-end rune ranges and unicode categories representing a set of characters
13type CharSet struct {
14 ranges []singleRange
15 categories []category
16 sub *CharSet //optional subtractor
17 negate bool
18 anything bool
19}
20
21type category struct {
22 negate bool
23 cat string
24}
25
26type singleRange struct {
27 first rune
28 last rune
29}
30
31const (
32 spaceCategoryText = " "
33 wordCategoryText = "W"
34)
35
36var (
37 ecmaSpace = []rune{0x0009, 0x000e, 0x0020, 0x0021, 0x00a0, 0x00a1, 0x1680, 0x1681, 0x2000, 0x200b, 0x2028, 0x202a, 0x202f, 0x2030, 0x205f, 0x2060, 0x3000, 0x3001, 0xfeff, 0xff00}
38 ecmaWord = []rune{0x0030, 0x003a, 0x0041, 0x005b, 0x005f, 0x0060, 0x0061, 0x007b}
39 ecmaDigit = []rune{0x0030, 0x003a}
40
41 re2Space = []rune{0x0009, 0x000b, 0x000c, 0x000e, 0x0020, 0x0021}
42)
43
44var (
45 AnyClass = getCharSetFromOldString([]rune{0}, false)
46 ECMAAnyClass = getCharSetFromOldString([]rune{0, 0x000a, 0x000b, 0x000d, 0x000e}, false)
47 NoneClass = getCharSetFromOldString(nil, false)
48 ECMAWordClass = getCharSetFromOldString(ecmaWord, false)
49 NotECMAWordClass = getCharSetFromOldString(ecmaWord, true)
50 ECMASpaceClass = getCharSetFromOldString(ecmaSpace, false)
51 NotECMASpaceClass = getCharSetFromOldString(ecmaSpace, true)
52 ECMADigitClass = getCharSetFromOldString(ecmaDigit, false)
53 NotECMADigitClass = getCharSetFromOldString(ecmaDigit, true)
54
55 WordClass = getCharSetFromCategoryString(false, false, wordCategoryText)
56 NotWordClass = getCharSetFromCategoryString(true, false, wordCategoryText)
57 SpaceClass = getCharSetFromCategoryString(false, false, spaceCategoryText)
58 NotSpaceClass = getCharSetFromCategoryString(true, false, spaceCategoryText)
59 DigitClass = getCharSetFromCategoryString(false, false, "Nd")
60 NotDigitClass = getCharSetFromCategoryString(false, true, "Nd")
61
62 RE2SpaceClass = getCharSetFromOldString(re2Space, false)
63 NotRE2SpaceClass = getCharSetFromOldString(re2Space, true)
64)
65
66var unicodeCategories = func() map[string]*unicode.RangeTable {
67 retVal := make(map[string]*unicode.RangeTable)
68 for k, v := range unicode.Scripts {
69 retVal[k] = v
70 }
71 for k, v := range unicode.Categories {
72 retVal[k] = v
73 }
74 for k, v := range unicode.Properties {
75 retVal[k] = v
76 }
77 return retVal
78}()
79
80func getCharSetFromCategoryString(negateSet bool, negateCat bool, cats ...string) func() *CharSet {
81 if negateCat && negateSet {
82 panic("BUG! You should only negate the set OR the category in a constant setup, but not both")
83 }
84
85 c := CharSet{negate: negateSet}
86
87 c.categories = make([]category, len(cats))
88 for i, cat := range cats {
89 c.categories[i] = category{cat: cat, negate: negateCat}
90 }
91 return func() *CharSet {
92 //make a copy each time
93 local := c
94 //return that address
95 return &local
96 }
97}
98
99func getCharSetFromOldString(setText []rune, negate bool) func() *CharSet {
100 c := CharSet{}
101 if len(setText) > 0 {
102 fillFirst := false
103 l := len(setText)
104 if negate {
105 if setText[0] == 0 {
106 setText = setText[1:]
107 } else {
108 l++
109 fillFirst = true
110 }
111 }
112
113 if l%2 == 0 {
114 c.ranges = make([]singleRange, l/2)
115 } else {
116 c.ranges = make([]singleRange, l/2+1)
117 }
118
119 first := true
120 if fillFirst {
121 c.ranges[0] = singleRange{first: 0}
122 first = false
123 }
124
125 i := 0
126 for _, r := range setText {
127 if first {
128 // lower bound in a new range
129 c.ranges[i] = singleRange{first: r}
130 first = false
131 } else {
132 c.ranges[i].last = r - 1
133 i++
134 first = true
135 }
136 }
137 if !first {
138 c.ranges[i].last = utf8.MaxRune
139 }
140 }
141
142 return func() *CharSet {
143 local := c
144 return &local
145 }
146}
147
148// Copy makes a deep copy to prevent accidental mutation of a set
149func (c CharSet) Copy() CharSet {
150 ret := CharSet{
151 anything: c.anything,
152 negate: c.negate,
153 }
154
155 ret.ranges = append(ret.ranges, c.ranges...)
156 ret.categories = append(ret.categories, c.categories...)
157
158 if c.sub != nil {
159 sub := c.sub.Copy()
160 ret.sub = &sub
161 }
162
163 return ret
164}
165
166// gets a human-readable description for a set string
167func (c CharSet) String() string {
168 buf := &bytes.Buffer{}
169 buf.WriteRune('[')
170
171 if c.IsNegated() {
172 buf.WriteRune('^')
173 }
174
175 for _, r := range c.ranges {
176
177 buf.WriteString(CharDescription(r.first))
178 if r.first != r.last {
179 if r.last-r.first != 1 {
180 //groups that are 1 char apart skip the dash
181 buf.WriteRune('-')
182 }
183 buf.WriteString(CharDescription(r.last))
184 }
185 }
186
187 for _, c := range c.categories {
188 buf.WriteString(c.String())
189 }
190
191 if c.sub != nil {
192 buf.WriteRune('-')
193 buf.WriteString(c.sub.String())
194 }
195
196 buf.WriteRune(']')
197
198 return buf.String()
199}
200
201// mapHashFill converts a charset into a buffer for use in maps
202func (c CharSet) mapHashFill(buf *bytes.Buffer) {
203 if c.negate {
204 buf.WriteByte(0)
205 } else {
206 buf.WriteByte(1)
207 }
208
209 binary.Write(buf, binary.LittleEndian, len(c.ranges))
210 binary.Write(buf, binary.LittleEndian, len(c.categories))
211 for _, r := range c.ranges {
212 buf.WriteRune(r.first)
213 buf.WriteRune(r.last)
214 }
215 for _, ct := range c.categories {
216 buf.WriteString(ct.cat)
217 if ct.negate {
218 buf.WriteByte(1)
219 } else {
220 buf.WriteByte(0)
221 }
222 }
223
224 if c.sub != nil {
225 c.sub.mapHashFill(buf)
226 }
227}
228
229// CharIn returns true if the rune is in our character set (either ranges or categories).
230// It handles negations and subtracted sub-charsets.
231func (c CharSet) CharIn(ch rune) bool {
232 val := false
233 // in s && !s.subtracted
234
235 //check ranges
236 for _, r := range c.ranges {
237 if ch < r.first {
238 continue
239 }
240 if ch <= r.last {
241 val = true
242 break
243 }
244 }
245
246 //check categories if we haven't already found a range
247 if !val && len(c.categories) > 0 {
248 for _, ct := range c.categories {
249 // special categories...then unicode
250 if ct.cat == spaceCategoryText {
251 if unicode.IsSpace(ch) {
252 // we found a space so we're done
253 // negate means this is a "bad" thing
254 val = !ct.negate
255 break
256 } else if ct.negate {
257 val = true
258 break
259 }
260 } else if ct.cat == wordCategoryText {
261 if IsWordChar(ch) {
262 val = !ct.negate
263 break
264 } else if ct.negate {
265 val = true
266 break
267 }
268 } else if unicode.Is(unicodeCategories[ct.cat], ch) {
269 // if we're in this unicode category then we're done
270 // if negate=true on this category then we "failed" our test
271 // otherwise we're good that we found it
272 val = !ct.negate
273 break
274 } else if ct.negate {
275 val = true
276 break
277 }
278 }
279 }
280
281 // negate the whole char set
282 if c.negate {
283 val = !val
284 }
285
286 // get subtracted recurse
287 if val && c.sub != nil {
288 val = !c.sub.CharIn(ch)
289 }
290
291 //log.Printf("Char '%v' in %v == %v", string(ch), c.String(), val)
292 return val
293}
294
295func (c category) String() string {
296 switch c.cat {
297 case spaceCategoryText:
298 if c.negate {
299 return "\\S"
300 }
301 return "\\s"
302 case wordCategoryText:
303 if c.negate {
304 return "\\W"
305 }
306 return "\\w"
307 }
308 if _, ok := unicodeCategories[c.cat]; ok {
309
310 if c.negate {
311 return "\\P{" + c.cat + "}"
312 }
313 return "\\p{" + c.cat + "}"
314 }
315 return "Unknown category: " + c.cat
316}
317
318// CharDescription Produces a human-readable description for a single character.
319func CharDescription(ch rune) string {
320 /*if ch == '\\' {
321 return "\\\\"
322 }
323
324 if ch > ' ' && ch <= '~' {
325 return string(ch)
326 } else if ch == '\n' {
327 return "\\n"
328 } else if ch == ' ' {
329 return "\\ "
330 }*/
331
332 b := &bytes.Buffer{}
333 escape(b, ch, false) //fmt.Sprintf("%U", ch)
334 return b.String()
335}
336
337// According to UTS#18 Unicode Regular Expressions (http://www.unicode.org/reports/tr18/)
338// RL 1.4 Simple Word Boundaries The class of <word_character> includes all Alphabetic
339// values from the Unicode character database, from UnicodeData.txt [UData], plus the U+200C
340// ZERO WIDTH NON-JOINER and U+200D ZERO WIDTH JOINER.
341func IsWordChar(r rune) bool {
342 //"L", "Mn", "Nd", "Pc"
343 return unicode.In(r,
344 unicode.Categories["L"], unicode.Categories["Mn"],
345 unicode.Categories["Nd"], unicode.Categories["Pc"]) || r == '\u200D' || r == '\u200C'
346 //return 'A' <= r && r <= 'Z' || 'a' <= r && r <= 'z' || '0' <= r && r <= '9' || r == '_'
347}
348
349func IsECMAWordChar(r rune) bool {
350 return unicode.In(r,
351 unicode.Categories["L"], unicode.Categories["Mn"],
352 unicode.Categories["Nd"], unicode.Categories["Pc"])
353
354 //return 'A' <= r && r <= 'Z' || 'a' <= r && r <= 'z' || '0' <= r && r <= '9' || r == '_'
355}
356
357// SingletonChar will return the char from the first range without validation.
358// It assumes you have checked for IsSingleton or IsSingletonInverse and will panic given bad input
359func (c CharSet) SingletonChar() rune {
360 return c.ranges[0].first
361}
362
363func (c CharSet) IsSingleton() bool {
364 return !c.negate && //negated is multiple chars
365 len(c.categories) == 0 && len(c.ranges) == 1 && // multiple ranges and unicode classes represent multiple chars
366 c.sub == nil && // subtraction means we've got multiple chars
367 c.ranges[0].first == c.ranges[0].last // first and last equal means we're just 1 char
368}
369
370func (c CharSet) IsSingletonInverse() bool {
371 return c.negate && //same as above, but requires negated
372 len(c.categories) == 0 && len(c.ranges) == 1 && // multiple ranges and unicode classes represent multiple chars
373 c.sub == nil && // subtraction means we've got multiple chars
374 c.ranges[0].first == c.ranges[0].last // first and last equal means we're just 1 char
375}
376
377func (c CharSet) IsMergeable() bool {
378 return !c.IsNegated() && !c.HasSubtraction()
379}
380
381func (c CharSet) IsNegated() bool {
382 return c.negate
383}
384
385func (c CharSet) HasSubtraction() bool {
386 return c.sub != nil
387}
388
389func (c CharSet) IsEmpty() bool {
390 return len(c.ranges) == 0 && len(c.categories) == 0 && c.sub == nil
391}
392
393func (c *CharSet) addDigit(ecma, negate bool, pattern string) {
394 if ecma {
395 if negate {
396 c.addRanges(NotECMADigitClass().ranges)
397 } else {
398 c.addRanges(ECMADigitClass().ranges)
399 }
400 } else {
401 c.addCategories(category{cat: "Nd", negate: negate})
402 }
403}
404
405func (c *CharSet) addChar(ch rune) {
406 c.addRange(ch, ch)
407}
408
409func (c *CharSet) addSpace(ecma, re2, negate bool) {
410 if ecma {
411 if negate {
412 c.addRanges(NotECMASpaceClass().ranges)
413 } else {
414 c.addRanges(ECMASpaceClass().ranges)
415 }
416 } else if re2 {
417 if negate {
418 c.addRanges(NotRE2SpaceClass().ranges)
419 } else {
420 c.addRanges(RE2SpaceClass().ranges)
421 }
422 } else {
423 c.addCategories(category{cat: spaceCategoryText, negate: negate})
424 }
425}
426
427func (c *CharSet) addWord(ecma, negate bool) {
428 if ecma {
429 if negate {
430 c.addRanges(NotECMAWordClass().ranges)
431 } else {
432 c.addRanges(ECMAWordClass().ranges)
433 }
434 } else {
435 c.addCategories(category{cat: wordCategoryText, negate: negate})
436 }
437}
438
439// Add set ranges and categories into ours -- no deduping or anything
440func (c *CharSet) addSet(set CharSet) {
441 if c.anything {
442 return
443 }
444 if set.anything {
445 c.makeAnything()
446 return
447 }
448 // just append here to prevent double-canon
449 c.ranges = append(c.ranges, set.ranges...)
450 c.addCategories(set.categories...)
451 c.canonicalize()
452}
453
454func (c *CharSet) makeAnything() {
455 c.anything = true
456 c.categories = []category{}
457 c.ranges = AnyClass().ranges
458}
459
460func (c *CharSet) addCategories(cats ...category) {
461 // don't add dupes and remove positive+negative
462 if c.anything {
463 // if we've had a previous positive+negative group then
464 // just return, we're as broad as we can get
465 return
466 }
467
468 for _, ct := range cats {
469 found := false
470 for _, ct2 := range c.categories {
471 if ct.cat == ct2.cat {
472 if ct.negate != ct2.negate {
473 // oposite negations...this mean we just
474 // take us as anything and move on
475 c.makeAnything()
476 return
477 }
478 found = true
479 break
480 }
481 }
482
483 if !found {
484 c.categories = append(c.categories, ct)
485 }
486 }
487}
488
489// Merges new ranges to our own
490func (c *CharSet) addRanges(ranges []singleRange) {
491 if c.anything {
492 return
493 }
494 c.ranges = append(c.ranges, ranges...)
495 c.canonicalize()
496}
497
498// Merges everything but the new ranges into our own
499func (c *CharSet) addNegativeRanges(ranges []singleRange) {
500 if c.anything {
501 return
502 }
503
504 var hi rune
505
506 // convert incoming ranges into opposites, assume they are in order
507 for _, r := range ranges {
508 if hi < r.first {
509 c.ranges = append(c.ranges, singleRange{hi, r.first - 1})
510 }
511 hi = r.last + 1
512 }
513
514 if hi < utf8.MaxRune {
515 c.ranges = append(c.ranges, singleRange{hi, utf8.MaxRune})
516 }
517
518 c.canonicalize()
519}
520
521func isValidUnicodeCat(catName string) bool {
522 _, ok := unicodeCategories[catName]
523 return ok
524}
525
526func (c *CharSet) addCategory(categoryName string, negate, caseInsensitive bool, pattern string) {
527 if !isValidUnicodeCat(categoryName) {
528 // unknown unicode category, script, or property "blah"
529 panic(fmt.Errorf("Unknown unicode category, script, or property '%v'", categoryName))
530
531 }
532
533 if caseInsensitive && (categoryName == "Ll" || categoryName == "Lu" || categoryName == "Lt") {
534 // when RegexOptions.IgnoreCase is specified then {Ll} {Lu} and {Lt} cases should all match
535 c.addCategories(
536 category{cat: "Ll", negate: negate},
537 category{cat: "Lu", negate: negate},
538 category{cat: "Lt", negate: negate})
539 }
540 c.addCategories(category{cat: categoryName, negate: negate})
541}
542
543func (c *CharSet) addSubtraction(sub *CharSet) {
544 c.sub = sub
545}
546
547func (c *CharSet) addRange(chMin, chMax rune) {
548 c.ranges = append(c.ranges, singleRange{first: chMin, last: chMax})
549 c.canonicalize()
550}
551
552func (c *CharSet) addNamedASCII(name string, negate bool) bool {
553 var rs []singleRange
554
555 switch name {
556 case "alnum":
557 rs = []singleRange{singleRange{'0', '9'}, singleRange{'A', 'Z'}, singleRange{'a', 'z'}}
558 case "alpha":
559 rs = []singleRange{singleRange{'A', 'Z'}, singleRange{'a', 'z'}}
560 case "ascii":
561 rs = []singleRange{singleRange{0, 0x7f}}
562 case "blank":
563 rs = []singleRange{singleRange{'\t', '\t'}, singleRange{' ', ' '}}
564 case "cntrl":
565 rs = []singleRange{singleRange{0, 0x1f}, singleRange{0x7f, 0x7f}}
566 case "digit":
567 c.addDigit(false, negate, "")
568 case "graph":
569 rs = []singleRange{singleRange{'!', '~'}}
570 case "lower":
571 rs = []singleRange{singleRange{'a', 'z'}}
572 case "print":
573 rs = []singleRange{singleRange{' ', '~'}}
574 case "punct": //[!-/:-@[-`{-~]
575 rs = []singleRange{singleRange{'!', '/'}, singleRange{':', '@'}, singleRange{'[', '`'}, singleRange{'{', '~'}}
576 case "space":
577 c.addSpace(true, false, negate)
578 case "upper":
579 rs = []singleRange{singleRange{'A', 'Z'}}
580 case "word":
581 c.addWord(true, negate)
582 case "xdigit":
583 rs = []singleRange{singleRange{'0', '9'}, singleRange{'A', 'F'}, singleRange{'a', 'f'}}
584 default:
585 return false
586 }
587
588 if len(rs) > 0 {
589 if negate {
590 c.addNegativeRanges(rs)
591 } else {
592 c.addRanges(rs)
593 }
594 }
595
596 return true
597}
598
599type singleRangeSorter []singleRange
600
601func (p singleRangeSorter) Len() int { return len(p) }
602func (p singleRangeSorter) Less(i, j int) bool { return p[i].first < p[j].first }
603func (p singleRangeSorter) Swap(i, j int) { p[i], p[j] = p[j], p[i] }
604
605// Logic to reduce a character class to a unique, sorted form.
606func (c *CharSet) canonicalize() {
607 var i, j int
608 var last rune
609
610 //
611 // Find and eliminate overlapping or abutting ranges
612 //
613
614 if len(c.ranges) > 1 {
615 sort.Sort(singleRangeSorter(c.ranges))
616
617 done := false
618
619 for i, j = 1, 0; ; i++ {
620 for last = c.ranges[j].last; ; i++ {
621 if i == len(c.ranges) || last == utf8.MaxRune {
622 done = true
623 break
624 }
625
626 CurrentRange := c.ranges[i]
627 if CurrentRange.first > last+1 {
628 break
629 }
630
631 if last < CurrentRange.last {
632 last = CurrentRange.last
633 }
634 }
635
636 c.ranges[j] = singleRange{first: c.ranges[j].first, last: last}
637
638 j++
639
640 if done {
641 break
642 }
643
644 if j < i {
645 c.ranges[j] = c.ranges[i]
646 }
647 }
648
649 c.ranges = append(c.ranges[:j], c.ranges[len(c.ranges):]...)
650 }
651}
652
653// Adds to the class any lowercase versions of characters already
654// in the class. Used for case-insensitivity.
655func (c *CharSet) addLowercase() {
656 if c.anything {
657 return
658 }
659 toAdd := []singleRange{}
660 for i := 0; i < len(c.ranges); i++ {
661 r := c.ranges[i]
662 if r.first == r.last {
663 lower := unicode.ToLower(r.first)
664 c.ranges[i] = singleRange{first: lower, last: lower}
665 } else {
666 toAdd = append(toAdd, r)
667 }
668 }
669
670 for _, r := range toAdd {
671 c.addLowercaseRange(r.first, r.last)
672 }
673 c.canonicalize()
674}
675
676/**************************************************************************
677 Let U be the set of Unicode character values and let L be the lowercase
678 function, mapping from U to U. To perform case insensitive matching of
679 character sets, we need to be able to map an interval I in U, say
680
681 I = [chMin, chMax] = { ch : chMin <= ch <= chMax }
682
683 to a set A such that A contains L(I) and A is contained in the union of
684 I and L(I).
685
686 The table below partitions U into intervals on which L is non-decreasing.
687 Thus, for any interval J = [a, b] contained in one of these intervals,
688 L(J) is contained in [L(a), L(b)].
689
690 It is also true that for any such J, [L(a), L(b)] is contained in the
691 union of J and L(J). This does not follow from L being non-decreasing on
692 these intervals. It follows from the nature of the L on each interval.
693 On each interval, L has one of the following forms:
694
695 (1) L(ch) = constant (LowercaseSet)
696 (2) L(ch) = ch + offset (LowercaseAdd)
697 (3) L(ch) = ch | 1 (LowercaseBor)
698 (4) L(ch) = ch + (ch & 1) (LowercaseBad)
699
700 It is easy to verify that for any of these forms [L(a), L(b)] is
701 contained in the union of [a, b] and L([a, b]).
702***************************************************************************/
703
704const (
705 LowercaseSet = 0 // Set to arg.
706 LowercaseAdd = 1 // Add arg.
707 LowercaseBor = 2 // Bitwise or with 1.
708 LowercaseBad = 3 // Bitwise and with 1 and add original.
709)
710
711type lcMap struct {
712 chMin, chMax rune
713 op, data int32
714}
715
716var lcTable = []lcMap{
717 lcMap{'\u0041', '\u005A', LowercaseAdd, 32},
718 lcMap{'\u00C0', '\u00DE', LowercaseAdd, 32},
719 lcMap{'\u0100', '\u012E', LowercaseBor, 0},
720 lcMap{'\u0130', '\u0130', LowercaseSet, 0x0069},
721 lcMap{'\u0132', '\u0136', LowercaseBor, 0},
722 lcMap{'\u0139', '\u0147', LowercaseBad, 0},
723 lcMap{'\u014A', '\u0176', LowercaseBor, 0},
724 lcMap{'\u0178', '\u0178', LowercaseSet, 0x00FF},
725 lcMap{'\u0179', '\u017D', LowercaseBad, 0},
726 lcMap{'\u0181', '\u0181', LowercaseSet, 0x0253},
727 lcMap{'\u0182', '\u0184', LowercaseBor, 0},
728 lcMap{'\u0186', '\u0186', LowercaseSet, 0x0254},
729 lcMap{'\u0187', '\u0187', LowercaseSet, 0x0188},
730 lcMap{'\u0189', '\u018A', LowercaseAdd, 205},
731 lcMap{'\u018B', '\u018B', LowercaseSet, 0x018C},
732 lcMap{'\u018E', '\u018E', LowercaseSet, 0x01DD},
733 lcMap{'\u018F', '\u018F', LowercaseSet, 0x0259},
734 lcMap{'\u0190', '\u0190', LowercaseSet, 0x025B},
735 lcMap{'\u0191', '\u0191', LowercaseSet, 0x0192},
736 lcMap{'\u0193', '\u0193', LowercaseSet, 0x0260},
737 lcMap{'\u0194', '\u0194', LowercaseSet, 0x0263},
738 lcMap{'\u0196', '\u0196', LowercaseSet, 0x0269},
739 lcMap{'\u0197', '\u0197', LowercaseSet, 0x0268},
740 lcMap{'\u0198', '\u0198', LowercaseSet, 0x0199},
741 lcMap{'\u019C', '\u019C', LowercaseSet, 0x026F},
742 lcMap{'\u019D', '\u019D', LowercaseSet, 0x0272},
743 lcMap{'\u019F', '\u019F', LowercaseSet, 0x0275},
744 lcMap{'\u01A0', '\u01A4', LowercaseBor, 0},
745 lcMap{'\u01A7', '\u01A7', LowercaseSet, 0x01A8},
746 lcMap{'\u01A9', '\u01A9', LowercaseSet, 0x0283},
747 lcMap{'\u01AC', '\u01AC', LowercaseSet, 0x01AD},
748 lcMap{'\u01AE', '\u01AE', LowercaseSet, 0x0288},
749 lcMap{'\u01AF', '\u01AF', LowercaseSet, 0x01B0},
750 lcMap{'\u01B1', '\u01B2', LowercaseAdd, 217},
751 lcMap{'\u01B3', '\u01B5', LowercaseBad, 0},
752 lcMap{'\u01B7', '\u01B7', LowercaseSet, 0x0292},
753 lcMap{'\u01B8', '\u01B8', LowercaseSet, 0x01B9},
754 lcMap{'\u01BC', '\u01BC', LowercaseSet, 0x01BD},
755 lcMap{'\u01C4', '\u01C5', LowercaseSet, 0x01C6},
756 lcMap{'\u01C7', '\u01C8', LowercaseSet, 0x01C9},
757 lcMap{'\u01CA', '\u01CB', LowercaseSet, 0x01CC},
758 lcMap{'\u01CD', '\u01DB', LowercaseBad, 0},
759 lcMap{'\u01DE', '\u01EE', LowercaseBor, 0},
760 lcMap{'\u01F1', '\u01F2', LowercaseSet, 0x01F3},
761 lcMap{'\u01F4', '\u01F4', LowercaseSet, 0x01F5},
762 lcMap{'\u01FA', '\u0216', LowercaseBor, 0},
763 lcMap{'\u0386', '\u0386', LowercaseSet, 0x03AC},
764 lcMap{'\u0388', '\u038A', LowercaseAdd, 37},
765 lcMap{'\u038C', '\u038C', LowercaseSet, 0x03CC},
766 lcMap{'\u038E', '\u038F', LowercaseAdd, 63},
767 lcMap{'\u0391', '\u03AB', LowercaseAdd, 32},
768 lcMap{'\u03E2', '\u03EE', LowercaseBor, 0},
769 lcMap{'\u0401', '\u040F', LowercaseAdd, 80},
770 lcMap{'\u0410', '\u042F', LowercaseAdd, 32},
771 lcMap{'\u0460', '\u0480', LowercaseBor, 0},
772 lcMap{'\u0490', '\u04BE', LowercaseBor, 0},
773 lcMap{'\u04C1', '\u04C3', LowercaseBad, 0},
774 lcMap{'\u04C7', '\u04C7', LowercaseSet, 0x04C8},
775 lcMap{'\u04CB', '\u04CB', LowercaseSet, 0x04CC},
776 lcMap{'\u04D0', '\u04EA', LowercaseBor, 0},
777 lcMap{'\u04EE', '\u04F4', LowercaseBor, 0},
778 lcMap{'\u04F8', '\u04F8', LowercaseSet, 0x04F9},
779 lcMap{'\u0531', '\u0556', LowercaseAdd, 48},
780 lcMap{'\u10A0', '\u10C5', LowercaseAdd, 48},
781 lcMap{'\u1E00', '\u1EF8', LowercaseBor, 0},
782 lcMap{'\u1F08', '\u1F0F', LowercaseAdd, -8},
783 lcMap{'\u1F18', '\u1F1F', LowercaseAdd, -8},
784 lcMap{'\u1F28', '\u1F2F', LowercaseAdd, -8},
785 lcMap{'\u1F38', '\u1F3F', LowercaseAdd, -8},
786 lcMap{'\u1F48', '\u1F4D', LowercaseAdd, -8},
787 lcMap{'\u1F59', '\u1F59', LowercaseSet, 0x1F51},
788 lcMap{'\u1F5B', '\u1F5B', LowercaseSet, 0x1F53},
789 lcMap{'\u1F5D', '\u1F5D', LowercaseSet, 0x1F55},
790 lcMap{'\u1F5F', '\u1F5F', LowercaseSet, 0x1F57},
791 lcMap{'\u1F68', '\u1F6F', LowercaseAdd, -8},
792 lcMap{'\u1F88', '\u1F8F', LowercaseAdd, -8},
793 lcMap{'\u1F98', '\u1F9F', LowercaseAdd, -8},
794 lcMap{'\u1FA8', '\u1FAF', LowercaseAdd, -8},
795 lcMap{'\u1FB8', '\u1FB9', LowercaseAdd, -8},
796 lcMap{'\u1FBA', '\u1FBB', LowercaseAdd, -74},
797 lcMap{'\u1FBC', '\u1FBC', LowercaseSet, 0x1FB3},
798 lcMap{'\u1FC8', '\u1FCB', LowercaseAdd, -86},
799 lcMap{'\u1FCC', '\u1FCC', LowercaseSet, 0x1FC3},
800 lcMap{'\u1FD8', '\u1FD9', LowercaseAdd, -8},
801 lcMap{'\u1FDA', '\u1FDB', LowercaseAdd, -100},
802 lcMap{'\u1FE8', '\u1FE9', LowercaseAdd, -8},
803 lcMap{'\u1FEA', '\u1FEB', LowercaseAdd, -112},
804 lcMap{'\u1FEC', '\u1FEC', LowercaseSet, 0x1FE5},
805 lcMap{'\u1FF8', '\u1FF9', LowercaseAdd, -128},
806 lcMap{'\u1FFA', '\u1FFB', LowercaseAdd, -126},
807 lcMap{'\u1FFC', '\u1FFC', LowercaseSet, 0x1FF3},
808 lcMap{'\u2160', '\u216F', LowercaseAdd, 16},
809 lcMap{'\u24B6', '\u24D0', LowercaseAdd, 26},
810 lcMap{'\uFF21', '\uFF3A', LowercaseAdd, 32},
811}
812
813func (c *CharSet) addLowercaseRange(chMin, chMax rune) {
814 var i, iMax, iMid int
815 var chMinT, chMaxT rune
816 var lc lcMap
817
818 for i, iMax = 0, len(lcTable); i < iMax; {
819 iMid = (i + iMax) / 2
820 if lcTable[iMid].chMax < chMin {
821 i = iMid + 1
822 } else {
823 iMax = iMid
824 }
825 }
826
827 for ; i < len(lcTable); i++ {
828 lc = lcTable[i]
829 if lc.chMin > chMax {
830 return
831 }
832 chMinT = lc.chMin
833 if chMinT < chMin {
834 chMinT = chMin
835 }
836
837 chMaxT = lc.chMax
838 if chMaxT > chMax {
839 chMaxT = chMax
840 }
841
842 switch lc.op {
843 case LowercaseSet:
844 chMinT = rune(lc.data)
845 chMaxT = rune(lc.data)
846 break
847 case LowercaseAdd:
848 chMinT += lc.data
849 chMaxT += lc.data
850 break
851 case LowercaseBor:
852 chMinT |= 1
853 chMaxT |= 1
854 break
855 case LowercaseBad:
856 chMinT += (chMinT & 1)
857 chMaxT += (chMaxT & 1)
858 break
859 }
860
861 if chMinT < chMin || chMaxT > chMax {
862 c.addRange(chMinT, chMaxT)
863 }
864 }
865}
diff --git a/vendor/github.com/dlclark/regexp2/syntax/code.go b/vendor/github.com/dlclark/regexp2/syntax/code.go
new file mode 100644
index 0000000..686e822
--- /dev/null
+++ b/vendor/github.com/dlclark/regexp2/syntax/code.go
@@ -0,0 +1,274 @@
1package syntax
2
3import (
4 "bytes"
5 "fmt"
6 "math"
7)
8
9// similar to prog.go in the go regex package...also with comment 'may not belong in this package'
10
11// File provides operator constants for use by the Builder and the Machine.
12
13// Implementation notes:
14//
15// Regexps are built into RegexCodes, which contain an operation array,
16// a string table, and some constants.
17//
18// Each operation is one of the codes below, followed by the integer
19// operands specified for each op.
20//
21// Strings and sets are indices into a string table.
22
23type InstOp int
24
25const (
26 // lef/back operands description
27
28 Onerep InstOp = 0 // lef,back char,min,max a {n}
29 Notonerep = 1 // lef,back char,min,max .{n}
30 Setrep = 2 // lef,back set,min,max [\d]{n}
31
32 Oneloop = 3 // lef,back char,min,max a {,n}
33 Notoneloop = 4 // lef,back char,min,max .{,n}
34 Setloop = 5 // lef,back set,min,max [\d]{,n}
35
36 Onelazy = 6 // lef,back char,min,max a {,n}?
37 Notonelazy = 7 // lef,back char,min,max .{,n}?
38 Setlazy = 8 // lef,back set,min,max [\d]{,n}?
39
40 One = 9 // lef char a
41 Notone = 10 // lef char [^a]
42 Set = 11 // lef set [a-z\s] \w \s \d
43
44 Multi = 12 // lef string abcd
45 Ref = 13 // lef group \#
46
47 Bol = 14 // ^
48 Eol = 15 // $
49 Boundary = 16 // \b
50 Nonboundary = 17 // \B
51 Beginning = 18 // \A
52 Start = 19 // \G
53 EndZ = 20 // \Z
54 End = 21 // \Z
55
56 Nothing = 22 // Reject!
57
58 // Primitive control structures
59
60 Lazybranch = 23 // back jump straight first
61 Branchmark = 24 // back jump branch first for loop
62 Lazybranchmark = 25 // back jump straight first for loop
63 Nullcount = 26 // back val set counter, null mark
64 Setcount = 27 // back val set counter, make mark
65 Branchcount = 28 // back jump,limit branch++ if zero<=c<limit
66 Lazybranchcount = 29 // back jump,limit same, but straight first
67 Nullmark = 30 // back save position
68 Setmark = 31 // back save position
69 Capturemark = 32 // back group define group
70 Getmark = 33 // back recall position
71 Setjump = 34 // back save backtrack state
72 Backjump = 35 // zap back to saved state
73 Forejump = 36 // zap backtracking state
74 Testref = 37 // backtrack if ref undefined
75 Goto = 38 // jump just go
76
77 Prune = 39 // prune it baby
78 Stop = 40 // done!
79
80 ECMABoundary = 41 // \b
81 NonECMABoundary = 42 // \B
82
83 // Modifiers for alternate modes
84
85 Mask = 63 // Mask to get unmodified ordinary operator
86 Rtl = 64 // bit to indicate that we're reverse scanning.
87 Back = 128 // bit to indicate that we're backtracking.
88 Back2 = 256 // bit to indicate that we're backtracking on a second branch.
89 Ci = 512 // bit to indicate that we're case-insensitive.
90)
91
92type Code struct {
93 Codes []int // the code
94 Strings [][]rune // string table
95 Sets []*CharSet //character set table
96 TrackCount int // how many instructions use backtracking
97 Caps map[int]int // mapping of user group numbers -> impl group slots
98 Capsize int // number of impl group slots
99 FcPrefix *Prefix // the set of candidate first characters (may be null)
100 BmPrefix *BmPrefix // the fixed prefix string as a Boyer-Moore machine (may be null)
101 Anchors AnchorLoc // the set of zero-length start anchors (RegexFCD.Bol, etc)
102 RightToLeft bool // true if right to left
103}
104
105func opcodeBacktracks(op InstOp) bool {
106 op &= Mask
107
108 switch op {
109 case Oneloop, Notoneloop, Setloop, Onelazy, Notonelazy, Setlazy, Lazybranch, Branchmark, Lazybranchmark,
110 Nullcount, Setcount, Branchcount, Lazybranchcount, Setmark, Capturemark, Getmark, Setjump, Backjump,
111 Forejump, Goto:
112 return true
113
114 default:
115 return false
116 }
117}
118
119func opcodeSize(op InstOp) int {
120 op &= Mask
121
122 switch op {
123 case Nothing, Bol, Eol, Boundary, Nonboundary, ECMABoundary, NonECMABoundary, Beginning, Start, EndZ,
124 End, Nullmark, Setmark, Getmark, Setjump, Backjump, Forejump, Stop:
125 return 1
126
127 case One, Notone, Multi, Ref, Testref, Goto, Nullcount, Setcount, Lazybranch, Branchmark, Lazybranchmark,
128 Prune, Set:
129 return 2
130
131 case Capturemark, Branchcount, Lazybranchcount, Onerep, Notonerep, Oneloop, Notoneloop, Onelazy, Notonelazy,
132 Setlazy, Setrep, Setloop:
133 return 3
134
135 default:
136 panic(fmt.Errorf("Unexpected op code: %v", op))
137 }
138}
139
140var codeStr = []string{
141 "Onerep", "Notonerep", "Setrep",
142 "Oneloop", "Notoneloop", "Setloop",
143 "Onelazy", "Notonelazy", "Setlazy",
144 "One", "Notone", "Set",
145 "Multi", "Ref",
146 "Bol", "Eol", "Boundary", "Nonboundary", "Beginning", "Start", "EndZ", "End",
147 "Nothing",
148 "Lazybranch", "Branchmark", "Lazybranchmark",
149 "Nullcount", "Setcount", "Branchcount", "Lazybranchcount",
150 "Nullmark", "Setmark", "Capturemark", "Getmark",
151 "Setjump", "Backjump", "Forejump", "Testref", "Goto",
152 "Prune", "Stop",
153 "ECMABoundary", "NonECMABoundary",
154}
155
156func operatorDescription(op InstOp) string {
157 desc := codeStr[op&Mask]
158 if (op & Ci) != 0 {
159 desc += "-Ci"
160 }
161 if (op & Rtl) != 0 {
162 desc += "-Rtl"
163 }
164 if (op & Back) != 0 {
165 desc += "-Back"
166 }
167 if (op & Back2) != 0 {
168 desc += "-Back2"
169 }
170
171 return desc
172}
173
174// OpcodeDescription is a humman readable string of the specific offset
175func (c *Code) OpcodeDescription(offset int) string {
176 buf := &bytes.Buffer{}
177
178 op := InstOp(c.Codes[offset])
179 fmt.Fprintf(buf, "%06d ", offset)
180
181 if opcodeBacktracks(op & Mask) {
182 buf.WriteString("*")
183 } else {
184 buf.WriteString(" ")
185 }
186 buf.WriteString(operatorDescription(op))
187 buf.WriteString("(")
188 op &= Mask
189
190 switch op {
191 case One, Notone, Onerep, Notonerep, Oneloop, Notoneloop, Onelazy, Notonelazy:
192 buf.WriteString("Ch = ")
193 buf.WriteString(CharDescription(rune(c.Codes[offset+1])))
194
195 case Set, Setrep, Setloop, Setlazy:
196 buf.WriteString("Set = ")
197 buf.WriteString(c.Sets[c.Codes[offset+1]].String())
198
199 case Multi:
200 fmt.Fprintf(buf, "String = %s", string(c.Strings[c.Codes[offset+1]]))
201
202 case Ref, Testref:
203 fmt.Fprintf(buf, "Index = %d", c.Codes[offset+1])
204
205 case Capturemark:
206 fmt.Fprintf(buf, "Index = %d", c.Codes[offset+1])
207 if c.Codes[offset+2] != -1 {
208 fmt.Fprintf(buf, ", Unindex = %d", c.Codes[offset+2])
209 }
210
211 case Nullcount, Setcount:
212 fmt.Fprintf(buf, "Value = %d", c.Codes[offset+1])
213
214 case Goto, Lazybranch, Branchmark, Lazybranchmark, Branchcount, Lazybranchcount:
215 fmt.Fprintf(buf, "Addr = %d", c.Codes[offset+1])
216 }
217
218 switch op {
219 case Onerep, Notonerep, Oneloop, Notoneloop, Onelazy, Notonelazy, Setrep, Setloop, Setlazy:
220 buf.WriteString(", Rep = ")
221 if c.Codes[offset+2] == math.MaxInt32 {
222 buf.WriteString("inf")
223 } else {
224 fmt.Fprintf(buf, "%d", c.Codes[offset+2])
225 }
226
227 case Branchcount, Lazybranchcount:
228 buf.WriteString(", Limit = ")
229 if c.Codes[offset+2] == math.MaxInt32 {
230 buf.WriteString("inf")
231 } else {
232 fmt.Fprintf(buf, "%d", c.Codes[offset+2])
233 }
234
235 }
236
237 buf.WriteString(")")
238
239 return buf.String()
240}
241
242func (c *Code) Dump() string {
243 buf := &bytes.Buffer{}
244
245 if c.RightToLeft {
246 fmt.Fprintln(buf, "Direction: right-to-left")
247 } else {
248 fmt.Fprintln(buf, "Direction: left-to-right")
249 }
250 if c.FcPrefix == nil {
251 fmt.Fprintln(buf, "Firstchars: n/a")
252 } else {
253 fmt.Fprintf(buf, "Firstchars: %v\n", c.FcPrefix.PrefixSet.String())
254 }
255
256 if c.BmPrefix == nil {
257 fmt.Fprintln(buf, "Prefix: n/a")
258 } else {
259 fmt.Fprintf(buf, "Prefix: %v\n", Escape(c.BmPrefix.String()))
260 }
261
262 fmt.Fprintf(buf, "Anchors: %v\n", c.Anchors)
263 fmt.Fprintln(buf)
264
265 if c.BmPrefix != nil {
266 fmt.Fprintln(buf, "BoyerMoore:")
267 fmt.Fprintln(buf, c.BmPrefix.Dump(" "))
268 }
269 for i := 0; i < len(c.Codes); i += opcodeSize(InstOp(c.Codes[i])) {
270 fmt.Fprintln(buf, c.OpcodeDescription(i))
271 }
272
273 return buf.String()
274}
diff --git a/vendor/github.com/dlclark/regexp2/syntax/escape.go b/vendor/github.com/dlclark/regexp2/syntax/escape.go
new file mode 100644
index 0000000..609df10
--- /dev/null
+++ b/vendor/github.com/dlclark/regexp2/syntax/escape.go
@@ -0,0 +1,94 @@
1package syntax
2
3import (
4 "bytes"
5 "strconv"
6 "strings"
7 "unicode"
8)
9
10func Escape(input string) string {
11 b := &bytes.Buffer{}
12 for _, r := range input {
13 escape(b, r, false)
14 }
15 return b.String()
16}
17
18const meta = `\.+*?()|[]{}^$# `
19
20func escape(b *bytes.Buffer, r rune, force bool) {
21 if unicode.IsPrint(r) {
22 if strings.IndexRune(meta, r) >= 0 || force {
23 b.WriteRune('\\')
24 }
25 b.WriteRune(r)
26 return
27 }
28
29 switch r {
30 case '\a':
31 b.WriteString(`\a`)
32 case '\f':
33 b.WriteString(`\f`)
34 case '\n':
35 b.WriteString(`\n`)
36 case '\r':
37 b.WriteString(`\r`)
38 case '\t':
39 b.WriteString(`\t`)
40 case '\v':
41 b.WriteString(`\v`)
42 default:
43 if r < 0x100 {
44 b.WriteString(`\x`)
45 s := strconv.FormatInt(int64(r), 16)
46 if len(s) == 1 {
47 b.WriteRune('0')
48 }
49 b.WriteString(s)
50 break
51 }
52 b.WriteString(`\u`)
53 b.WriteString(strconv.FormatInt(int64(r), 16))
54 }
55}
56
57func Unescape(input string) (string, error) {
58 idx := strings.IndexRune(input, '\\')
59 // no slashes means no unescape needed
60 if idx == -1 {
61 return input, nil
62 }
63
64 buf := bytes.NewBufferString(input[:idx])
65 // get the runes for the rest of the string -- we're going full parser scan on this
66
67 p := parser{}
68 p.setPattern(input[idx+1:])
69 for {
70 if p.rightMost() {
71 return "", p.getErr(ErrIllegalEndEscape)
72 }
73 r, err := p.scanCharEscape()
74 if err != nil {
75 return "", err
76 }
77 buf.WriteRune(r)
78 // are we done?
79 if p.rightMost() {
80 return buf.String(), nil
81 }
82
83 r = p.moveRightGetChar()
84 for r != '\\' {
85 buf.WriteRune(r)
86 if p.rightMost() {
87 // we're done, no more slashes
88 return buf.String(), nil
89 }
90 // keep scanning until we get another slash
91 r = p.moveRightGetChar()
92 }
93 }
94}
diff --git a/vendor/github.com/dlclark/regexp2/syntax/fuzz.go b/vendor/github.com/dlclark/regexp2/syntax/fuzz.go
new file mode 100644
index 0000000..ee86386
--- /dev/null
+++ b/vendor/github.com/dlclark/regexp2/syntax/fuzz.go
@@ -0,0 +1,20 @@
1// +build gofuzz
2
3package syntax
4
5// Fuzz is the input point for go-fuzz
6func Fuzz(data []byte) int {
7 sdata := string(data)
8 tree, err := Parse(sdata, RegexOptions(0))
9 if err != nil {
10 return 0
11 }
12
13 // translate it to code
14 _, err = Write(tree)
15 if err != nil {
16 panic(err)
17 }
18
19 return 1
20}
diff --git a/vendor/github.com/dlclark/regexp2/syntax/parser.go b/vendor/github.com/dlclark/regexp2/syntax/parser.go
new file mode 100644
index 0000000..9dc6e31
--- /dev/null
+++ b/vendor/github.com/dlclark/regexp2/syntax/parser.go
@@ -0,0 +1,2251 @@
1package syntax
2
3import (
4 "fmt"
5 "math"
6 "os"
7 "sort"
8 "strconv"
9 "unicode"
10)
11
12type RegexOptions int32
13
14const (
15 IgnoreCase RegexOptions = 0x0001 // "i"
16 Multiline = 0x0002 // "m"
17 ExplicitCapture = 0x0004 // "n"
18 Compiled = 0x0008 // "c"
19 Singleline = 0x0010 // "s"
20 IgnorePatternWhitespace = 0x0020 // "x"
21 RightToLeft = 0x0040 // "r"
22 Debug = 0x0080 // "d"
23 ECMAScript = 0x0100 // "e"
24 RE2 = 0x0200 // RE2 compat mode
25 Unicode = 0x0400 // "u"
26)
27
28func optionFromCode(ch rune) RegexOptions {
29 // case-insensitive
30 switch ch {
31 case 'i', 'I':
32 return IgnoreCase
33 case 'r', 'R':
34 return RightToLeft
35 case 'm', 'M':
36 return Multiline
37 case 'n', 'N':
38 return ExplicitCapture
39 case 's', 'S':
40 return Singleline
41 case 'x', 'X':
42 return IgnorePatternWhitespace
43 case 'd', 'D':
44 return Debug
45 case 'e', 'E':
46 return ECMAScript
47 case 'u', 'U':
48 return Unicode
49 default:
50 return 0
51 }
52}
53
54// An Error describes a failure to parse a regular expression
55// and gives the offending expression.
56type Error struct {
57 Code ErrorCode
58 Expr string
59 Args []interface{}
60}
61
62func (e *Error) Error() string {
63 if len(e.Args) == 0 {
64 return "error parsing regexp: " + e.Code.String() + " in `" + e.Expr + "`"
65 }
66 return "error parsing regexp: " + fmt.Sprintf(e.Code.String(), e.Args...) + " in `" + e.Expr + "`"
67}
68
69// An ErrorCode describes a failure to parse a regular expression.
70type ErrorCode string
71
72const (
73 // internal issue
74 ErrInternalError ErrorCode = "regexp/syntax: internal error"
75 // Parser errors
76 ErrUnterminatedComment = "unterminated comment"
77 ErrInvalidCharRange = "invalid character class range"
78 ErrInvalidRepeatSize = "invalid repeat count"
79 ErrInvalidUTF8 = "invalid UTF-8"
80 ErrCaptureGroupOutOfRange = "capture group number out of range"
81 ErrUnexpectedParen = "unexpected )"
82 ErrMissingParen = "missing closing )"
83 ErrMissingBrace = "missing closing }"
84 ErrInvalidRepeatOp = "invalid nested repetition operator"
85 ErrMissingRepeatArgument = "missing argument to repetition operator"
86 ErrConditionalExpression = "illegal conditional (?(...)) expression"
87 ErrTooManyAlternates = "too many | in (?()|)"
88 ErrUnrecognizedGrouping = "unrecognized grouping construct: (%v"
89 ErrInvalidGroupName = "invalid group name: group names must begin with a word character and have a matching terminator"
90 ErrCapNumNotZero = "capture number cannot be zero"
91 ErrUndefinedBackRef = "reference to undefined group number %v"
92 ErrUndefinedNameRef = "reference to undefined group name %v"
93 ErrAlternationCantCapture = "alternation conditions do not capture and cannot be named"
94 ErrAlternationCantHaveComment = "alternation conditions cannot be comments"
95 ErrMalformedReference = "(?(%v) ) malformed"
96 ErrUndefinedReference = "(?(%v) ) reference to undefined group"
97 ErrIllegalEndEscape = "illegal \\ at end of pattern"
98 ErrMalformedSlashP = "malformed \\p{X} character escape"
99 ErrIncompleteSlashP = "incomplete \\p{X} character escape"
100 ErrUnknownSlashP = "unknown unicode category, script, or property '%v'"
101 ErrUnrecognizedEscape = "unrecognized escape sequence \\%v"
102 ErrMissingControl = "missing control character"
103 ErrUnrecognizedControl = "unrecognized control character"
104 ErrTooFewHex = "insufficient hexadecimal digits"
105 ErrInvalidHex = "hex values may not be larger than 0x10FFFF"
106 ErrMalformedNameRef = "malformed \\k<...> named back reference"
107 ErrBadClassInCharRange = "cannot include class \\%v in character range"
108 ErrUnterminatedBracket = "unterminated [] set"
109 ErrSubtractionMustBeLast = "a subtraction must be the last element in a character class"
110 ErrReversedCharRange = "[%c-%c] range in reverse order"
111)
112
113func (e ErrorCode) String() string {
114 return string(e)
115}
116
117type parser struct {
118 stack *regexNode
119 group *regexNode
120 alternation *regexNode
121 concatenation *regexNode
122 unit *regexNode
123
124 patternRaw string
125 pattern []rune
126
127 currentPos int
128 specialCase *unicode.SpecialCase
129
130 autocap int
131 capcount int
132 captop int
133 capsize int
134
135 caps map[int]int
136 capnames map[string]int
137
138 capnumlist []int
139 capnamelist []string
140
141 options RegexOptions
142 optionsStack []RegexOptions
143 ignoreNextParen bool
144}
145
146const (
147 maxValueDiv10 int = math.MaxInt32 / 10
148 maxValueMod10 = math.MaxInt32 % 10
149)
150
151// Parse converts a regex string into a parse tree
152func Parse(re string, op RegexOptions) (*RegexTree, error) {
153 p := parser{
154 options: op,
155 caps: make(map[int]int),
156 }
157 p.setPattern(re)
158
159 if err := p.countCaptures(); err != nil {
160 return nil, err
161 }
162
163 p.reset(op)
164 root, err := p.scanRegex()
165
166 if err != nil {
167 return nil, err
168 }
169 tree := &RegexTree{
170 root: root,
171 caps: p.caps,
172 capnumlist: p.capnumlist,
173 captop: p.captop,
174 Capnames: p.capnames,
175 Caplist: p.capnamelist,
176 options: op,
177 }
178
179 if tree.options&Debug > 0 {
180 os.Stdout.WriteString(tree.Dump())
181 }
182
183 return tree, nil
184}
185
186func (p *parser) setPattern(pattern string) {
187 p.patternRaw = pattern
188 p.pattern = make([]rune, 0, len(pattern))
189
190 //populate our rune array to handle utf8 encoding
191 for _, r := range pattern {
192 p.pattern = append(p.pattern, r)
193 }
194}
195func (p *parser) getErr(code ErrorCode, args ...interface{}) error {
196 return &Error{Code: code, Expr: p.patternRaw, Args: args}
197}
198
199func (p *parser) noteCaptureSlot(i, pos int) {
200 if _, ok := p.caps[i]; !ok {
201 // the rhs of the hashtable isn't used in the parser
202 p.caps[i] = pos
203 p.capcount++
204
205 if p.captop <= i {
206 if i == math.MaxInt32 {
207 p.captop = i
208 } else {
209 p.captop = i + 1
210 }
211 }
212 }
213}
214
215func (p *parser) noteCaptureName(name string, pos int) {
216 if p.capnames == nil {
217 p.capnames = make(map[string]int)
218 }
219
220 if _, ok := p.capnames[name]; !ok {
221 p.capnames[name] = pos
222 p.capnamelist = append(p.capnamelist, name)
223 }
224}
225
226func (p *parser) assignNameSlots() {
227 if p.capnames != nil {
228 for _, name := range p.capnamelist {
229 for p.isCaptureSlot(p.autocap) {
230 p.autocap++
231 }
232 pos := p.capnames[name]
233 p.capnames[name] = p.autocap
234 p.noteCaptureSlot(p.autocap, pos)
235
236 p.autocap++
237 }
238 }
239
240 // if the caps array has at least one gap, construct the list of used slots
241 if p.capcount < p.captop {
242 p.capnumlist = make([]int, p.capcount)
243 i := 0
244
245 for k := range p.caps {
246 p.capnumlist[i] = k
247 i++
248 }
249
250 sort.Ints(p.capnumlist)
251 }
252
253 // merge capsnumlist into capnamelist
254 if p.capnames != nil || p.capnumlist != nil {
255 var oldcapnamelist []string
256 var next int
257 var k int
258
259 if p.capnames == nil {
260 oldcapnamelist = nil
261 p.capnames = make(map[string]int)
262 p.capnamelist = []string{}
263 next = -1
264 } else {
265 oldcapnamelist = p.capnamelist
266 p.capnamelist = []string{}
267 next = p.capnames[oldcapnamelist[0]]
268 }
269
270 for i := 0; i < p.capcount; i++ {
271 j := i
272 if p.capnumlist != nil {
273 j = p.capnumlist[i]
274 }
275
276 if next == j {
277 p.capnamelist = append(p.capnamelist, oldcapnamelist[k])
278 k++
279
280 if k == len(oldcapnamelist) {
281 next = -1
282 } else {
283 next = p.capnames[oldcapnamelist[k]]
284 }
285
286 } else {
287 //feature: culture?
288 str := strconv.Itoa(j)
289 p.capnamelist = append(p.capnamelist, str)
290 p.capnames[str] = j
291 }
292 }
293 }
294}
295
296func (p *parser) consumeAutocap() int {
297 r := p.autocap
298 p.autocap++
299 return r
300}
301
302// CountCaptures is a prescanner for deducing the slots used for
303// captures by doing a partial tokenization of the pattern.
304func (p *parser) countCaptures() error {
305 var ch rune
306
307 p.noteCaptureSlot(0, 0)
308
309 p.autocap = 1
310
311 for p.charsRight() > 0 {
312 pos := p.textpos()
313 ch = p.moveRightGetChar()
314 switch ch {
315 case '\\':
316 if p.charsRight() > 0 {
317 p.scanBackslash(true)
318 }
319
320 case '#':
321 if p.useOptionX() {
322 p.moveLeft()
323 p.scanBlank()
324 }
325
326 case '[':
327 p.scanCharSet(false, true)
328
329 case ')':
330 if !p.emptyOptionsStack() {
331 p.popOptions()
332 }
333
334 case '(':
335 if p.charsRight() >= 2 && p.rightChar(1) == '#' && p.rightChar(0) == '?' {
336 p.moveLeft()
337 p.scanBlank()
338 } else {
339 p.pushOptions()
340 if p.charsRight() > 0 && p.rightChar(0) == '?' {
341 // we have (?...
342 p.moveRight(1)
343
344 if p.charsRight() > 1 && (p.rightChar(0) == '<' || p.rightChar(0) == '\'') {
345 // named group: (?<... or (?'...
346
347 p.moveRight(1)
348 ch = p.rightChar(0)
349
350 if ch != '0' && IsWordChar(ch) {
351 if ch >= '1' && ch <= '9' {
352 dec, err := p.scanDecimal()
353 if err != nil {
354 return err
355 }
356 p.noteCaptureSlot(dec, pos)
357 } else {
358 p.noteCaptureName(p.scanCapname(), pos)
359 }
360 }
361 } else if p.useRE2() && p.charsRight() > 2 && (p.rightChar(0) == 'P' && p.rightChar(1) == '<') {
362 // RE2-compat (?P<)
363 p.moveRight(2)
364 ch = p.rightChar(0)
365 if IsWordChar(ch) {
366 p.noteCaptureName(p.scanCapname(), pos)
367 }
368
369 } else {
370 // (?...
371
372 // get the options if it's an option construct (?cimsx-cimsx...)
373 p.scanOptions()
374
375 if p.charsRight() > 0 {
376 if p.rightChar(0) == ')' {
377 // (?cimsx-cimsx)
378 p.moveRight(1)
379 p.popKeepOptions()
380 } else if p.rightChar(0) == '(' {
381 // alternation construct: (?(foo)yes|no)
382 // ignore the next paren so we don't capture the condition
383 p.ignoreNextParen = true
384
385 // break from here so we don't reset ignoreNextParen
386 continue
387 }
388 }
389 }
390 } else {
391 if !p.useOptionN() && !p.ignoreNextParen {
392 p.noteCaptureSlot(p.consumeAutocap(), pos)
393 }
394 }
395 }
396
397 p.ignoreNextParen = false
398
399 }
400 }
401
402 p.assignNameSlots()
403 return nil
404}
405
406func (p *parser) reset(topopts RegexOptions) {
407 p.currentPos = 0
408 p.autocap = 1
409 p.ignoreNextParen = false
410
411 if len(p.optionsStack) > 0 {
412 p.optionsStack = p.optionsStack[:0]
413 }
414
415 p.options = topopts
416 p.stack = nil
417}
418
419func (p *parser) scanRegex() (*regexNode, error) {
420 ch := '@' // nonspecial ch, means at beginning
421 isQuant := false
422
423 p.startGroup(newRegexNodeMN(ntCapture, p.options, 0, -1))
424
425 for p.charsRight() > 0 {
426 wasPrevQuantifier := isQuant
427 isQuant = false
428
429 if err := p.scanBlank(); err != nil {
430 return nil, err
431 }
432
433 startpos := p.textpos()
434
435 // move past all of the normal characters. We'll stop when we hit some kind of control character,
436 // or if IgnorePatternWhiteSpace is on, we'll stop when we see some whitespace.
437 if p.useOptionX() {
438 for p.charsRight() > 0 {
439 ch = p.rightChar(0)
440 //UGLY: clean up, this is ugly
441 if !(!isStopperX(ch) || (ch == '{' && !p.isTrueQuantifier())) {
442 break
443 }
444 p.moveRight(1)
445 }
446 } else {
447 for p.charsRight() > 0 {
448 ch = p.rightChar(0)
449 if !(!isSpecial(ch) || ch == '{' && !p.isTrueQuantifier()) {
450 break
451 }
452 p.moveRight(1)
453 }
454 }
455
456 endpos := p.textpos()
457
458 p.scanBlank()
459
460 if p.charsRight() == 0 {
461 ch = '!' // nonspecial, means at end
462 } else if ch = p.rightChar(0); isSpecial(ch) {
463 isQuant = isQuantifier(ch)
464 p.moveRight(1)
465 } else {
466 ch = ' ' // nonspecial, means at ordinary char
467 }
468
469 if startpos < endpos {
470 cchUnquantified := endpos - startpos
471 if isQuant {
472 cchUnquantified--
473 }
474 wasPrevQuantifier = false
475
476 if cchUnquantified > 0 {
477 p.addToConcatenate(startpos, cchUnquantified, false)
478 }
479
480 if isQuant {
481 p.addUnitOne(p.charAt(endpos - 1))
482 }
483 }
484
485 switch ch {
486 case '!':
487 goto BreakOuterScan
488
489 case ' ':
490 goto ContinueOuterScan
491
492 case '[':
493 cc, err := p.scanCharSet(p.useOptionI(), false)
494 if err != nil {
495 return nil, err
496 }
497 p.addUnitSet(cc)
498
499 case '(':
500 p.pushOptions()
501
502 if grouper, err := p.scanGroupOpen(); err != nil {
503 return nil, err
504 } else if grouper == nil {
505 p.popKeepOptions()
506 } else {
507 p.pushGroup()
508 p.startGroup(grouper)
509 }
510
511 continue
512
513 case '|':
514 p.addAlternate()
515 goto ContinueOuterScan
516
517 case ')':
518 if p.emptyStack() {
519 return nil, p.getErr(ErrUnexpectedParen)
520 }
521
522 if err := p.addGroup(); err != nil {
523 return nil, err
524 }
525 if err := p.popGroup(); err != nil {
526 return nil, err
527 }
528 p.popOptions()
529
530 if p.unit == nil {
531 goto ContinueOuterScan
532 }
533
534 case '\\':
535 n, err := p.scanBackslash(false)
536 if err != nil {
537 return nil, err
538 }
539 p.addUnitNode(n)
540
541 case '^':
542 if p.useOptionM() {
543 p.addUnitType(ntBol)
544 } else {
545 p.addUnitType(ntBeginning)
546 }
547
548 case '$':
549 if p.useOptionM() {
550 p.addUnitType(ntEol)
551 } else {
552 p.addUnitType(ntEndZ)
553 }
554
555 case '.':
556 if p.useOptionE() {
557 p.addUnitSet(ECMAAnyClass())
558 } else if p.useOptionS() {
559 p.addUnitSet(AnyClass())
560 } else {
561 p.addUnitNotone('\n')
562 }
563
564 case '{', '*', '+', '?':
565 if p.unit == nil {
566 if wasPrevQuantifier {
567 return nil, p.getErr(ErrInvalidRepeatOp)
568 } else {
569 return nil, p.getErr(ErrMissingRepeatArgument)
570 }
571 }
572 p.moveLeft()
573
574 default:
575 return nil, p.getErr(ErrInternalError)
576 }
577
578 if err := p.scanBlank(); err != nil {
579 return nil, err
580 }
581
582 if p.charsRight() > 0 {
583 isQuant = p.isTrueQuantifier()
584 }
585 if p.charsRight() == 0 || !isQuant {
586 //maintain odd C# assignment order -- not sure if required, could clean up?
587 p.addConcatenate()
588 goto ContinueOuterScan
589 }
590
591 ch = p.moveRightGetChar()
592
593 // Handle quantifiers
594 for p.unit != nil {
595 var min, max int
596 var lazy bool
597
598 switch ch {
599 case '*':
600 min = 0
601 max = math.MaxInt32
602
603 case '?':
604 min = 0
605 max = 1
606
607 case '+':
608 min = 1
609 max = math.MaxInt32
610
611 case '{':
612 {
613 var err error
614 startpos = p.textpos()
615 if min, err = p.scanDecimal(); err != nil {
616 return nil, err
617 }
618 max = min
619 if startpos < p.textpos() {
620 if p.charsRight() > 0 && p.rightChar(0) == ',' {
621 p.moveRight(1)
622 if p.charsRight() == 0 || p.rightChar(0) == '}' {
623 max = math.MaxInt32
624 } else {
625 if max, err = p.scanDecimal(); err != nil {
626 return nil, err
627 }
628 }
629 }
630 }
631
632 if startpos == p.textpos() || p.charsRight() == 0 || p.moveRightGetChar() != '}' {
633 p.addConcatenate()
634 p.textto(startpos - 1)
635 goto ContinueOuterScan
636 }
637 }
638
639 default:
640 return nil, p.getErr(ErrInternalError)
641 }
642
643 if err := p.scanBlank(); err != nil {
644 return nil, err
645 }
646
647 if p.charsRight() == 0 || p.rightChar(0) != '?' {
648 lazy = false
649 } else {
650 p.moveRight(1)
651 lazy = true
652 }
653
654 if min > max {
655 return nil, p.getErr(ErrInvalidRepeatSize)
656 }
657
658 p.addConcatenate3(lazy, min, max)
659 }
660
661 ContinueOuterScan:
662 }
663
664BreakOuterScan:
665 ;
666
667 if !p.emptyStack() {
668 return nil, p.getErr(ErrMissingParen)
669 }
670
671 if err := p.addGroup(); err != nil {
672 return nil, err
673 }
674
675 return p.unit, nil
676
677}
678
679/*
680 * Simple parsing for replacement patterns
681 */
682func (p *parser) scanReplacement() (*regexNode, error) {
683 var c, startpos int
684
685 p.concatenation = newRegexNode(ntConcatenate, p.options)
686
687 for {
688 c = p.charsRight()
689 if c == 0 {
690 break
691 }
692
693 startpos = p.textpos()
694
695 for c > 0 && p.rightChar(0) != '$' {
696 p.moveRight(1)
697 c--
698 }
699
700 p.addToConcatenate(startpos, p.textpos()-startpos, true)
701
702 if c > 0 {
703 if p.moveRightGetChar() == '$' {
704 n, err := p.scanDollar()
705 if err != nil {
706 return nil, err
707 }
708 p.addUnitNode(n)
709 }
710 p.addConcatenate()
711 }
712 }
713
714 return p.concatenation, nil
715}
716
717/*
718 * Scans $ patterns recognized within replacement patterns
719 */
720func (p *parser) scanDollar() (*regexNode, error) {
721 if p.charsRight() == 0 {
722 return newRegexNodeCh(ntOne, p.options, '$'), nil
723 }
724
725 ch := p.rightChar(0)
726 angled := false
727 backpos := p.textpos()
728 lastEndPos := backpos
729
730 // Note angle
731
732 if ch == '{' && p.charsRight() > 1 {
733 angled = true
734 p.moveRight(1)
735 ch = p.rightChar(0)
736 }
737
738 // Try to parse backreference: \1 or \{1} or \{cap}
739
740 if ch >= '0' && ch <= '9' {
741 if !angled && p.useOptionE() {
742 capnum := -1
743 newcapnum := int(ch - '0')
744 p.moveRight(1)
745 if p.isCaptureSlot(newcapnum) {
746 capnum = newcapnum
747 lastEndPos = p.textpos()
748 }
749
750 for p.charsRight() > 0 {
751 ch = p.rightChar(0)
752 if ch < '0' || ch > '9' {
753 break
754 }
755 digit := int(ch - '0')
756 if newcapnum > maxValueDiv10 || (newcapnum == maxValueDiv10 && digit > maxValueMod10) {
757 return nil, p.getErr(ErrCaptureGroupOutOfRange)
758 }
759
760 newcapnum = newcapnum*10 + digit
761
762 p.moveRight(1)
763 if p.isCaptureSlot(newcapnum) {
764 capnum = newcapnum
765 lastEndPos = p.textpos()
766 }
767 }
768 p.textto(lastEndPos)
769 if capnum >= 0 {
770 return newRegexNodeM(ntRef, p.options, capnum), nil
771 }
772 } else {
773 capnum, err := p.scanDecimal()
774 if err != nil {
775 return nil, err
776 }
777 if !angled || p.charsRight() > 0 && p.moveRightGetChar() == '}' {
778 if p.isCaptureSlot(capnum) {
779 return newRegexNodeM(ntRef, p.options, capnum), nil
780 }
781 }
782 }
783 } else if angled && IsWordChar(ch) {
784 capname := p.scanCapname()
785
786 if p.charsRight() > 0 && p.moveRightGetChar() == '}' {
787 if p.isCaptureName(capname) {
788 return newRegexNodeM(ntRef, p.options, p.captureSlotFromName(capname)), nil
789 }
790 }
791 } else if !angled {
792 capnum := 1
793
794 switch ch {
795 case '$':
796 p.moveRight(1)
797 return newRegexNodeCh(ntOne, p.options, '$'), nil
798 case '&':
799 capnum = 0
800 case '`':
801 capnum = replaceLeftPortion
802 case '\'':
803 capnum = replaceRightPortion
804 case '+':
805 capnum = replaceLastGroup
806 case '_':
807 capnum = replaceWholeString
808 }
809
810 if capnum != 1 {
811 p.moveRight(1)
812 return newRegexNodeM(ntRef, p.options, capnum), nil
813 }
814 }
815
816 // unrecognized $: literalize
817
818 p.textto(backpos)
819 return newRegexNodeCh(ntOne, p.options, '$'), nil
820}
821
822// scanGroupOpen scans chars following a '(' (not counting the '('), and returns
823// a RegexNode for the type of group scanned, or nil if the group
824// simply changed options (?cimsx-cimsx) or was a comment (#...).
825func (p *parser) scanGroupOpen() (*regexNode, error) {
826 var ch rune
827 var nt nodeType
828 var err error
829 close := '>'
830 start := p.textpos()
831
832 // just return a RegexNode if we have:
833 // 1. "(" followed by nothing
834 // 2. "(x" where x != ?
835 // 3. "(?)"
836 if p.charsRight() == 0 || p.rightChar(0) != '?' || (p.rightChar(0) == '?' && (p.charsRight() > 1 && p.rightChar(1) == ')')) {
837 if p.useOptionN() || p.ignoreNextParen {
838 p.ignoreNextParen = false
839 return newRegexNode(ntGroup, p.options), nil
840 }
841 return newRegexNodeMN(ntCapture, p.options, p.consumeAutocap(), -1), nil
842 }
843
844 p.moveRight(1)
845
846 for {
847 if p.charsRight() == 0 {
848 break
849 }
850
851 switch ch = p.moveRightGetChar(); ch {
852 case ':':
853 nt = ntGroup
854
855 case '=':
856 p.options &= ^RightToLeft
857 nt = ntRequire
858
859 case '!':
860 p.options &= ^RightToLeft
861 nt = ntPrevent
862
863 case '>':
864 nt = ntGreedy
865
866 case '\'':
867 close = '\''
868 fallthrough
869
870 case '<':
871 if p.charsRight() == 0 {
872 goto BreakRecognize
873 }
874
875 switch ch = p.moveRightGetChar(); ch {
876 case '=':
877 if close == '\'' {
878 goto BreakRecognize
879 }
880
881 p.options |= RightToLeft
882 nt = ntRequire
883
884 case '!':
885 if close == '\'' {
886 goto BreakRecognize
887 }
888
889 p.options |= RightToLeft
890 nt = ntPrevent
891
892 default:
893 p.moveLeft()
894 capnum := -1
895 uncapnum := -1
896 proceed := false
897
898 // grab part before -
899
900 if ch >= '0' && ch <= '9' {
901 if capnum, err = p.scanDecimal(); err != nil {
902 return nil, err
903 }
904
905 if !p.isCaptureSlot(capnum) {
906 capnum = -1
907 }
908
909 // check if we have bogus characters after the number
910 if p.charsRight() > 0 && !(p.rightChar(0) == close || p.rightChar(0) == '-') {
911 return nil, p.getErr(ErrInvalidGroupName)
912 }
913 if capnum == 0 {
914 return nil, p.getErr(ErrCapNumNotZero)
915 }
916 } else if IsWordChar(ch) {
917 capname := p.scanCapname()
918
919 if p.isCaptureName(capname) {
920 capnum = p.captureSlotFromName(capname)
921 }
922
923 // check if we have bogus character after the name
924 if p.charsRight() > 0 && !(p.rightChar(0) == close || p.rightChar(0) == '-') {
925 return nil, p.getErr(ErrInvalidGroupName)
926 }
927 } else if ch == '-' {
928 proceed = true
929 } else {
930 // bad group name - starts with something other than a word character and isn't a number
931 return nil, p.getErr(ErrInvalidGroupName)
932 }
933
934 // grab part after - if any
935
936 if (capnum != -1 || proceed == true) && p.charsRight() > 0 && p.rightChar(0) == '-' {
937 p.moveRight(1)
938
939 //no more chars left, no closing char, etc
940 if p.charsRight() == 0 {
941 return nil, p.getErr(ErrInvalidGroupName)
942 }
943
944 ch = p.rightChar(0)
945 if ch >= '0' && ch <= '9' {
946 if uncapnum, err = p.scanDecimal(); err != nil {
947 return nil, err
948 }
949
950 if !p.isCaptureSlot(uncapnum) {
951 return nil, p.getErr(ErrUndefinedBackRef, uncapnum)
952 }
953
954 // check if we have bogus characters after the number
955 if p.charsRight() > 0 && p.rightChar(0) != close {
956 return nil, p.getErr(ErrInvalidGroupName)
957 }
958 } else if IsWordChar(ch) {
959 uncapname := p.scanCapname()
960
961 if !p.isCaptureName(uncapname) {
962 return nil, p.getErr(ErrUndefinedNameRef, uncapname)
963 }
964 uncapnum = p.captureSlotFromName(uncapname)
965
966 // check if we have bogus character after the name
967 if p.charsRight() > 0 && p.rightChar(0) != close {
968 return nil, p.getErr(ErrInvalidGroupName)
969 }
970 } else {
971 // bad group name - starts with something other than a word character and isn't a number
972 return nil, p.getErr(ErrInvalidGroupName)
973 }
974 }
975
976 // actually make the node
977
978 if (capnum != -1 || uncapnum != -1) && p.charsRight() > 0 && p.moveRightGetChar() == close {
979 return newRegexNodeMN(ntCapture, p.options, capnum, uncapnum), nil
980 }
981 goto BreakRecognize
982 }
983
984 case '(':
985 // alternation construct (?(...) | )
986
987 parenPos := p.textpos()
988 if p.charsRight() > 0 {
989 ch = p.rightChar(0)
990
991 // check if the alternation condition is a backref
992 if ch >= '0' && ch <= '9' {
993 var capnum int
994 if capnum, err = p.scanDecimal(); err != nil {
995 return nil, err
996 }
997 if p.charsRight() > 0 && p.moveRightGetChar() == ')' {
998 if p.isCaptureSlot(capnum) {
999 return newRegexNodeM(ntTestref, p.options, capnum), nil
1000 }
1001 return nil, p.getErr(ErrUndefinedReference, capnum)
1002 }
1003
1004 return nil, p.getErr(ErrMalformedReference, capnum)
1005
1006 } else if IsWordChar(ch) {
1007 capname := p.scanCapname()
1008
1009 if p.isCaptureName(capname) && p.charsRight() > 0 && p.moveRightGetChar() == ')' {
1010 return newRegexNodeM(ntTestref, p.options, p.captureSlotFromName(capname)), nil
1011 }
1012 }
1013 }
1014 // not a backref
1015 nt = ntTestgroup
1016 p.textto(parenPos - 1) // jump to the start of the parentheses
1017 p.ignoreNextParen = true // but make sure we don't try to capture the insides
1018
1019 charsRight := p.charsRight()
1020 if charsRight >= 3 && p.rightChar(1) == '?' {
1021 rightchar2 := p.rightChar(2)
1022 // disallow comments in the condition
1023 if rightchar2 == '#' {
1024 return nil, p.getErr(ErrAlternationCantHaveComment)
1025 }
1026
1027 // disallow named capture group (?<..>..) in the condition
1028 if rightchar2 == '\'' {
1029 return nil, p.getErr(ErrAlternationCantCapture)
1030 }
1031
1032 if charsRight >= 4 && (rightchar2 == '<' && p.rightChar(3) != '!' && p.rightChar(3) != '=') {
1033 return nil, p.getErr(ErrAlternationCantCapture)
1034 }
1035 }
1036
1037 case 'P':
1038 if p.useRE2() {
1039 // support for P<name> syntax
1040 if p.charsRight() < 3 {
1041 goto BreakRecognize
1042 }
1043
1044 ch = p.moveRightGetChar()
1045 if ch != '<' {
1046 goto BreakRecognize
1047 }
1048
1049 ch = p.moveRightGetChar()
1050 p.moveLeft()
1051
1052 if IsWordChar(ch) {
1053 capnum := -1
1054 capname := p.scanCapname()
1055
1056 if p.isCaptureName(capname) {
1057 capnum = p.captureSlotFromName(capname)
1058 }
1059
1060 // check if we have bogus character after the name
1061 if p.charsRight() > 0 && p.rightChar(0) != '>' {
1062 return nil, p.getErr(ErrInvalidGroupName)
1063 }
1064
1065 // actually make the node
1066
1067 if capnum != -1 && p.charsRight() > 0 && p.moveRightGetChar() == '>' {
1068 return newRegexNodeMN(ntCapture, p.options, capnum, -1), nil
1069 }
1070 goto BreakRecognize
1071
1072 } else {
1073 // bad group name - starts with something other than a word character and isn't a number
1074 return nil, p.getErr(ErrInvalidGroupName)
1075 }
1076 }
1077 // if we're not using RE2 compat mode then
1078 // we just behave like normal
1079 fallthrough
1080
1081 default:
1082 p.moveLeft()
1083
1084 nt = ntGroup
1085 // disallow options in the children of a testgroup node
1086 if p.group.t != ntTestgroup {
1087 p.scanOptions()
1088 }
1089 if p.charsRight() == 0 {
1090 goto BreakRecognize
1091 }
1092
1093 if ch = p.moveRightGetChar(); ch == ')' {
1094 return nil, nil
1095 }
1096
1097 if ch != ':' {
1098 goto BreakRecognize
1099 }
1100
1101 }
1102
1103 return newRegexNode(nt, p.options), nil
1104 }
1105
1106BreakRecognize:
1107
1108 // break Recognize comes here
1109
1110 return nil, p.getErr(ErrUnrecognizedGrouping, string(p.pattern[start:p.textpos()]))
1111}
1112
1113// scans backslash specials and basics
1114func (p *parser) scanBackslash(scanOnly bool) (*regexNode, error) {
1115
1116 if p.charsRight() == 0 {
1117 return nil, p.getErr(ErrIllegalEndEscape)
1118 }
1119
1120 switch ch := p.rightChar(0); ch {
1121 case 'b', 'B', 'A', 'G', 'Z', 'z':
1122 p.moveRight(1)
1123 return newRegexNode(p.typeFromCode(ch), p.options), nil
1124
1125 case 'w':
1126 p.moveRight(1)
1127 if p.useOptionE() || p.useRE2() {
1128 return newRegexNodeSet(ntSet, p.options, ECMAWordClass()), nil
1129 }
1130 return newRegexNodeSet(ntSet, p.options, WordClass()), nil
1131
1132 case 'W':
1133 p.moveRight(1)
1134 if p.useOptionE() || p.useRE2() {
1135 return newRegexNodeSet(ntSet, p.options, NotECMAWordClass()), nil
1136 }
1137 return newRegexNodeSet(ntSet, p.options, NotWordClass()), nil
1138
1139 case 's':
1140 p.moveRight(1)
1141 if p.useOptionE() {
1142 return newRegexNodeSet(ntSet, p.options, ECMASpaceClass()), nil
1143 } else if p.useRE2() {
1144 return newRegexNodeSet(ntSet, p.options, RE2SpaceClass()), nil
1145 }
1146 return newRegexNodeSet(ntSet, p.options, SpaceClass()), nil
1147
1148 case 'S':
1149 p.moveRight(1)
1150 if p.useOptionE() {
1151 return newRegexNodeSet(ntSet, p.options, NotECMASpaceClass()), nil
1152 } else if p.useRE2() {
1153 return newRegexNodeSet(ntSet, p.options, NotRE2SpaceClass()), nil
1154 }
1155 return newRegexNodeSet(ntSet, p.options, NotSpaceClass()), nil
1156
1157 case 'd':
1158 p.moveRight(1)
1159 if p.useOptionE() || p.useRE2() {
1160 return newRegexNodeSet(ntSet, p.options, ECMADigitClass()), nil
1161 }
1162 return newRegexNodeSet(ntSet, p.options, DigitClass()), nil
1163
1164 case 'D':
1165 p.moveRight(1)
1166 if p.useOptionE() || p.useRE2() {
1167 return newRegexNodeSet(ntSet, p.options, NotECMADigitClass()), nil
1168 }
1169 return newRegexNodeSet(ntSet, p.options, NotDigitClass()), nil
1170
1171 case 'p', 'P':
1172 p.moveRight(1)
1173 prop, err := p.parseProperty()
1174 if err != nil {
1175 return nil, err
1176 }
1177 cc := &CharSet{}
1178 cc.addCategory(prop, (ch != 'p'), p.useOptionI(), p.patternRaw)
1179 if p.useOptionI() {
1180 cc.addLowercase()
1181 }
1182
1183 return newRegexNodeSet(ntSet, p.options, cc), nil
1184
1185 default:
1186 return p.scanBasicBackslash(scanOnly)
1187 }
1188}
1189
1190// Scans \-style backreferences and character escapes
1191func (p *parser) scanBasicBackslash(scanOnly bool) (*regexNode, error) {
1192 if p.charsRight() == 0 {
1193 return nil, p.getErr(ErrIllegalEndEscape)
1194 }
1195 angled := false
1196 k := false
1197 close := '\x00'
1198
1199 backpos := p.textpos()
1200 ch := p.rightChar(0)
1201
1202 // Allow \k<foo> instead of \<foo>, which is now deprecated.
1203
1204 // According to ECMAScript specification, \k<name> is only parsed as a named group reference if
1205 // there is at least one group name in the regexp.
1206 // See https://www.ecma-international.org/ecma-262/#sec-isvalidregularexpressionliteral, step 7.
1207 // Note, during the first (scanOnly) run we may not have all group names scanned, but that's ok.
1208 if ch == 'k' && (!p.useOptionE() || len(p.capnames) > 0) {
1209 if p.charsRight() >= 2 {
1210 p.moveRight(1)
1211 ch = p.moveRightGetChar()
1212
1213 if ch == '<' || (!p.useOptionE() && ch == '\'') { // No support for \k'name' in ECMAScript
1214 angled = true
1215 if ch == '\'' {
1216 close = '\''
1217 } else {
1218 close = '>'
1219 }
1220 }
1221 }
1222
1223 if !angled || p.charsRight() <= 0 {
1224 return nil, p.getErr(ErrMalformedNameRef)
1225 }
1226
1227 ch = p.rightChar(0)
1228 k = true
1229
1230 } else if !p.useOptionE() && (ch == '<' || ch == '\'') && p.charsRight() > 1 { // Note angle without \g
1231 angled = true
1232 if ch == '\'' {
1233 close = '\''
1234 } else {
1235 close = '>'
1236 }
1237
1238 p.moveRight(1)
1239 ch = p.rightChar(0)
1240 }
1241
1242 // Try to parse backreference: \<1> or \<cap>
1243
1244 if angled && ch >= '0' && ch <= '9' {
1245 capnum, err := p.scanDecimal()
1246 if err != nil {
1247 return nil, err
1248 }
1249
1250 if p.charsRight() > 0 && p.moveRightGetChar() == close {
1251 if p.isCaptureSlot(capnum) {
1252 return newRegexNodeM(ntRef, p.options, capnum), nil
1253 }
1254 return nil, p.getErr(ErrUndefinedBackRef, capnum)
1255 }
1256 } else if !angled && ch >= '1' && ch <= '9' { // Try to parse backreference or octal: \1
1257 capnum, err := p.scanDecimal()
1258 if err != nil {
1259 return nil, err
1260 }
1261
1262 if scanOnly {
1263 return nil, nil
1264 }
1265
1266 if p.isCaptureSlot(capnum) {
1267 return newRegexNodeM(ntRef, p.options, capnum), nil
1268 }
1269 if capnum <= 9 && !p.useOptionE() {
1270 return nil, p.getErr(ErrUndefinedBackRef, capnum)
1271 }
1272
1273 } else if angled {
1274 capname := p.scanCapname()
1275
1276 if capname != "" && p.charsRight() > 0 && p.moveRightGetChar() == close {
1277
1278 if scanOnly {
1279 return nil, nil
1280 }
1281
1282 if p.isCaptureName(capname) {
1283 return newRegexNodeM(ntRef, p.options, p.captureSlotFromName(capname)), nil
1284 }
1285 return nil, p.getErr(ErrUndefinedNameRef, capname)
1286 } else {
1287 if k {
1288 return nil, p.getErr(ErrMalformedNameRef)
1289 }
1290 }
1291 }
1292
1293 // Not backreference: must be char code
1294
1295 p.textto(backpos)
1296 ch, err := p.scanCharEscape()
1297 if err != nil {
1298 return nil, err
1299 }
1300
1301 if scanOnly {
1302 return nil, nil
1303 }
1304
1305 if p.useOptionI() {
1306 ch = unicode.ToLower(ch)
1307 }
1308
1309 return newRegexNodeCh(ntOne, p.options, ch), nil
1310}
1311
1312// Scans X for \p{X} or \P{X}
1313func (p *parser) parseProperty() (string, error) {
1314 if p.charsRight() < 3 {
1315 return "", p.getErr(ErrIncompleteSlashP)
1316 }
1317 ch := p.moveRightGetChar()
1318 if ch != '{' {
1319 return "", p.getErr(ErrMalformedSlashP)
1320 }
1321
1322 startpos := p.textpos()
1323 for p.charsRight() > 0 {
1324 ch = p.moveRightGetChar()
1325 if !(IsWordChar(ch) || ch == '-') {
1326 p.moveLeft()
1327 break
1328 }
1329 }
1330 capname := string(p.pattern[startpos:p.textpos()])
1331
1332 if p.charsRight() == 0 || p.moveRightGetChar() != '}' {
1333 return "", p.getErr(ErrIncompleteSlashP)
1334 }
1335
1336 if !isValidUnicodeCat(capname) {
1337 return "", p.getErr(ErrUnknownSlashP, capname)
1338 }
1339
1340 return capname, nil
1341}
1342
1343// Returns ReNode type for zero-length assertions with a \ code.
1344func (p *parser) typeFromCode(ch rune) nodeType {
1345 switch ch {
1346 case 'b':
1347 if p.useOptionE() {
1348 return ntECMABoundary
1349 }
1350 return ntBoundary
1351 case 'B':
1352 if p.useOptionE() {
1353 return ntNonECMABoundary
1354 }
1355 return ntNonboundary
1356 case 'A':
1357 return ntBeginning
1358 case 'G':
1359 return ntStart
1360 case 'Z':
1361 return ntEndZ
1362 case 'z':
1363 return ntEnd
1364 default:
1365 return ntNothing
1366 }
1367}
1368
1369// Scans whitespace or x-mode comments.
1370func (p *parser) scanBlank() error {
1371 if p.useOptionX() {
1372 for {
1373 for p.charsRight() > 0 && isSpace(p.rightChar(0)) {
1374 p.moveRight(1)
1375 }
1376
1377 if p.charsRight() == 0 {
1378 break
1379 }
1380
1381 if p.rightChar(0) == '#' {
1382 for p.charsRight() > 0 && p.rightChar(0) != '\n' {
1383 p.moveRight(1)
1384 }
1385 } else if p.charsRight() >= 3 && p.rightChar(2) == '#' &&
1386 p.rightChar(1) == '?' && p.rightChar(0) == '(' {
1387 for p.charsRight() > 0 && p.rightChar(0) != ')' {
1388 p.moveRight(1)
1389 }
1390 if p.charsRight() == 0 {
1391 return p.getErr(ErrUnterminatedComment)
1392 }
1393 p.moveRight(1)
1394 } else {
1395 break
1396 }
1397 }
1398 } else {
1399 for {
1400 if p.charsRight() < 3 || p.rightChar(2) != '#' ||
1401 p.rightChar(1) != '?' || p.rightChar(0) != '(' {
1402 return nil
1403 }
1404
1405 for p.charsRight() > 0 && p.rightChar(0) != ')' {
1406 p.moveRight(1)
1407 }
1408 if p.charsRight() == 0 {
1409 return p.getErr(ErrUnterminatedComment)
1410 }
1411 p.moveRight(1)
1412 }
1413 }
1414 return nil
1415}
1416
1417func (p *parser) scanCapname() string {
1418 startpos := p.textpos()
1419
1420 for p.charsRight() > 0 {
1421 if !IsWordChar(p.moveRightGetChar()) {
1422 p.moveLeft()
1423 break
1424 }
1425 }
1426
1427 return string(p.pattern[startpos:p.textpos()])
1428}
1429
1430//Scans contents of [] (not including []'s), and converts to a set.
1431func (p *parser) scanCharSet(caseInsensitive, scanOnly bool) (*CharSet, error) {
1432 ch := '\x00'
1433 chPrev := '\x00'
1434 inRange := false
1435 firstChar := true
1436 closed := false
1437
1438 var cc *CharSet
1439 if !scanOnly {
1440 cc = &CharSet{}
1441 }
1442
1443 if p.charsRight() > 0 && p.rightChar(0) == '^' {
1444 p.moveRight(1)
1445 if !scanOnly {
1446 cc.negate = true
1447 }
1448 }
1449
1450 for ; p.charsRight() > 0; firstChar = false {
1451 fTranslatedChar := false
1452 ch = p.moveRightGetChar()
1453 if ch == ']' {
1454 if !firstChar {
1455 closed = true
1456 break
1457 } else if p.useOptionE() {
1458 if !scanOnly {
1459 cc.addRanges(NoneClass().ranges)
1460 }
1461 closed = true
1462 break
1463 }
1464
1465 } else if ch == '\\' && p.charsRight() > 0 {
1466 switch ch = p.moveRightGetChar(); ch {
1467 case 'D', 'd':
1468 if !scanOnly {
1469 if inRange {
1470 return nil, p.getErr(ErrBadClassInCharRange, ch)
1471 }
1472 cc.addDigit(p.useOptionE() || p.useRE2(), ch == 'D', p.patternRaw)
1473 }
1474 continue
1475
1476 case 'S', 's':
1477 if !scanOnly {
1478 if inRange {
1479 return nil, p.getErr(ErrBadClassInCharRange, ch)
1480 }
1481 cc.addSpace(p.useOptionE(), p.useRE2(), ch == 'S')
1482 }
1483 continue
1484
1485 case 'W', 'w':
1486 if !scanOnly {
1487 if inRange {
1488 return nil, p.getErr(ErrBadClassInCharRange, ch)
1489 }
1490
1491 cc.addWord(p.useOptionE() || p.useRE2(), ch == 'W')
1492 }
1493 continue
1494
1495 case 'p', 'P':
1496 if !scanOnly {
1497 if inRange {
1498 return nil, p.getErr(ErrBadClassInCharRange, ch)
1499 }
1500 prop, err := p.parseProperty()
1501 if err != nil {
1502 return nil, err
1503 }
1504 cc.addCategory(prop, (ch != 'p'), caseInsensitive, p.patternRaw)
1505 } else {
1506 p.parseProperty()
1507 }
1508
1509 continue
1510
1511 case '-':
1512 if !scanOnly {
1513 cc.addRange(ch, ch)
1514 }
1515 continue
1516
1517 default:
1518 p.moveLeft()
1519 var err error
1520 ch, err = p.scanCharEscape() // non-literal character
1521 if err != nil {
1522 return nil, err
1523 }
1524 fTranslatedChar = true
1525 break // this break will only break out of the switch
1526 }
1527 } else if ch == '[' {
1528 // This is code for Posix style properties - [:Ll:] or [:IsTibetan:].
1529 // It currently doesn't do anything other than skip the whole thing!
1530 if p.charsRight() > 0 && p.rightChar(0) == ':' && !inRange {
1531 savePos := p.textpos()
1532
1533 p.moveRight(1)
1534 negate := false
1535 if p.charsRight() > 1 && p.rightChar(0) == '^' {
1536 negate = true
1537 p.moveRight(1)
1538 }
1539
1540 nm := p.scanCapname() // snag the name
1541 if !scanOnly && p.useRE2() {
1542 // look up the name since these are valid for RE2
1543 // add the group based on the name
1544 if ok := cc.addNamedASCII(nm, negate); !ok {
1545 return nil, p.getErr(ErrInvalidCharRange)
1546 }
1547 }
1548 if p.charsRight() < 2 || p.moveRightGetChar() != ':' || p.moveRightGetChar() != ']' {
1549 p.textto(savePos)
1550 } else if p.useRE2() {
1551 // move on
1552 continue
1553 }
1554 }
1555 }
1556
1557 if inRange {
1558 inRange = false
1559 if !scanOnly {
1560 if ch == '[' && !fTranslatedChar && !firstChar {
1561 // We thought we were in a range, but we're actually starting a subtraction.
1562 // In that case, we'll add chPrev to our char class, skip the opening [, and
1563 // scan the new character class recursively.
1564 cc.addChar(chPrev)
1565 sub, err := p.scanCharSet(caseInsensitive, false)
1566 if err != nil {
1567 return nil, err
1568 }
1569 cc.addSubtraction(sub)
1570
1571 if p.charsRight() > 0 && p.rightChar(0) != ']' {
1572 return nil, p.getErr(ErrSubtractionMustBeLast)
1573 }
1574 } else {
1575 // a regular range, like a-z
1576 if chPrev > ch {
1577 return nil, p.getErr(ErrReversedCharRange, chPrev, ch)
1578 }
1579 cc.addRange(chPrev, ch)
1580 }
1581 }
1582 } else if p.charsRight() >= 2 && p.rightChar(0) == '-' && p.rightChar(1) != ']' {
1583 // this could be the start of a range
1584 chPrev = ch
1585 inRange = true
1586 p.moveRight(1)
1587 } else if p.charsRight() >= 1 && ch == '-' && !fTranslatedChar && p.rightChar(0) == '[' && !firstChar {
1588 // we aren't in a range, and now there is a subtraction. Usually this happens
1589 // only when a subtraction follows a range, like [a-z-[b]]
1590 if !scanOnly {
1591 p.moveRight(1)
1592 sub, err := p.scanCharSet(caseInsensitive, false)
1593 if err != nil {
1594 return nil, err
1595 }
1596 cc.addSubtraction(sub)
1597
1598 if p.charsRight() > 0 && p.rightChar(0) != ']' {
1599 return nil, p.getErr(ErrSubtractionMustBeLast)
1600 }
1601 } else {
1602 p.moveRight(1)
1603 p.scanCharSet(caseInsensitive, true)
1604 }
1605 } else {
1606 if !scanOnly {
1607 cc.addRange(ch, ch)
1608 }
1609 }
1610 }
1611
1612 if !closed {
1613 return nil, p.getErr(ErrUnterminatedBracket)
1614 }
1615
1616 if !scanOnly && caseInsensitive {
1617 cc.addLowercase()
1618 }
1619
1620 return cc, nil
1621}
1622
1623// Scans any number of decimal digits (pegs value at 2^31-1 if too large)
1624func (p *parser) scanDecimal() (int, error) {
1625 i := 0
1626 var d int
1627
1628 for p.charsRight() > 0 {
1629 d = int(p.rightChar(0) - '0')
1630 if d < 0 || d > 9 {
1631 break
1632 }
1633 p.moveRight(1)
1634
1635 if i > maxValueDiv10 || (i == maxValueDiv10 && d > maxValueMod10) {
1636 return 0, p.getErr(ErrCaptureGroupOutOfRange)
1637 }
1638
1639 i *= 10
1640 i += d
1641 }
1642
1643 return int(i), nil
1644}
1645
1646// Returns true for options allowed only at the top level
1647func isOnlyTopOption(option RegexOptions) bool {
1648 return option == RightToLeft || option == ECMAScript || option == RE2
1649}
1650
1651// Scans cimsx-cimsx option string, stops at the first unrecognized char.
1652func (p *parser) scanOptions() {
1653
1654 for off := false; p.charsRight() > 0; p.moveRight(1) {
1655 ch := p.rightChar(0)
1656
1657 if ch == '-' {
1658 off = true
1659 } else if ch == '+' {
1660 off = false
1661 } else {
1662 option := optionFromCode(ch)
1663 if option == 0 || isOnlyTopOption(option) {
1664 return
1665 }
1666
1667 if off {
1668 p.options &= ^option
1669 } else {
1670 p.options |= option
1671 }
1672 }
1673 }
1674}
1675
1676// Scans \ code for escape codes that map to single unicode chars.
1677func (p *parser) scanCharEscape() (r rune, err error) {
1678
1679 ch := p.moveRightGetChar()
1680
1681 if ch >= '0' && ch <= '7' {
1682 p.moveLeft()
1683 return p.scanOctal(), nil
1684 }
1685
1686 pos := p.textpos()
1687
1688 switch ch {
1689 case 'x':
1690 // support for \x{HEX} syntax from Perl and PCRE
1691 if p.charsRight() > 0 && p.rightChar(0) == '{' {
1692 if p.useOptionE() {
1693 return ch, nil
1694 }
1695 p.moveRight(1)
1696 return p.scanHexUntilBrace()
1697 } else {
1698 r, err = p.scanHex(2)
1699 }
1700 case 'u':
1701 // ECMAscript suppot \u{HEX} only if `u` is also set
1702 if p.useOptionE() && p.useOptionU() && p.charsRight() > 0 && p.rightChar(0) == '{' {
1703 p.moveRight(1)
1704 return p.scanHexUntilBrace()
1705 } else {
1706 r, err = p.scanHex(4)
1707 }
1708 case 'a':
1709 return '\u0007', nil
1710 case 'b':
1711 return '\b', nil
1712 case 'e':
1713 return '\u001B', nil
1714 case 'f':
1715 return '\f', nil
1716 case 'n':
1717 return '\n', nil
1718 case 'r':
1719 return '\r', nil
1720 case 't':
1721 return '\t', nil
1722 case 'v':
1723 return '\u000B', nil
1724 case 'c':
1725 r, err = p.scanControl()
1726 default:
1727 if !p.useOptionE() && !p.useRE2() && IsWordChar(ch) {
1728 return 0, p.getErr(ErrUnrecognizedEscape, string(ch))
1729 }
1730 return ch, nil
1731 }
1732 if err != nil && p.useOptionE() {
1733 p.textto(pos)
1734 return ch, nil
1735 }
1736 return
1737}
1738
1739// Grabs and converts an ascii control character
1740func (p *parser) scanControl() (rune, error) {
1741 if p.charsRight() <= 0 {
1742 return 0, p.getErr(ErrMissingControl)
1743 }
1744
1745 ch := p.moveRightGetChar()
1746
1747 // \ca interpreted as \cA
1748
1749 if ch >= 'a' && ch <= 'z' {
1750 ch = (ch - ('a' - 'A'))
1751 }
1752 ch = (ch - '@')
1753 if ch >= 0 && ch < ' ' {
1754 return ch, nil
1755 }
1756
1757 return 0, p.getErr(ErrUnrecognizedControl)
1758
1759}
1760
1761// Scan hex digits until we hit a closing brace.
1762// Non-hex digits, hex value too large for UTF-8, or running out of chars are errors
1763func (p *parser) scanHexUntilBrace() (rune, error) {
1764 // PCRE spec reads like unlimited hex digits are allowed, but unicode has a limit
1765 // so we can enforce that
1766 i := 0
1767 hasContent := false
1768
1769 for p.charsRight() > 0 {
1770 ch := p.moveRightGetChar()
1771 if ch == '}' {
1772 // hit our close brace, we're done here
1773 // prevent \x{}
1774 if !hasContent {
1775 return 0, p.getErr(ErrTooFewHex)
1776 }
1777 return rune(i), nil
1778 }
1779 hasContent = true
1780 // no brace needs to be hex digit
1781 d := hexDigit(ch)
1782 if d < 0 {
1783 return 0, p.getErr(ErrMissingBrace)
1784 }
1785
1786 i *= 0x10
1787 i += d
1788
1789 if i > unicode.MaxRune {
1790 return 0, p.getErr(ErrInvalidHex)
1791 }
1792 }
1793
1794 // we only make it here if we run out of digits without finding the brace
1795 return 0, p.getErr(ErrMissingBrace)
1796}
1797
1798// Scans exactly c hex digits (c=2 for \xFF, c=4 for \uFFFF)
1799func (p *parser) scanHex(c int) (rune, error) {
1800
1801 i := 0
1802
1803 if p.charsRight() >= c {
1804 for c > 0 {
1805 d := hexDigit(p.moveRightGetChar())
1806 if d < 0 {
1807 break
1808 }
1809 i *= 0x10
1810 i += d
1811 c--
1812 }
1813 }
1814
1815 if c > 0 {
1816 return 0, p.getErr(ErrTooFewHex)
1817 }
1818
1819 return rune(i), nil
1820}
1821
1822// Returns n <= 0xF for a hex digit.
1823func hexDigit(ch rune) int {
1824
1825 if d := uint(ch - '0'); d <= 9 {
1826 return int(d)
1827 }
1828
1829 if d := uint(ch - 'a'); d <= 5 {
1830 return int(d + 0xa)
1831 }
1832
1833 if d := uint(ch - 'A'); d <= 5 {
1834 return int(d + 0xa)
1835 }
1836
1837 return -1
1838}
1839
1840// Scans up to three octal digits (stops before exceeding 0377).
1841func (p *parser) scanOctal() rune {
1842 // Consume octal chars only up to 3 digits and value 0377
1843
1844 c := 3
1845
1846 if c > p.charsRight() {
1847 c = p.charsRight()
1848 }
1849
1850 //we know the first char is good because the caller had to check
1851 i := 0
1852 d := int(p.rightChar(0) - '0')
1853 for c > 0 && d <= 7 && d >= 0 {
1854 if i >= 0x20 && p.useOptionE() {
1855 break
1856 }
1857 i *= 8
1858 i += d
1859 c--
1860
1861 p.moveRight(1)
1862 if !p.rightMost() {
1863 d = int(p.rightChar(0) - '0')
1864 }
1865 }
1866
1867 // Octal codes only go up to 255. Any larger and the behavior that Perl follows
1868 // is simply to truncate the high bits.
1869 i &= 0xFF
1870
1871 return rune(i)
1872}
1873
1874// Returns the current parsing position.
1875func (p *parser) textpos() int {
1876 return p.currentPos
1877}
1878
1879// Zaps to a specific parsing position.
1880func (p *parser) textto(pos int) {
1881 p.currentPos = pos
1882}
1883
1884// Returns the char at the right of the current parsing position and advances to the right.
1885func (p *parser) moveRightGetChar() rune {
1886 ch := p.pattern[p.currentPos]
1887 p.currentPos++
1888 return ch
1889}
1890
1891// Moves the current position to the right.
1892func (p *parser) moveRight(i int) {
1893 // default would be 1
1894 p.currentPos += i
1895}
1896
1897// Moves the current parsing position one to the left.
1898func (p *parser) moveLeft() {
1899 p.currentPos--
1900}
1901
1902// Returns the char left of the current parsing position.
1903func (p *parser) charAt(i int) rune {
1904 return p.pattern[i]
1905}
1906
1907// Returns the char i chars right of the current parsing position.
1908func (p *parser) rightChar(i int) rune {
1909 // default would be 0
1910 return p.pattern[p.currentPos+i]
1911}
1912
1913// Number of characters to the right of the current parsing position.
1914func (p *parser) charsRight() int {
1915 return len(p.pattern) - p.currentPos
1916}
1917
1918func (p *parser) rightMost() bool {
1919 return p.currentPos == len(p.pattern)
1920}
1921
1922// Looks up the slot number for a given name
1923func (p *parser) captureSlotFromName(capname string) int {
1924 return p.capnames[capname]
1925}
1926
1927// True if the capture slot was noted
1928func (p *parser) isCaptureSlot(i int) bool {
1929 if p.caps != nil {
1930 _, ok := p.caps[i]
1931 return ok
1932 }
1933
1934 return (i >= 0 && i < p.capsize)
1935}
1936
1937// Looks up the slot number for a given name
1938func (p *parser) isCaptureName(capname string) bool {
1939 if p.capnames == nil {
1940 return false
1941 }
1942
1943 _, ok := p.capnames[capname]
1944 return ok
1945}
1946
1947// option shortcuts
1948
1949// True if N option disabling '(' autocapture is on.
1950func (p *parser) useOptionN() bool {
1951 return (p.options & ExplicitCapture) != 0
1952}
1953
1954// True if I option enabling case-insensitivity is on.
1955func (p *parser) useOptionI() bool {
1956 return (p.options & IgnoreCase) != 0
1957}
1958
1959// True if M option altering meaning of $ and ^ is on.
1960func (p *parser) useOptionM() bool {
1961 return (p.options & Multiline) != 0
1962}
1963
1964// True if S option altering meaning of . is on.
1965func (p *parser) useOptionS() bool {
1966 return (p.options & Singleline) != 0
1967}
1968
1969// True if X option enabling whitespace/comment mode is on.
1970func (p *parser) useOptionX() bool {
1971 return (p.options & IgnorePatternWhitespace) != 0
1972}
1973
1974// True if E option enabling ECMAScript behavior on.
1975func (p *parser) useOptionE() bool {
1976 return (p.options & ECMAScript) != 0
1977}
1978
1979// true to use RE2 compatibility parsing behavior.
1980func (p *parser) useRE2() bool {
1981 return (p.options & RE2) != 0
1982}
1983
1984// True if U option enabling ECMAScript's Unicode behavior on.
1985func (p *parser) useOptionU() bool {
1986 return (p.options & Unicode) != 0
1987}
1988
1989// True if options stack is empty.
1990func (p *parser) emptyOptionsStack() bool {
1991 return len(p.optionsStack) == 0
1992}
1993
1994// Finish the current quantifiable (when a quantifier is not found or is not possible)
1995func (p *parser) addConcatenate() {
1996 // The first (| inside a Testgroup group goes directly to the group
1997 p.concatenation.addChild(p.unit)
1998 p.unit = nil
1999}
2000
2001// Finish the current quantifiable (when a quantifier is found)
2002func (p *parser) addConcatenate3(lazy bool, min, max int) {
2003 p.concatenation.addChild(p.unit.makeQuantifier(lazy, min, max))
2004 p.unit = nil
2005}
2006
2007// Sets the current unit to a single char node
2008func (p *parser) addUnitOne(ch rune) {
2009 if p.useOptionI() {
2010 ch = unicode.ToLower(ch)
2011 }
2012
2013 p.unit = newRegexNodeCh(ntOne, p.options, ch)
2014}
2015
2016// Sets the current unit to a single inverse-char node
2017func (p *parser) addUnitNotone(ch rune) {
2018 if p.useOptionI() {
2019 ch = unicode.ToLower(ch)
2020 }
2021
2022 p.unit = newRegexNodeCh(ntNotone, p.options, ch)
2023}
2024
2025// Sets the current unit to a single set node
2026func (p *parser) addUnitSet(set *CharSet) {
2027 p.unit = newRegexNodeSet(ntSet, p.options, set)
2028}
2029
2030// Sets the current unit to a subtree
2031func (p *parser) addUnitNode(node *regexNode) {
2032 p.unit = node
2033}
2034
2035// Sets the current unit to an assertion of the specified type
2036func (p *parser) addUnitType(t nodeType) {
2037 p.unit = newRegexNode(t, p.options)
2038}
2039
2040// Finish the current group (in response to a ')' or end)
2041func (p *parser) addGroup() error {
2042 if p.group.t == ntTestgroup || p.group.t == ntTestref {
2043 p.group.addChild(p.concatenation.reverseLeft())
2044 if (p.group.t == ntTestref && len(p.group.children) > 2) || len(p.group.children) > 3 {
2045 return p.getErr(ErrTooManyAlternates)
2046 }
2047 } else {
2048 p.alternation.addChild(p.concatenation.reverseLeft())
2049 p.group.addChild(p.alternation)
2050 }
2051
2052 p.unit = p.group
2053 return nil
2054}
2055
2056// Pops the option stack, but keeps the current options unchanged.
2057func (p *parser) popKeepOptions() {
2058 lastIdx := len(p.optionsStack) - 1
2059 p.optionsStack = p.optionsStack[:lastIdx]
2060}
2061
2062// Recalls options from the stack.
2063func (p *parser) popOptions() {
2064 lastIdx := len(p.optionsStack) - 1
2065 // get the last item on the stack and then remove it by reslicing
2066 p.options = p.optionsStack[lastIdx]
2067 p.optionsStack = p.optionsStack[:lastIdx]
2068}
2069
2070// Saves options on a stack.
2071func (p *parser) pushOptions() {
2072 p.optionsStack = append(p.optionsStack, p.options)
2073}
2074
2075// Add a string to the last concatenate.
2076func (p *parser) addToConcatenate(pos, cch int, isReplacement bool) {
2077 var node *regexNode
2078
2079 if cch == 0 {
2080 return
2081 }
2082
2083 if cch > 1 {
2084 str := make([]rune, cch)
2085 copy(str, p.pattern[pos:pos+cch])
2086
2087 if p.useOptionI() && !isReplacement {
2088 // We do the ToLower character by character for consistency. With surrogate chars, doing
2089 // a ToLower on the entire string could actually change the surrogate pair. This is more correct
2090 // linguistically, but since Regex doesn't support surrogates, it's more important to be
2091 // consistent.
2092 for i := 0; i < len(str); i++ {
2093 str[i] = unicode.ToLower(str[i])
2094 }
2095 }
2096
2097 node = newRegexNodeStr(ntMulti, p.options, str)
2098 } else {
2099 ch := p.charAt(pos)
2100
2101 if p.useOptionI() && !isReplacement {
2102 ch = unicode.ToLower(ch)
2103 }
2104
2105 node = newRegexNodeCh(ntOne, p.options, ch)
2106 }
2107
2108 p.concatenation.addChild(node)
2109}
2110
2111// Push the parser state (in response to an open paren)
2112func (p *parser) pushGroup() {
2113 p.group.next = p.stack
2114 p.alternation.next = p.group
2115 p.concatenation.next = p.alternation
2116 p.stack = p.concatenation
2117}
2118
2119// Remember the pushed state (in response to a ')')
2120func (p *parser) popGroup() error {
2121 p.concatenation = p.stack
2122 p.alternation = p.concatenation.next
2123 p.group = p.alternation.next
2124 p.stack = p.group.next
2125
2126 // The first () inside a Testgroup group goes directly to the group
2127 if p.group.t == ntTestgroup && len(p.group.children) == 0 {
2128 if p.unit == nil {
2129 return p.getErr(ErrConditionalExpression)
2130 }
2131
2132 p.group.addChild(p.unit)
2133 p.unit = nil
2134 }
2135 return nil
2136}
2137
2138// True if the group stack is empty.
2139func (p *parser) emptyStack() bool {
2140 return p.stack == nil
2141}
2142
2143// Start a new round for the parser state (in response to an open paren or string start)
2144func (p *parser) startGroup(openGroup *regexNode) {
2145 p.group = openGroup
2146 p.alternation = newRegexNode(ntAlternate, p.options)
2147 p.concatenation = newRegexNode(ntConcatenate, p.options)
2148}
2149
2150// Finish the current concatenation (in response to a |)
2151func (p *parser) addAlternate() {
2152 // The | parts inside a Testgroup group go directly to the group
2153
2154 if p.group.t == ntTestgroup || p.group.t == ntTestref {
2155 p.group.addChild(p.concatenation.reverseLeft())
2156 } else {
2157 p.alternation.addChild(p.concatenation.reverseLeft())
2158 }
2159
2160 p.concatenation = newRegexNode(ntConcatenate, p.options)
2161}
2162
2163// For categorizing ascii characters.
2164
2165const (
2166 Q byte = 5 // quantifier
2167 S = 4 // ordinary stopper
2168 Z = 3 // ScanBlank stopper
2169 X = 2 // whitespace
2170 E = 1 // should be escaped
2171)
2172
2173var _category = []byte{
2174 //01 2 3 4 5 6 7 8 9 A B C D E F 0 1 2 3 4 5 6 7 8 9 A B C D E F
2175 0, 0, 0, 0, 0, 0, 0, 0, 0, X, X, X, X, X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
2176 // ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 5 6 7 8 9 : ; < = > ?
2177 X, 0, 0, Z, S, 0, 0, 0, S, S, Q, Q, 0, 0, S, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, Q,
2178 //@A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^ _
2179 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, S, S, 0, S, 0,
2180 //'a b c d e f g h i j k l m n o p q r s t u v w x y z { | } ~
2181 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, Q, S, 0, 0, 0,
2182}
2183
2184func isSpace(ch rune) bool {
2185 return (ch <= ' ' && _category[ch] == X)
2186}
2187
2188// Returns true for those characters that terminate a string of ordinary chars.
2189func isSpecial(ch rune) bool {
2190 return (ch <= '|' && _category[ch] >= S)
2191}
2192
2193// Returns true for those characters that terminate a string of ordinary chars.
2194func isStopperX(ch rune) bool {
2195 return (ch <= '|' && _category[ch] >= X)
2196}
2197
2198// Returns true for those characters that begin a quantifier.
2199func isQuantifier(ch rune) bool {
2200 return (ch <= '{' && _category[ch] >= Q)
2201}
2202
2203func (p *parser) isTrueQuantifier() bool {
2204 nChars := p.charsRight()
2205 if nChars == 0 {
2206 return false
2207 }
2208
2209 startpos := p.textpos()
2210 ch := p.charAt(startpos)
2211 if ch != '{' {
2212 return ch <= '{' && _category[ch] >= Q
2213 }
2214
2215 //UGLY: this is ugly -- the original code was ugly too
2216 pos := startpos
2217 for {
2218 nChars--
2219 if nChars <= 0 {
2220 break
2221 }
2222 pos++
2223 ch = p.charAt(pos)
2224 if ch < '0' || ch > '9' {
2225 break
2226 }
2227 }
2228
2229 if nChars == 0 || pos-startpos == 1 {
2230 return false
2231 }
2232 if ch == '}' {
2233 return true
2234 }
2235 if ch != ',' {
2236 return false
2237 }
2238 for {
2239 nChars--
2240 if nChars <= 0 {
2241 break
2242 }
2243 pos++
2244 ch = p.charAt(pos)
2245 if ch < '0' || ch > '9' {
2246 break
2247 }
2248 }
2249
2250 return nChars > 0 && ch == '}'
2251}
diff --git a/vendor/github.com/dlclark/regexp2/syntax/prefix.go b/vendor/github.com/dlclark/regexp2/syntax/prefix.go
new file mode 100644
index 0000000..f671688
--- /dev/null
+++ b/vendor/github.com/dlclark/regexp2/syntax/prefix.go
@@ -0,0 +1,896 @@
1package syntax
2
3import (
4 "bytes"
5 "fmt"
6 "strconv"
7 "unicode"
8 "unicode/utf8"
9)
10
11type Prefix struct {
12 PrefixStr []rune
13 PrefixSet CharSet
14 CaseInsensitive bool
15}
16
17// It takes a RegexTree and computes the set of chars that can start it.
18func getFirstCharsPrefix(tree *RegexTree) *Prefix {
19 s := regexFcd{
20 fcStack: make([]regexFc, 32),
21 intStack: make([]int, 32),
22 }
23 fc := s.regexFCFromRegexTree(tree)
24
25 if fc == nil || fc.nullable || fc.cc.IsEmpty() {
26 return nil
27 }
28 fcSet := fc.getFirstChars()
29 return &Prefix{PrefixSet: fcSet, CaseInsensitive: fc.caseInsensitive}
30}
31
32type regexFcd struct {
33 intStack []int
34 intDepth int
35 fcStack []regexFc
36 fcDepth int
37 skipAllChildren bool // don't process any more children at the current level
38 skipchild bool // don't process the current child.
39 failed bool
40}
41
42/*
43 * The main FC computation. It does a shortcutted depth-first walk
44 * through the tree and calls CalculateFC to emits code before
45 * and after each child of an interior node, and at each leaf.
46 */
47func (s *regexFcd) regexFCFromRegexTree(tree *RegexTree) *regexFc {
48 curNode := tree.root
49 curChild := 0
50
51 for {
52 if len(curNode.children) == 0 {
53 // This is a leaf node
54 s.calculateFC(curNode.t, curNode, 0)
55 } else if curChild < len(curNode.children) && !s.skipAllChildren {
56 // This is an interior node, and we have more children to analyze
57 s.calculateFC(curNode.t|beforeChild, curNode, curChild)
58
59 if !s.skipchild {
60 curNode = curNode.children[curChild]
61 // this stack is how we get a depth first walk of the tree.
62 s.pushInt(curChild)
63 curChild = 0
64 } else {
65 curChild++
66 s.skipchild = false
67 }
68 continue
69 }
70
71 // This is an interior node where we've finished analyzing all the children, or
72 // the end of a leaf node.
73 s.skipAllChildren = false
74
75 if s.intIsEmpty() {
76 break
77 }
78
79 curChild = s.popInt()
80 curNode = curNode.next
81
82 s.calculateFC(curNode.t|afterChild, curNode, curChild)
83 if s.failed {
84 return nil
85 }
86
87 curChild++
88 }
89
90 if s.fcIsEmpty() {
91 return nil
92 }
93
94 return s.popFC()
95}
96
97// To avoid recursion, we use a simple integer stack.
98// This is the push.
99func (s *regexFcd) pushInt(I int) {
100 if s.intDepth >= len(s.intStack) {
101 expanded := make([]int, s.intDepth*2)
102 copy(expanded, s.intStack)
103 s.intStack = expanded
104 }
105
106 s.intStack[s.intDepth] = I
107 s.intDepth++
108}
109
110// True if the stack is empty.
111func (s *regexFcd) intIsEmpty() bool {
112 return s.intDepth == 0
113}
114
115// This is the pop.
116func (s *regexFcd) popInt() int {
117 s.intDepth--
118 return s.intStack[s.intDepth]
119}
120
121// We also use a stack of RegexFC objects.
122// This is the push.
123func (s *regexFcd) pushFC(fc regexFc) {
124 if s.fcDepth >= len(s.fcStack) {
125 expanded := make([]regexFc, s.fcDepth*2)
126 copy(expanded, s.fcStack)
127 s.fcStack = expanded
128 }
129
130 s.fcStack[s.fcDepth] = fc
131 s.fcDepth++
132}
133
134// True if the stack is empty.
135func (s *regexFcd) fcIsEmpty() bool {
136 return s.fcDepth == 0
137}
138
139// This is the pop.
140func (s *regexFcd) popFC() *regexFc {
141 s.fcDepth--
142 return &s.fcStack[s.fcDepth]
143}
144
145// This is the top.
146func (s *regexFcd) topFC() *regexFc {
147 return &s.fcStack[s.fcDepth-1]
148}
149
150// Called in Beforechild to prevent further processing of the current child
151func (s *regexFcd) skipChild() {
152 s.skipchild = true
153}
154
155// FC computation and shortcut cases for each node type
156func (s *regexFcd) calculateFC(nt nodeType, node *regexNode, CurIndex int) {
157 //fmt.Printf("NodeType: %v, CurIndex: %v, Desc: %v\n", nt, CurIndex, node.description())
158 ci := false
159 rtl := false
160
161 if nt <= ntRef {
162 if (node.options & IgnoreCase) != 0 {
163 ci = true
164 }
165 if (node.options & RightToLeft) != 0 {
166 rtl = true
167 }
168 }
169
170 switch nt {
171 case ntConcatenate | beforeChild, ntAlternate | beforeChild, ntTestref | beforeChild, ntLoop | beforeChild, ntLazyloop | beforeChild:
172 break
173
174 case ntTestgroup | beforeChild:
175 if CurIndex == 0 {
176 s.skipChild()
177 }
178 break
179
180 case ntEmpty:
181 s.pushFC(regexFc{nullable: true})
182 break
183
184 case ntConcatenate | afterChild:
185 if CurIndex != 0 {
186 child := s.popFC()
187 cumul := s.topFC()
188
189 s.failed = !cumul.addFC(*child, true)
190 }
191
192 fc := s.topFC()
193 if !fc.nullable {
194 s.skipAllChildren = true
195 }
196 break
197
198 case ntTestgroup | afterChild:
199 if CurIndex > 1 {
200 child := s.popFC()
201 cumul := s.topFC()
202
203 s.failed = !cumul.addFC(*child, false)
204 }
205 break
206
207 case ntAlternate | afterChild, ntTestref | afterChild:
208 if CurIndex != 0 {
209 child := s.popFC()
210 cumul := s.topFC()
211
212 s.failed = !cumul.addFC(*child, false)
213 }
214 break
215
216 case ntLoop | afterChild, ntLazyloop | afterChild:
217 if node.m == 0 {
218 fc := s.topFC()
219 fc.nullable = true
220 }
221 break
222
223 case ntGroup | beforeChild, ntGroup | afterChild, ntCapture | beforeChild, ntCapture | afterChild, ntGreedy | beforeChild, ntGreedy | afterChild:
224 break
225
226 case ntRequire | beforeChild, ntPrevent | beforeChild:
227 s.skipChild()
228 s.pushFC(regexFc{nullable: true})
229 break
230
231 case ntRequire | afterChild, ntPrevent | afterChild:
232 break
233
234 case ntOne, ntNotone:
235 s.pushFC(newRegexFc(node.ch, nt == ntNotone, false, ci))
236 break
237
238 case ntOneloop, ntOnelazy:
239 s.pushFC(newRegexFc(node.ch, false, node.m == 0, ci))
240 break
241
242 case ntNotoneloop, ntNotonelazy:
243 s.pushFC(newRegexFc(node.ch, true, node.m == 0, ci))
244 break
245
246 case ntMulti:
247 if len(node.str) == 0 {
248 s.pushFC(regexFc{nullable: true})
249 } else if !rtl {
250 s.pushFC(newRegexFc(node.str[0], false, false, ci))
251 } else {
252 s.pushFC(newRegexFc(node.str[len(node.str)-1], false, false, ci))
253 }
254 break
255
256 case ntSet:
257 s.pushFC(regexFc{cc: node.set.Copy(), nullable: false, caseInsensitive: ci})
258 break
259
260 case ntSetloop, ntSetlazy:
261 s.pushFC(regexFc{cc: node.set.Copy(), nullable: node.m == 0, caseInsensitive: ci})
262 break
263
264 case ntRef:
265 s.pushFC(regexFc{cc: *AnyClass(), nullable: true, caseInsensitive: false})
266 break
267
268 case ntNothing, ntBol, ntEol, ntBoundary, ntNonboundary, ntECMABoundary, ntNonECMABoundary, ntBeginning, ntStart, ntEndZ, ntEnd:
269 s.pushFC(regexFc{nullable: true})
270 break
271
272 default:
273 panic(fmt.Sprintf("unexpected op code: %v", nt))
274 }
275}
276
277type regexFc struct {
278 cc CharSet
279 nullable bool
280 caseInsensitive bool
281}
282
283func newRegexFc(ch rune, not, nullable, caseInsensitive bool) regexFc {
284 r := regexFc{
285 caseInsensitive: caseInsensitive,
286 nullable: nullable,
287 }
288 if not {
289 if ch > 0 {
290 r.cc.addRange('\x00', ch-1)
291 }
292 if ch < 0xFFFF {
293 r.cc.addRange(ch+1, utf8.MaxRune)
294 }
295 } else {
296 r.cc.addRange(ch, ch)
297 }
298 return r
299}
300
301func (r *regexFc) getFirstChars() CharSet {
302 if r.caseInsensitive {
303 r.cc.addLowercase()
304 }
305
306 return r.cc
307}
308
309func (r *regexFc) addFC(fc regexFc, concatenate bool) bool {
310 if !r.cc.IsMergeable() || !fc.cc.IsMergeable() {
311 return false
312 }
313
314 if concatenate {
315 if !r.nullable {
316 return true
317 }
318
319 if !fc.nullable {
320 r.nullable = false
321 }
322 } else {
323 if fc.nullable {
324 r.nullable = true
325 }
326 }
327
328 r.caseInsensitive = r.caseInsensitive || fc.caseInsensitive
329 r.cc.addSet(fc.cc)
330
331 return true
332}
333
334// This is a related computation: it takes a RegexTree and computes the
335// leading substring if it sees one. It's quite trivial and gives up easily.
336func getPrefix(tree *RegexTree) *Prefix {
337 var concatNode *regexNode
338 nextChild := 0
339
340 curNode := tree.root
341
342 for {
343 switch curNode.t {
344 case ntConcatenate:
345 if len(curNode.children) > 0 {
346 concatNode = curNode
347 nextChild = 0
348 }
349
350 case ntGreedy, ntCapture:
351 curNode = curNode.children[0]
352 concatNode = nil
353 continue
354
355 case ntOneloop, ntOnelazy:
356 if curNode.m > 0 {
357 return &Prefix{
358 PrefixStr: repeat(curNode.ch, curNode.m),
359 CaseInsensitive: (curNode.options & IgnoreCase) != 0,
360 }
361 }
362 return nil
363
364 case ntOne:
365 return &Prefix{
366 PrefixStr: []rune{curNode.ch},
367 CaseInsensitive: (curNode.options & IgnoreCase) != 0,
368 }
369
370 case ntMulti:
371 return &Prefix{
372 PrefixStr: curNode.str,
373 CaseInsensitive: (curNode.options & IgnoreCase) != 0,
374 }
375
376 case ntBol, ntEol, ntBoundary, ntECMABoundary, ntBeginning, ntStart,
377 ntEndZ, ntEnd, ntEmpty, ntRequire, ntPrevent:
378
379 default:
380 return nil
381 }
382
383 if concatNode == nil || nextChild >= len(concatNode.children) {
384 return nil
385 }
386
387 curNode = concatNode.children[nextChild]
388 nextChild++
389 }
390}
391
392// repeat the rune r, c times... up to the max of MaxPrefixSize
393func repeat(r rune, c int) []rune {
394 if c > MaxPrefixSize {
395 c = MaxPrefixSize
396 }
397
398 ret := make([]rune, c)
399
400 // binary growth using copy for speed
401 ret[0] = r
402 bp := 1
403 for bp < len(ret) {
404 copy(ret[bp:], ret[:bp])
405 bp *= 2
406 }
407
408 return ret
409}
410
411// BmPrefix precomputes the Boyer-Moore
412// tables for fast string scanning. These tables allow
413// you to scan for the first occurrence of a string within
414// a large body of text without examining every character.
415// The performance of the heuristic depends on the actual
416// string and the text being searched, but usually, the longer
417// the string that is being searched for, the fewer characters
418// need to be examined.
419type BmPrefix struct {
420 positive []int
421 negativeASCII []int
422 negativeUnicode [][]int
423 pattern []rune
424 lowASCII rune
425 highASCII rune
426 rightToLeft bool
427 caseInsensitive bool
428}
429
430func newBmPrefix(pattern []rune, caseInsensitive, rightToLeft bool) *BmPrefix {
431
432 b := &BmPrefix{
433 rightToLeft: rightToLeft,
434 caseInsensitive: caseInsensitive,
435 pattern: pattern,
436 }
437
438 if caseInsensitive {
439 for i := 0; i < len(b.pattern); i++ {
440 // We do the ToLower character by character for consistency. With surrogate chars, doing
441 // a ToLower on the entire string could actually change the surrogate pair. This is more correct
442 // linguistically, but since Regex doesn't support surrogates, it's more important to be
443 // consistent.
444
445 b.pattern[i] = unicode.ToLower(b.pattern[i])
446 }
447 }
448
449 var beforefirst, last, bump int
450 var scan, match int
451
452 if !rightToLeft {
453 beforefirst = -1
454 last = len(b.pattern) - 1
455 bump = 1
456 } else {
457 beforefirst = len(b.pattern)
458 last = 0
459 bump = -1
460 }
461
462 // PART I - the good-suffix shift table
463 //
464 // compute the positive requirement:
465 // if char "i" is the first one from the right that doesn't match,
466 // then we know the matcher can advance by _positive[i].
467 //
468 // This algorithm is a simplified variant of the standard
469 // Boyer-Moore good suffix calculation.
470
471 b.positive = make([]int, len(b.pattern))
472
473 examine := last
474 ch := b.pattern[examine]
475 b.positive[examine] = bump
476 examine -= bump
477
478Outerloop:
479 for {
480 // find an internal char (examine) that matches the tail
481
482 for {
483 if examine == beforefirst {
484 break Outerloop
485 }
486 if b.pattern[examine] == ch {
487 break
488 }
489 examine -= bump
490 }
491
492 match = last
493 scan = examine
494
495 // find the length of the match
496 for {
497 if scan == beforefirst || b.pattern[match] != b.pattern[scan] {
498 // at the end of the match, note the difference in _positive
499 // this is not the length of the match, but the distance from the internal match
500 // to the tail suffix.
501 if b.positive[match] == 0 {
502 b.positive[match] = match - scan
503 }
504
505 // System.Diagnostics.Debug.WriteLine("Set positive[" + match + "] to " + (match - scan));
506
507 break
508 }
509
510 scan -= bump
511 match -= bump
512 }
513
514 examine -= bump
515 }
516
517 match = last - bump
518
519 // scan for the chars for which there are no shifts that yield a different candidate
520
521 // The inside of the if statement used to say
522 // "_positive[match] = last - beforefirst;"
523 // This is slightly less aggressive in how much we skip, but at worst it
524 // should mean a little more work rather than skipping a potential match.
525 for match != beforefirst {
526 if b.positive[match] == 0 {
527 b.positive[match] = bump
528 }
529
530 match -= bump
531 }
532
533 // PART II - the bad-character shift table
534 //
535 // compute the negative requirement:
536 // if char "ch" is the reject character when testing position "i",
537 // we can slide up by _negative[ch];
538 // (_negative[ch] = str.Length - 1 - str.LastIndexOf(ch))
539 //
540 // the lookup table is divided into ASCII and Unicode portions;
541 // only those parts of the Unicode 16-bit code set that actually
542 // appear in the string are in the table. (Maximum size with
543 // Unicode is 65K; ASCII only case is 512 bytes.)
544
545 b.negativeASCII = make([]int, 128)
546
547 for i := 0; i < len(b.negativeASCII); i++ {
548 b.negativeASCII[i] = last - beforefirst
549 }
550
551 b.lowASCII = 127
552 b.highASCII = 0
553
554 for examine = last; examine != beforefirst; examine -= bump {
555 ch = b.pattern[examine]
556
557 switch {
558 case ch < 128:
559 if b.lowASCII > ch {
560 b.lowASCII = ch
561 }
562
563 if b.highASCII < ch {
564 b.highASCII = ch
565 }
566
567 if b.negativeASCII[ch] == last-beforefirst {
568 b.negativeASCII[ch] = last - examine
569 }
570 case ch <= 0xffff:
571 i, j := ch>>8, ch&0xFF
572
573 if b.negativeUnicode == nil {
574 b.negativeUnicode = make([][]int, 256)
575 }
576
577 if b.negativeUnicode[i] == nil {
578 newarray := make([]int, 256)
579
580 for k := 0; k < len(newarray); k++ {
581 newarray[k] = last - beforefirst
582 }
583
584 if i == 0 {
585 copy(newarray, b.negativeASCII)
586 //TODO: this line needed?
587 b.negativeASCII = newarray
588 }
589
590 b.negativeUnicode[i] = newarray
591 }
592
593 if b.negativeUnicode[i][j] == last-beforefirst {
594 b.negativeUnicode[i][j] = last - examine
595 }
596 default:
597 // we can't do the filter because this algo doesn't support
598 // unicode chars >0xffff
599 return nil
600 }
601 }
602
603 return b
604}
605
606func (b *BmPrefix) String() string {
607 return string(b.pattern)
608}
609
610// Dump returns the contents of the filter as a human readable string
611func (b *BmPrefix) Dump(indent string) string {
612 buf := &bytes.Buffer{}
613
614 fmt.Fprintf(buf, "%sBM Pattern: %s\n%sPositive: ", indent, string(b.pattern), indent)
615 for i := 0; i < len(b.positive); i++ {
616 buf.WriteString(strconv.Itoa(b.positive[i]))
617 buf.WriteRune(' ')
618 }
619 buf.WriteRune('\n')
620
621 if b.negativeASCII != nil {
622 buf.WriteString(indent)
623 buf.WriteString("Negative table\n")
624 for i := 0; i < len(b.negativeASCII); i++ {
625 if b.negativeASCII[i] != len(b.pattern) {
626 fmt.Fprintf(buf, "%s %s %s\n", indent, Escape(string(rune(i))), strconv.Itoa(b.negativeASCII[i]))
627 }
628 }
629 }
630
631 return buf.String()
632}
633
634// Scan uses the Boyer-Moore algorithm to find the first occurrence
635// of the specified string within text, beginning at index, and
636// constrained within beglimit and endlimit.
637//
638// The direction and case-sensitivity of the match is determined
639// by the arguments to the RegexBoyerMoore constructor.
640func (b *BmPrefix) Scan(text []rune, index, beglimit, endlimit int) int {
641 var (
642 defadv, test, test2 int
643 match, startmatch, endmatch int
644 bump, advance int
645 chTest rune
646 unicodeLookup []int
647 )
648
649 if !b.rightToLeft {
650 defadv = len(b.pattern)
651 startmatch = len(b.pattern) - 1
652 endmatch = 0
653 test = index + defadv - 1
654 bump = 1
655 } else {
656 defadv = -len(b.pattern)
657 startmatch = 0
658 endmatch = -defadv - 1
659 test = index + defadv
660 bump = -1
661 }
662
663 chMatch := b.pattern[startmatch]
664
665 for {
666 if test >= endlimit || test < beglimit {
667 return -1
668 }
669
670 chTest = text[test]
671
672 if b.caseInsensitive {
673 chTest = unicode.ToLower(chTest)
674 }
675
676 if chTest != chMatch {
677 if chTest < 128 {
678 advance = b.negativeASCII[chTest]
679 } else if chTest < 0xffff && len(b.negativeUnicode) > 0 {
680 unicodeLookup = b.negativeUnicode[chTest>>8]
681 if len(unicodeLookup) > 0 {
682 advance = unicodeLookup[chTest&0xFF]
683 } else {
684 advance = defadv
685 }
686 } else {
687 advance = defadv
688 }
689
690 test += advance
691 } else { // if (chTest == chMatch)
692 test2 = test
693 match = startmatch
694
695 for {
696 if match == endmatch {
697 if b.rightToLeft {
698 return test2 + 1
699 } else {
700 return test2
701 }
702 }
703
704 match -= bump
705 test2 -= bump
706
707 chTest = text[test2]
708
709 if b.caseInsensitive {
710 chTest = unicode.ToLower(chTest)
711 }
712
713 if chTest != b.pattern[match] {
714 advance = b.positive[match]
715 if chTest < 128 {
716 test2 = (match - startmatch) + b.negativeASCII[chTest]
717 } else if chTest < 0xffff && len(b.negativeUnicode) > 0 {
718 unicodeLookup = b.negativeUnicode[chTest>>8]
719 if len(unicodeLookup) > 0 {
720 test2 = (match - startmatch) + unicodeLookup[chTest&0xFF]
721 } else {
722 test += advance
723 break
724 }
725 } else {
726 test += advance
727 break
728 }
729
730 if b.rightToLeft {
731 if test2 < advance {
732 advance = test2
733 }
734 } else if test2 > advance {
735 advance = test2
736 }
737
738 test += advance
739 break
740 }
741 }
742 }
743 }
744}
745
746// When a regex is anchored, we can do a quick IsMatch test instead of a Scan
747func (b *BmPrefix) IsMatch(text []rune, index, beglimit, endlimit int) bool {
748 if !b.rightToLeft {
749 if index < beglimit || endlimit-index < len(b.pattern) {
750 return false
751 }
752
753 return b.matchPattern(text, index)
754 } else {
755 if index > endlimit || index-beglimit < len(b.pattern) {
756 return false
757 }
758
759 return b.matchPattern(text, index-len(b.pattern))
760 }
761}
762
763func (b *BmPrefix) matchPattern(text []rune, index int) bool {
764 if len(text)-index < len(b.pattern) {
765 return false
766 }
767
768 if b.caseInsensitive {
769 for i := 0; i < len(b.pattern); i++ {
770 //Debug.Assert(textinfo.ToLower(_pattern[i]) == _pattern[i], "pattern should be converted to lower case in constructor!");
771 if unicode.ToLower(text[index+i]) != b.pattern[i] {
772 return false
773 }
774 }
775 return true
776 } else {
777 for i := 0; i < len(b.pattern); i++ {
778 if text[index+i] != b.pattern[i] {
779 return false
780 }
781 }
782 return true
783 }
784}
785
786type AnchorLoc int16
787
788// where the regex can be pegged
789const (
790 AnchorBeginning AnchorLoc = 0x0001
791 AnchorBol = 0x0002
792 AnchorStart = 0x0004
793 AnchorEol = 0x0008
794 AnchorEndZ = 0x0010
795 AnchorEnd = 0x0020
796 AnchorBoundary = 0x0040
797 AnchorECMABoundary = 0x0080
798)
799
800func getAnchors(tree *RegexTree) AnchorLoc {
801
802 var concatNode *regexNode
803 nextChild, result := 0, AnchorLoc(0)
804
805 curNode := tree.root
806
807 for {
808 switch curNode.t {
809 case ntConcatenate:
810 if len(curNode.children) > 0 {
811 concatNode = curNode
812 nextChild = 0
813 }
814
815 case ntGreedy, ntCapture:
816 curNode = curNode.children[0]
817 concatNode = nil
818 continue
819
820 case ntBol, ntEol, ntBoundary, ntECMABoundary, ntBeginning,
821 ntStart, ntEndZ, ntEnd:
822 return result | anchorFromType(curNode.t)
823
824 case ntEmpty, ntRequire, ntPrevent:
825
826 default:
827 return result
828 }
829
830 if concatNode == nil || nextChild >= len(concatNode.children) {
831 return result
832 }
833
834 curNode = concatNode.children[nextChild]
835 nextChild++
836 }
837}
838
839func anchorFromType(t nodeType) AnchorLoc {
840 switch t {
841 case ntBol:
842 return AnchorBol
843 case ntEol:
844 return AnchorEol
845 case ntBoundary:
846 return AnchorBoundary
847 case ntECMABoundary:
848 return AnchorECMABoundary
849 case ntBeginning:
850 return AnchorBeginning
851 case ntStart:
852 return AnchorStart
853 case ntEndZ:
854 return AnchorEndZ
855 case ntEnd:
856 return AnchorEnd
857 default:
858 return 0
859 }
860}
861
862// anchorDescription returns a human-readable description of the anchors
863func (anchors AnchorLoc) String() string {
864 buf := &bytes.Buffer{}
865
866 if 0 != (anchors & AnchorBeginning) {
867 buf.WriteString(", Beginning")
868 }
869 if 0 != (anchors & AnchorStart) {
870 buf.WriteString(", Start")
871 }
872 if 0 != (anchors & AnchorBol) {
873 buf.WriteString(", Bol")
874 }
875 if 0 != (anchors & AnchorBoundary) {
876 buf.WriteString(", Boundary")
877 }
878 if 0 != (anchors & AnchorECMABoundary) {
879 buf.WriteString(", ECMABoundary")
880 }
881 if 0 != (anchors & AnchorEol) {
882 buf.WriteString(", Eol")
883 }
884 if 0 != (anchors & AnchorEnd) {
885 buf.WriteString(", End")
886 }
887 if 0 != (anchors & AnchorEndZ) {
888 buf.WriteString(", EndZ")
889 }
890
891 // trim off comma
892 if buf.Len() >= 2 {
893 return buf.String()[2:]
894 }
895 return "None"
896}
diff --git a/vendor/github.com/dlclark/regexp2/syntax/replacerdata.go b/vendor/github.com/dlclark/regexp2/syntax/replacerdata.go
new file mode 100644
index 0000000..bcf4d3f
--- /dev/null
+++ b/vendor/github.com/dlclark/regexp2/syntax/replacerdata.go
@@ -0,0 +1,87 @@
1package syntax
2
3import (
4 "bytes"
5 "errors"
6)
7
8type ReplacerData struct {
9 Rep string
10 Strings []string
11 Rules []int
12}
13
14const (
15 replaceSpecials = 4
16 replaceLeftPortion = -1
17 replaceRightPortion = -2
18 replaceLastGroup = -3
19 replaceWholeString = -4
20)
21
22//ErrReplacementError is a general error during parsing the replacement text
23var ErrReplacementError = errors.New("Replacement pattern error.")
24
25// NewReplacerData will populate a reusable replacer data struct based on the given replacement string
26// and the capture group data from a regexp
27func NewReplacerData(rep string, caps map[int]int, capsize int, capnames map[string]int, op RegexOptions) (*ReplacerData, error) {
28 p := parser{
29 options: op,
30 caps: caps,
31 capsize: capsize,
32 capnames: capnames,
33 }
34 p.setPattern(rep)
35 concat, err := p.scanReplacement()
36 if err != nil {
37 return nil, err
38 }
39
40 if concat.t != ntConcatenate {
41 panic(ErrReplacementError)
42 }
43
44 sb := &bytes.Buffer{}
45 var (
46 strings []string
47 rules []int
48 )
49
50 for _, child := range concat.children {
51 switch child.t {
52 case ntMulti:
53 child.writeStrToBuf(sb)
54
55 case ntOne:
56 sb.WriteRune(child.ch)
57
58 case ntRef:
59 if sb.Len() > 0 {
60 rules = append(rules, len(strings))
61 strings = append(strings, sb.String())
62 sb.Reset()
63 }
64 slot := child.m
65
66 if len(caps) > 0 && slot >= 0 {
67 slot = caps[slot]
68 }
69
70 rules = append(rules, -replaceSpecials-1-slot)
71
72 default:
73 panic(ErrReplacementError)
74 }
75 }
76
77 if sb.Len() > 0 {
78 rules = append(rules, len(strings))
79 strings = append(strings, sb.String())
80 }
81
82 return &ReplacerData{
83 Rep: rep,
84 Strings: strings,
85 Rules: rules,
86 }, nil
87}
diff --git a/vendor/github.com/dlclark/regexp2/syntax/tree.go b/vendor/github.com/dlclark/regexp2/syntax/tree.go
new file mode 100644
index 0000000..ea28829
--- /dev/null
+++ b/vendor/github.com/dlclark/regexp2/syntax/tree.go
@@ -0,0 +1,654 @@
1package syntax
2
3import (
4 "bytes"
5 "fmt"
6 "math"
7 "strconv"
8)
9
10type RegexTree struct {
11 root *regexNode
12 caps map[int]int
13 capnumlist []int
14 captop int
15 Capnames map[string]int
16 Caplist []string
17 options RegexOptions
18}
19
20// It is built into a parsed tree for a regular expression.
21
22// Implementation notes:
23//
24// Since the node tree is a temporary data structure only used
25// during compilation of the regexp to integer codes, it's
26// designed for clarity and convenience rather than
27// space efficiency.
28//
29// RegexNodes are built into a tree, linked by the n.children list.
30// Each node also has a n.parent and n.ichild member indicating
31// its parent and which child # it is in its parent's list.
32//
33// RegexNodes come in as many types as there are constructs in
34// a regular expression, for example, "concatenate", "alternate",
35// "one", "rept", "group". There are also node types for basic
36// peephole optimizations, e.g., "onerep", "notsetrep", etc.
37//
38// Because perl 5 allows "lookback" groups that scan backwards,
39// each node also gets a "direction". Normally the value of
40// boolean n.backward = false.
41//
42// During parsing, top-level nodes are also stacked onto a parse
43// stack (a stack of trees). For this purpose we have a n.next
44// pointer. [Note that to save a few bytes, we could overload the
45// n.parent pointer instead.]
46//
47// On the parse stack, each tree has a "role" - basically, the
48// nonterminal in the grammar that the parser has currently
49// assigned to the tree. That code is stored in n.role.
50//
51// Finally, some of the different kinds of nodes have data.
52// Two integers (for the looping constructs) are stored in
53// n.operands, an an object (either a string or a set)
54// is stored in n.data
55type regexNode struct {
56 t nodeType
57 children []*regexNode
58 str []rune
59 set *CharSet
60 ch rune
61 m int
62 n int
63 options RegexOptions
64 next *regexNode
65}
66
67type nodeType int32
68
69const (
70 // The following are leaves, and correspond to primitive operations
71
72 ntOnerep nodeType = 0 // lef,back char,min,max a {n}
73 ntNotonerep = 1 // lef,back char,min,max .{n}
74 ntSetrep = 2 // lef,back set,min,max [\d]{n}
75 ntOneloop = 3 // lef,back char,min,max a {,n}
76 ntNotoneloop = 4 // lef,back char,min,max .{,n}
77 ntSetloop = 5 // lef,back set,min,max [\d]{,n}
78 ntOnelazy = 6 // lef,back char,min,max a {,n}?
79 ntNotonelazy = 7 // lef,back char,min,max .{,n}?
80 ntSetlazy = 8 // lef,back set,min,max [\d]{,n}?
81 ntOne = 9 // lef char a
82 ntNotone = 10 // lef char [^a]
83 ntSet = 11 // lef set [a-z\s] \w \s \d
84 ntMulti = 12 // lef string abcd
85 ntRef = 13 // lef group \#
86 ntBol = 14 // ^
87 ntEol = 15 // $
88 ntBoundary = 16 // \b
89 ntNonboundary = 17 // \B
90 ntBeginning = 18 // \A
91 ntStart = 19 // \G
92 ntEndZ = 20 // \Z
93 ntEnd = 21 // \Z
94
95 // Interior nodes do not correspond to primitive operations, but
96 // control structures compositing other operations
97
98 // Concat and alternate take n children, and can run forward or backwards
99
100 ntNothing = 22 // []
101 ntEmpty = 23 // ()
102 ntAlternate = 24 // a|b
103 ntConcatenate = 25 // ab
104 ntLoop = 26 // m,x * + ? {,}
105 ntLazyloop = 27 // m,x *? +? ?? {,}?
106 ntCapture = 28 // n ()
107 ntGroup = 29 // (?:)
108 ntRequire = 30 // (?=) (?<=)
109 ntPrevent = 31 // (?!) (?<!)
110 ntGreedy = 32 // (?>) (?<)
111 ntTestref = 33 // (?(n) | )
112 ntTestgroup = 34 // (?(...) | )
113
114 ntECMABoundary = 41 // \b
115 ntNonECMABoundary = 42 // \B
116)
117
118func newRegexNode(t nodeType, opt RegexOptions) *regexNode {
119 return &regexNode{
120 t: t,
121 options: opt,
122 }
123}
124
125func newRegexNodeCh(t nodeType, opt RegexOptions, ch rune) *regexNode {
126 return &regexNode{
127 t: t,
128 options: opt,
129 ch: ch,
130 }
131}
132
133func newRegexNodeStr(t nodeType, opt RegexOptions, str []rune) *regexNode {
134 return &regexNode{
135 t: t,
136 options: opt,
137 str: str,
138 }
139}
140
141func newRegexNodeSet(t nodeType, opt RegexOptions, set *CharSet) *regexNode {
142 return &regexNode{
143 t: t,
144 options: opt,
145 set: set,
146 }
147}
148
149func newRegexNodeM(t nodeType, opt RegexOptions, m int) *regexNode {
150 return &regexNode{
151 t: t,
152 options: opt,
153 m: m,
154 }
155}
156func newRegexNodeMN(t nodeType, opt RegexOptions, m, n int) *regexNode {
157 return &regexNode{
158 t: t,
159 options: opt,
160 m: m,
161 n: n,
162 }
163}
164
165func (n *regexNode) writeStrToBuf(buf *bytes.Buffer) {
166 for i := 0; i < len(n.str); i++ {
167 buf.WriteRune(n.str[i])
168 }
169}
170
171func (n *regexNode) addChild(child *regexNode) {
172 reduced := child.reduce()
173 n.children = append(n.children, reduced)
174 reduced.next = n
175}
176
177func (n *regexNode) insertChildren(afterIndex int, nodes []*regexNode) {
178 newChildren := make([]*regexNode, 0, len(n.children)+len(nodes))
179 n.children = append(append(append(newChildren, n.children[:afterIndex]...), nodes...), n.children[afterIndex:]...)
180}
181
182// removes children including the start but not the end index
183func (n *regexNode) removeChildren(startIndex, endIndex int) {
184 n.children = append(n.children[:startIndex], n.children[endIndex:]...)
185}
186
187// Pass type as OneLazy or OneLoop
188func (n *regexNode) makeRep(t nodeType, min, max int) {
189 n.t += (t - ntOne)
190 n.m = min
191 n.n = max
192}
193
194func (n *regexNode) reduce() *regexNode {
195 switch n.t {
196 case ntAlternate:
197 return n.reduceAlternation()
198
199 case ntConcatenate:
200 return n.reduceConcatenation()
201
202 case ntLoop, ntLazyloop:
203 return n.reduceRep()
204
205 case ntGroup:
206 return n.reduceGroup()
207
208 case ntSet, ntSetloop:
209 return n.reduceSet()
210
211 default:
212 return n
213 }
214}
215
216// Basic optimization. Single-letter alternations can be replaced
217// by faster set specifications, and nested alternations with no
218// intervening operators can be flattened:
219//
220// a|b|c|def|g|h -> [a-c]|def|[gh]
221// apple|(?:orange|pear)|grape -> apple|orange|pear|grape
222func (n *regexNode) reduceAlternation() *regexNode {
223 if len(n.children) == 0 {
224 return newRegexNode(ntNothing, n.options)
225 }
226
227 wasLastSet := false
228 lastNodeCannotMerge := false
229 var optionsLast RegexOptions
230 var i, j int
231
232 for i, j = 0, 0; i < len(n.children); i, j = i+1, j+1 {
233 at := n.children[i]
234
235 if j < i {
236 n.children[j] = at
237 }
238
239 for {
240 if at.t == ntAlternate {
241 for k := 0; k < len(at.children); k++ {
242 at.children[k].next = n
243 }
244 n.insertChildren(i+1, at.children)
245
246 j--
247 } else if at.t == ntSet || at.t == ntOne {
248 // Cannot merge sets if L or I options differ, or if either are negated.
249 optionsAt := at.options & (RightToLeft | IgnoreCase)
250
251 if at.t == ntSet {
252 if !wasLastSet || optionsLast != optionsAt || lastNodeCannotMerge || !at.set.IsMergeable() {
253 wasLastSet = true
254 lastNodeCannotMerge = !at.set.IsMergeable()
255 optionsLast = optionsAt
256 break
257 }
258 } else if !wasLastSet || optionsLast != optionsAt || lastNodeCannotMerge {
259 wasLastSet = true
260 lastNodeCannotMerge = false
261 optionsLast = optionsAt
262 break
263 }
264
265 // The last node was a Set or a One, we're a Set or One and our options are the same.
266 // Merge the two nodes.
267 j--
268 prev := n.children[j]
269
270 var prevCharClass *CharSet
271 if prev.t == ntOne {
272 prevCharClass = &CharSet{}
273 prevCharClass.addChar(prev.ch)
274 } else {
275 prevCharClass = prev.set
276 }
277
278 if at.t == ntOne {
279 prevCharClass.addChar(at.ch)
280 } else {
281 prevCharClass.addSet(*at.set)
282 }
283
284 prev.t = ntSet
285 prev.set = prevCharClass
286 } else if at.t == ntNothing {
287 j--
288 } else {
289 wasLastSet = false
290 lastNodeCannotMerge = false
291 }
292 break
293 }
294 }
295
296 if j < i {
297 n.removeChildren(j, i)
298 }
299
300 return n.stripEnation(ntNothing)
301}
302
303// Basic optimization. Adjacent strings can be concatenated.
304//
305// (?:abc)(?:def) -> abcdef
306func (n *regexNode) reduceConcatenation() *regexNode {
307 // Eliminate empties and concat adjacent strings/chars
308
309 var optionsLast RegexOptions
310 var optionsAt RegexOptions
311 var i, j int
312
313 if len(n.children) == 0 {
314 return newRegexNode(ntEmpty, n.options)
315 }
316
317 wasLastString := false
318
319 for i, j = 0, 0; i < len(n.children); i, j = i+1, j+1 {
320 var at, prev *regexNode
321
322 at = n.children[i]
323
324 if j < i {
325 n.children[j] = at
326 }
327
328 if at.t == ntConcatenate &&
329 ((at.options & RightToLeft) == (n.options & RightToLeft)) {
330 for k := 0; k < len(at.children); k++ {
331 at.children[k].next = n
332 }
333
334 //insert at.children at i+1 index in n.children
335 n.insertChildren(i+1, at.children)
336
337 j--
338 } else if at.t == ntMulti || at.t == ntOne {
339 // Cannot merge strings if L or I options differ
340 optionsAt = at.options & (RightToLeft | IgnoreCase)
341
342 if !wasLastString || optionsLast != optionsAt {
343 wasLastString = true
344 optionsLast = optionsAt
345 continue
346 }
347
348 j--
349 prev = n.children[j]
350
351 if prev.t == ntOne {
352 prev.t = ntMulti
353 prev.str = []rune{prev.ch}
354 }
355
356 if (optionsAt & RightToLeft) == 0 {
357 if at.t == ntOne {
358 prev.str = append(prev.str, at.ch)
359 } else {
360 prev.str = append(prev.str, at.str...)
361 }
362 } else {
363 if at.t == ntOne {
364 // insert at the front by expanding our slice, copying the data over, and then setting the value
365 prev.str = append(prev.str, 0)
366 copy(prev.str[1:], prev.str)
367 prev.str[0] = at.ch
368 } else {
369 //insert at the front...this one we'll make a new slice and copy both into it
370 merge := make([]rune, len(prev.str)+len(at.str))
371 copy(merge, at.str)
372 copy(merge[len(at.str):], prev.str)
373 prev.str = merge
374 }
375 }
376 } else if at.t == ntEmpty {
377 j--
378 } else {
379 wasLastString = false
380 }
381 }
382
383 if j < i {
384 // remove indices j through i from the children
385 n.removeChildren(j, i)
386 }
387
388 return n.stripEnation(ntEmpty)
389}
390
391// Nested repeaters just get multiplied with each other if they're not
392// too lumpy
393func (n *regexNode) reduceRep() *regexNode {
394
395 u := n
396 t := n.t
397 min := n.m
398 max := n.n
399
400 for {
401 if len(u.children) == 0 {
402 break
403 }
404
405 child := u.children[0]
406
407 // multiply reps of the same type only
408 if child.t != t {
409 childType := child.t
410
411 if !(childType >= ntOneloop && childType <= ntSetloop && t == ntLoop ||
412 childType >= ntOnelazy && childType <= ntSetlazy && t == ntLazyloop) {
413 break
414 }
415 }
416
417 // child can be too lumpy to blur, e.g., (a {100,105}) {3} or (a {2,})?
418 // [but things like (a {2,})+ are not too lumpy...]
419 if u.m == 0 && child.m > 1 || child.n < child.m*2 {
420 break
421 }
422
423 u = child
424 if u.m > 0 {
425 if (math.MaxInt32-1)/u.m < min {
426 u.m = math.MaxInt32
427 } else {
428 u.m = u.m * min
429 }
430 }
431 if u.n > 0 {
432 if (math.MaxInt32-1)/u.n < max {
433 u.n = math.MaxInt32
434 } else {
435 u.n = u.n * max
436 }
437 }
438 }
439
440 if math.MaxInt32 == min {
441 return newRegexNode(ntNothing, n.options)
442 }
443 return u
444
445}
446
447// Simple optimization. If a concatenation or alternation has only
448// one child strip out the intermediate node. If it has zero children,
449// turn it into an empty.
450func (n *regexNode) stripEnation(emptyType nodeType) *regexNode {
451 switch len(n.children) {
452 case 0:
453 return newRegexNode(emptyType, n.options)
454 case 1:
455 return n.children[0]
456 default:
457 return n
458 }
459}
460
461func (n *regexNode) reduceGroup() *regexNode {
462 u := n
463
464 for u.t == ntGroup {
465 u = u.children[0]
466 }
467
468 return u
469}
470
471// Simple optimization. If a set is a singleton, an inverse singleton,
472// or empty, it's transformed accordingly.
473func (n *regexNode) reduceSet() *regexNode {
474 // Extract empty-set, one and not-one case as special
475
476 if n.set == nil {
477 n.t = ntNothing
478 } else if n.set.IsSingleton() {
479 n.ch = n.set.SingletonChar()
480 n.set = nil
481 n.t += (ntOne - ntSet)
482 } else if n.set.IsSingletonInverse() {
483 n.ch = n.set.SingletonChar()
484 n.set = nil
485 n.t += (ntNotone - ntSet)
486 }
487
488 return n
489}
490
491func (n *regexNode) reverseLeft() *regexNode {
492 if n.options&RightToLeft != 0 && n.t == ntConcatenate && len(n.children) > 0 {
493 //reverse children order
494 for left, right := 0, len(n.children)-1; left < right; left, right = left+1, right-1 {
495 n.children[left], n.children[right] = n.children[right], n.children[left]
496 }
497 }
498
499 return n
500}
501
502func (n *regexNode) makeQuantifier(lazy bool, min, max int) *regexNode {
503 if min == 0 && max == 0 {
504 return newRegexNode(ntEmpty, n.options)
505 }
506
507 if min == 1 && max == 1 {
508 return n
509 }
510
511 switch n.t {
512 case ntOne, ntNotone, ntSet:
513 if lazy {
514 n.makeRep(Onelazy, min, max)
515 } else {
516 n.makeRep(Oneloop, min, max)
517 }
518 return n
519
520 default:
521 var t nodeType
522 if lazy {
523 t = ntLazyloop
524 } else {
525 t = ntLoop
526 }
527 result := newRegexNodeMN(t, n.options, min, max)
528 result.addChild(n)
529 return result
530 }
531}
532
533// debug functions
534
535var typeStr = []string{
536 "Onerep", "Notonerep", "Setrep",
537 "Oneloop", "Notoneloop", "Setloop",
538 "Onelazy", "Notonelazy", "Setlazy",
539 "One", "Notone", "Set",
540 "Multi", "Ref",
541 "Bol", "Eol", "Boundary", "Nonboundary",
542 "Beginning", "Start", "EndZ", "End",
543 "Nothing", "Empty",
544 "Alternate", "Concatenate",
545 "Loop", "Lazyloop",
546 "Capture", "Group", "Require", "Prevent", "Greedy",
547 "Testref", "Testgroup",
548 "Unknown", "Unknown", "Unknown",
549 "Unknown", "Unknown", "Unknown",
550 "ECMABoundary", "NonECMABoundary",
551}
552
553func (n *regexNode) description() string {
554 buf := &bytes.Buffer{}
555
556 buf.WriteString(typeStr[n.t])
557
558 if (n.options & ExplicitCapture) != 0 {
559 buf.WriteString("-C")
560 }
561 if (n.options & IgnoreCase) != 0 {
562 buf.WriteString("-I")
563 }
564 if (n.options & RightToLeft) != 0 {
565 buf.WriteString("-L")
566 }
567 if (n.options & Multiline) != 0 {
568 buf.WriteString("-M")
569 }
570 if (n.options & Singleline) != 0 {
571 buf.WriteString("-S")
572 }
573 if (n.options & IgnorePatternWhitespace) != 0 {
574 buf.WriteString("-X")
575 }
576 if (n.options & ECMAScript) != 0 {
577 buf.WriteString("-E")
578 }
579
580 switch n.t {
581 case ntOneloop, ntNotoneloop, ntOnelazy, ntNotonelazy, ntOne, ntNotone:
582 buf.WriteString("(Ch = " + CharDescription(n.ch) + ")")
583 break
584 case ntCapture:
585 buf.WriteString("(index = " + strconv.Itoa(n.m) + ", unindex = " + strconv.Itoa(n.n) + ")")
586 break
587 case ntRef, ntTestref:
588 buf.WriteString("(index = " + strconv.Itoa(n.m) + ")")
589 break
590 case ntMulti:
591 fmt.Fprintf(buf, "(String = %s)", string(n.str))
592 break
593 case ntSet, ntSetloop, ntSetlazy:
594 buf.WriteString("(Set = " + n.set.String() + ")")
595 break
596 }
597
598 switch n.t {
599 case ntOneloop, ntNotoneloop, ntOnelazy, ntNotonelazy, ntSetloop, ntSetlazy, ntLoop, ntLazyloop:
600 buf.WriteString("(Min = ")
601 buf.WriteString(strconv.Itoa(n.m))
602 buf.WriteString(", Max = ")
603 if n.n == math.MaxInt32 {
604 buf.WriteString("inf")
605 } else {
606 buf.WriteString(strconv.Itoa(n.n))
607 }
608 buf.WriteString(")")
609
610 break
611 }
612
613 return buf.String()
614}
615
616var padSpace = []byte(" ")
617
618func (t *RegexTree) Dump() string {
619 return t.root.dump()
620}
621
622func (n *regexNode) dump() string {
623 var stack []int
624 CurNode := n
625 CurChild := 0
626
627 buf := bytes.NewBufferString(CurNode.description())
628 buf.WriteRune('\n')
629
630 for {
631 if CurNode.children != nil && CurChild < len(CurNode.children) {
632 stack = append(stack, CurChild+1)
633 CurNode = CurNode.children[CurChild]
634 CurChild = 0
635
636 Depth := len(stack)
637 if Depth > 32 {
638 Depth = 32
639 }
640 buf.Write(padSpace[:Depth])
641 buf.WriteString(CurNode.description())
642 buf.WriteRune('\n')
643 } else {
644 if len(stack) == 0 {
645 break
646 }
647
648 CurChild = stack[len(stack)-1]
649 stack = stack[:len(stack)-1]
650 CurNode = CurNode.next
651 }
652 }
653 return buf.String()
654}
diff --git a/vendor/github.com/dlclark/regexp2/syntax/writer.go b/vendor/github.com/dlclark/regexp2/syntax/writer.go
new file mode 100644
index 0000000..a5aa11c
--- /dev/null
+++ b/vendor/github.com/dlclark/regexp2/syntax/writer.go
@@ -0,0 +1,500 @@
1package syntax
2
3import (
4 "bytes"
5 "fmt"
6 "math"
7 "os"
8)
9
10func Write(tree *RegexTree) (*Code, error) {
11 w := writer{
12 intStack: make([]int, 0, 32),
13 emitted: make([]int, 2),
14 stringhash: make(map[string]int),
15 sethash: make(map[string]int),
16 }
17
18 code, err := w.codeFromTree(tree)
19
20 if tree.options&Debug > 0 && code != nil {
21 os.Stdout.WriteString(code.Dump())
22 os.Stdout.WriteString("\n")
23 }
24
25 return code, err
26}
27
28type writer struct {
29 emitted []int
30
31 intStack []int
32 curpos int
33 stringhash map[string]int
34 stringtable [][]rune
35 sethash map[string]int
36 settable []*CharSet
37 counting bool
38 count int
39 trackcount int
40 caps map[int]int
41}
42
43const (
44 beforeChild nodeType = 64
45 afterChild = 128
46 //MaxPrefixSize is the largest number of runes we'll use for a BoyerMoyer prefix
47 MaxPrefixSize = 50
48)
49
50// The top level RegexCode generator. It does a depth-first walk
51// through the tree and calls EmitFragment to emits code before
52// and after each child of an interior node, and at each leaf.
53//
54// It runs two passes, first to count the size of the generated
55// code, and second to generate the code.
56//
57// We should time it against the alternative, which is
58// to just generate the code and grow the array as we go.
59func (w *writer) codeFromTree(tree *RegexTree) (*Code, error) {
60 var (
61 curNode *regexNode
62 curChild int
63 capsize int
64 )
65 // construct sparse capnum mapping if some numbers are unused
66
67 if tree.capnumlist == nil || tree.captop == len(tree.capnumlist) {
68 capsize = tree.captop
69 w.caps = nil
70 } else {
71 capsize = len(tree.capnumlist)
72 w.caps = tree.caps
73 for i := 0; i < len(tree.capnumlist); i++ {
74 w.caps[tree.capnumlist[i]] = i
75 }
76 }
77
78 w.counting = true
79
80 for {
81 if !w.counting {
82 w.emitted = make([]int, w.count)
83 }
84
85 curNode = tree.root
86 curChild = 0
87
88 w.emit1(Lazybranch, 0)
89
90 for {
91 if len(curNode.children) == 0 {
92 w.emitFragment(curNode.t, curNode, 0)
93 } else if curChild < len(curNode.children) {
94 w.emitFragment(curNode.t|beforeChild, curNode, curChild)
95
96 curNode = curNode.children[curChild]
97
98 w.pushInt(curChild)
99 curChild = 0
100 continue
101 }
102
103 if w.emptyStack() {
104 break
105 }
106
107 curChild = w.popInt()
108 curNode = curNode.next
109
110 w.emitFragment(curNode.t|afterChild, curNode, curChild)
111 curChild++
112 }
113
114 w.patchJump(0, w.curPos())
115 w.emit(Stop)
116
117 if !w.counting {
118 break
119 }
120
121 w.counting = false
122 }
123
124 fcPrefix := getFirstCharsPrefix(tree)
125 prefix := getPrefix(tree)
126 rtl := (tree.options & RightToLeft) != 0
127
128 var bmPrefix *BmPrefix
129 //TODO: benchmark string prefixes
130 if prefix != nil && len(prefix.PrefixStr) > 0 && MaxPrefixSize > 0 {
131 if len(prefix.PrefixStr) > MaxPrefixSize {
132 // limit prefix changes to 10k
133 prefix.PrefixStr = prefix.PrefixStr[:MaxPrefixSize]
134 }
135 bmPrefix = newBmPrefix(prefix.PrefixStr, prefix.CaseInsensitive, rtl)
136 } else {
137 bmPrefix = nil
138 }
139
140 return &Code{
141 Codes: w.emitted,
142 Strings: w.stringtable,
143 Sets: w.settable,
144 TrackCount: w.trackcount,
145 Caps: w.caps,
146 Capsize: capsize,
147 FcPrefix: fcPrefix,
148 BmPrefix: bmPrefix,
149 Anchors: getAnchors(tree),
150 RightToLeft: rtl,
151 }, nil
152}
153
154// The main RegexCode generator. It does a depth-first walk
155// through the tree and calls EmitFragment to emits code before
156// and after each child of an interior node, and at each leaf.
157func (w *writer) emitFragment(nodetype nodeType, node *regexNode, curIndex int) error {
158 bits := InstOp(0)
159
160 if nodetype <= ntRef {
161 if (node.options & RightToLeft) != 0 {
162 bits |= Rtl
163 }
164 if (node.options & IgnoreCase) != 0 {
165 bits |= Ci
166 }
167 }
168 ntBits := nodeType(bits)
169
170 switch nodetype {
171 case ntConcatenate | beforeChild, ntConcatenate | afterChild, ntEmpty:
172 break
173
174 case ntAlternate | beforeChild:
175 if curIndex < len(node.children)-1 {
176 w.pushInt(w.curPos())
177 w.emit1(Lazybranch, 0)
178 }
179
180 case ntAlternate | afterChild:
181 if curIndex < len(node.children)-1 {
182 lbPos := w.popInt()
183 w.pushInt(w.curPos())
184 w.emit1(Goto, 0)
185 w.patchJump(lbPos, w.curPos())
186 } else {
187 for i := 0; i < curIndex; i++ {
188 w.patchJump(w.popInt(), w.curPos())
189 }
190 }
191 break
192
193 case ntTestref | beforeChild:
194 if curIndex == 0 {
195 w.emit(Setjump)
196 w.pushInt(w.curPos())
197 w.emit1(Lazybranch, 0)
198 w.emit1(Testref, w.mapCapnum(node.m))
199 w.emit(Forejump)
200 }
201
202 case ntTestref | afterChild:
203 if curIndex == 0 {
204 branchpos := w.popInt()
205 w.pushInt(w.curPos())
206 w.emit1(Goto, 0)
207 w.patchJump(branchpos, w.curPos())
208 w.emit(Forejump)
209 if len(node.children) <= 1 {
210 w.patchJump(w.popInt(), w.curPos())
211 }
212 } else if curIndex == 1 {
213 w.patchJump(w.popInt(), w.curPos())
214 }
215
216 case ntTestgroup | beforeChild:
217 if curIndex == 0 {
218 w.emit(Setjump)
219 w.emit(Setmark)
220 w.pushInt(w.curPos())
221 w.emit1(Lazybranch, 0)
222 }
223
224 case ntTestgroup | afterChild:
225 if curIndex == 0 {
226 w.emit(Getmark)
227 w.emit(Forejump)
228 } else if curIndex == 1 {
229 Branchpos := w.popInt()
230 w.pushInt(w.curPos())
231 w.emit1(Goto, 0)
232 w.patchJump(Branchpos, w.curPos())
233 w.emit(Getmark)
234 w.emit(Forejump)
235 if len(node.children) <= 2 {
236 w.patchJump(w.popInt(), w.curPos())
237 }
238 } else if curIndex == 2 {
239 w.patchJump(w.popInt(), w.curPos())
240 }
241
242 case ntLoop | beforeChild, ntLazyloop | beforeChild:
243
244 if node.n < math.MaxInt32 || node.m > 1 {
245 if node.m == 0 {
246 w.emit1(Nullcount, 0)
247 } else {
248 w.emit1(Setcount, 1-node.m)
249 }
250 } else if node.m == 0 {
251 w.emit(Nullmark)
252 } else {
253 w.emit(Setmark)
254 }
255
256 if node.m == 0 {
257 w.pushInt(w.curPos())
258 w.emit1(Goto, 0)
259 }
260 w.pushInt(w.curPos())
261
262 case ntLoop | afterChild, ntLazyloop | afterChild:
263
264 startJumpPos := w.curPos()
265 lazy := (nodetype - (ntLoop | afterChild))
266
267 if node.n < math.MaxInt32 || node.m > 1 {
268 if node.n == math.MaxInt32 {
269 w.emit2(InstOp(Branchcount+lazy), w.popInt(), math.MaxInt32)
270 } else {
271 w.emit2(InstOp(Branchcount+lazy), w.popInt(), node.n-node.m)
272 }
273 } else {
274 w.emit1(InstOp(Branchmark+lazy), w.popInt())
275 }
276
277 if node.m == 0 {
278 w.patchJump(w.popInt(), startJumpPos)
279 }
280
281 case ntGroup | beforeChild, ntGroup | afterChild:
282
283 case ntCapture | beforeChild:
284 w.emit(Setmark)
285
286 case ntCapture | afterChild:
287 w.emit2(Capturemark, w.mapCapnum(node.m), w.mapCapnum(node.n))
288
289 case ntRequire | beforeChild:
290 // NOTE: the following line causes lookahead/lookbehind to be
291 // NON-BACKTRACKING. It can be commented out with (*)
292 w.emit(Setjump)
293
294 w.emit(Setmark)
295
296 case ntRequire | afterChild:
297 w.emit(Getmark)
298
299 // NOTE: the following line causes lookahead/lookbehind to be
300 // NON-BACKTRACKING. It can be commented out with (*)
301 w.emit(Forejump)
302
303 case ntPrevent | beforeChild:
304 w.emit(Setjump)
305 w.pushInt(w.curPos())
306 w.emit1(Lazybranch, 0)
307
308 case ntPrevent | afterChild:
309 w.emit(Backjump)
310 w.patchJump(w.popInt(), w.curPos())
311 w.emit(Forejump)
312
313 case ntGreedy | beforeChild:
314 w.emit(Setjump)
315
316 case ntGreedy | afterChild:
317 w.emit(Forejump)
318
319 case ntOne, ntNotone:
320 w.emit1(InstOp(node.t|ntBits), int(node.ch))
321
322 case ntNotoneloop, ntNotonelazy, ntOneloop, ntOnelazy:
323 if node.m > 0 {
324 if node.t == ntOneloop || node.t == ntOnelazy {
325 w.emit2(Onerep|bits, int(node.ch), node.m)
326 } else {
327 w.emit2(Notonerep|bits, int(node.ch), node.m)
328 }
329 }
330 if node.n > node.m {
331 if node.n == math.MaxInt32 {
332 w.emit2(InstOp(node.t|ntBits), int(node.ch), math.MaxInt32)
333 } else {
334 w.emit2(InstOp(node.t|ntBits), int(node.ch), node.n-node.m)
335 }
336 }
337
338 case ntSetloop, ntSetlazy:
339 if node.m > 0 {
340 w.emit2(Setrep|bits, w.setCode(node.set), node.m)
341 }
342 if node.n > node.m {
343 if node.n == math.MaxInt32 {
344 w.emit2(InstOp(node.t|ntBits), w.setCode(node.set), math.MaxInt32)
345 } else {
346 w.emit2(InstOp(node.t|ntBits), w.setCode(node.set), node.n-node.m)
347 }
348 }
349
350 case ntMulti:
351 w.emit1(InstOp(node.t|ntBits), w.stringCode(node.str))
352
353 case ntSet:
354 w.emit1(InstOp(node.t|ntBits), w.setCode(node.set))
355
356 case ntRef:
357 w.emit1(InstOp(node.t|ntBits), w.mapCapnum(node.m))
358
359 case ntNothing, ntBol, ntEol, ntBoundary, ntNonboundary, ntECMABoundary, ntNonECMABoundary, ntBeginning, ntStart, ntEndZ, ntEnd:
360 w.emit(InstOp(node.t))
361
362 default:
363 return fmt.Errorf("unexpected opcode in regular expression generation: %v", nodetype)
364 }
365
366 return nil
367}
368
369// To avoid recursion, we use a simple integer stack.
370// This is the push.
371func (w *writer) pushInt(i int) {
372 w.intStack = append(w.intStack, i)
373}
374
375// Returns true if the stack is empty.
376func (w *writer) emptyStack() bool {
377 return len(w.intStack) == 0
378}
379
380// This is the pop.
381func (w *writer) popInt() int {
382 //get our item
383 idx := len(w.intStack) - 1
384 i := w.intStack[idx]
385 //trim our slice
386 w.intStack = w.intStack[:idx]
387 return i
388}
389
390// Returns the current position in the emitted code.
391func (w *writer) curPos() int {
392 return w.curpos
393}
394
395// Fixes up a jump instruction at the specified offset
396// so that it jumps to the specified jumpDest.
397func (w *writer) patchJump(offset, jumpDest int) {
398 w.emitted[offset+1] = jumpDest
399}
400
401// Returns an index in the set table for a charset
402// uses a map to eliminate duplicates.
403func (w *writer) setCode(set *CharSet) int {
404 if w.counting {
405 return 0
406 }
407
408 buf := &bytes.Buffer{}
409
410 set.mapHashFill(buf)
411 hash := buf.String()
412 i, ok := w.sethash[hash]
413 if !ok {
414 i = len(w.sethash)
415 w.sethash[hash] = i
416 w.settable = append(w.settable, set)
417 }
418 return i
419}
420
421// Returns an index in the string table for a string.
422// uses a map to eliminate duplicates.
423func (w *writer) stringCode(str []rune) int {
424 if w.counting {
425 return 0
426 }
427
428 hash := string(str)
429 i, ok := w.stringhash[hash]
430 if !ok {
431 i = len(w.stringhash)
432 w.stringhash[hash] = i
433 w.stringtable = append(w.stringtable, str)
434 }
435
436 return i
437}
438
439// When generating code on a regex that uses a sparse set
440// of capture slots, we hash them to a dense set of indices
441// for an array of capture slots. Instead of doing the hash
442// at match time, it's done at compile time, here.
443func (w *writer) mapCapnum(capnum int) int {
444 if capnum == -1 {
445 return -1
446 }
447
448 if w.caps != nil {
449 return w.caps[capnum]
450 }
451
452 return capnum
453}
454
455// Emits a zero-argument operation. Note that the emit
456// functions all run in two modes: they can emit code, or
457// they can just count the size of the code.
458func (w *writer) emit(op InstOp) {
459 if w.counting {
460 w.count++
461 if opcodeBacktracks(op) {
462 w.trackcount++
463 }
464 return
465 }
466 w.emitted[w.curpos] = int(op)
467 w.curpos++
468}
469
470// Emits a one-argument operation.
471func (w *writer) emit1(op InstOp, opd1 int) {
472 if w.counting {
473 w.count += 2
474 if opcodeBacktracks(op) {
475 w.trackcount++
476 }
477 return
478 }
479 w.emitted[w.curpos] = int(op)
480 w.curpos++
481 w.emitted[w.curpos] = opd1
482 w.curpos++
483}
484
485// Emits a two-argument operation.
486func (w *writer) emit2(op InstOp, opd1, opd2 int) {
487 if w.counting {
488 w.count += 3
489 if opcodeBacktracks(op) {
490 w.trackcount++
491 }
492 return
493 }
494 w.emitted[w.curpos] = int(op)
495 w.curpos++
496 w.emitted[w.curpos] = opd1
497 w.curpos++
498 w.emitted[w.curpos] = opd2
499 w.curpos++
500}