diff options
Diffstat (limited to 'vendor/github.com/dlclark/regexp2/syntax')
| -rw-r--r-- | vendor/github.com/dlclark/regexp2/syntax/charclass.go | 865 | ||||
| -rw-r--r-- | vendor/github.com/dlclark/regexp2/syntax/code.go | 274 | ||||
| -rw-r--r-- | vendor/github.com/dlclark/regexp2/syntax/escape.go | 94 | ||||
| -rw-r--r-- | vendor/github.com/dlclark/regexp2/syntax/fuzz.go | 20 | ||||
| -rw-r--r-- | vendor/github.com/dlclark/regexp2/syntax/parser.go | 2251 | ||||
| -rw-r--r-- | vendor/github.com/dlclark/regexp2/syntax/prefix.go | 896 | ||||
| -rw-r--r-- | vendor/github.com/dlclark/regexp2/syntax/replacerdata.go | 87 | ||||
| -rw-r--r-- | vendor/github.com/dlclark/regexp2/syntax/tree.go | 654 | ||||
| -rw-r--r-- | vendor/github.com/dlclark/regexp2/syntax/writer.go | 500 |
9 files changed, 5641 insertions, 0 deletions
diff --git a/vendor/github.com/dlclark/regexp2/syntax/charclass.go b/vendor/github.com/dlclark/regexp2/syntax/charclass.go new file mode 100644 index 0000000..6881a0e --- /dev/null +++ b/vendor/github.com/dlclark/regexp2/syntax/charclass.go | |||
| @@ -0,0 +1,865 @@ | |||
| 1 | package syntax | ||
| 2 | |||
| 3 | import ( | ||
| 4 | "bytes" | ||
| 5 | "encoding/binary" | ||
| 6 | "fmt" | ||
| 7 | "sort" | ||
| 8 | "unicode" | ||
| 9 | "unicode/utf8" | ||
| 10 | ) | ||
| 11 | |||
| 12 | // CharSet combines start-end rune ranges and unicode categories representing a set of characters | ||
| 13 | type CharSet struct { | ||
| 14 | ranges []singleRange | ||
| 15 | categories []category | ||
| 16 | sub *CharSet //optional subtractor | ||
| 17 | negate bool | ||
| 18 | anything bool | ||
| 19 | } | ||
| 20 | |||
| 21 | type category struct { | ||
| 22 | negate bool | ||
| 23 | cat string | ||
| 24 | } | ||
| 25 | |||
| 26 | type singleRange struct { | ||
| 27 | first rune | ||
| 28 | last rune | ||
| 29 | } | ||
| 30 | |||
| 31 | const ( | ||
| 32 | spaceCategoryText = " " | ||
| 33 | wordCategoryText = "W" | ||
| 34 | ) | ||
| 35 | |||
| 36 | var ( | ||
| 37 | ecmaSpace = []rune{0x0009, 0x000e, 0x0020, 0x0021, 0x00a0, 0x00a1, 0x1680, 0x1681, 0x2000, 0x200b, 0x2028, 0x202a, 0x202f, 0x2030, 0x205f, 0x2060, 0x3000, 0x3001, 0xfeff, 0xff00} | ||
| 38 | ecmaWord = []rune{0x0030, 0x003a, 0x0041, 0x005b, 0x005f, 0x0060, 0x0061, 0x007b} | ||
| 39 | ecmaDigit = []rune{0x0030, 0x003a} | ||
| 40 | |||
| 41 | re2Space = []rune{0x0009, 0x000b, 0x000c, 0x000e, 0x0020, 0x0021} | ||
| 42 | ) | ||
| 43 | |||
| 44 | var ( | ||
| 45 | AnyClass = getCharSetFromOldString([]rune{0}, false) | ||
| 46 | ECMAAnyClass = getCharSetFromOldString([]rune{0, 0x000a, 0x000b, 0x000d, 0x000e}, false) | ||
| 47 | NoneClass = getCharSetFromOldString(nil, false) | ||
| 48 | ECMAWordClass = getCharSetFromOldString(ecmaWord, false) | ||
| 49 | NotECMAWordClass = getCharSetFromOldString(ecmaWord, true) | ||
| 50 | ECMASpaceClass = getCharSetFromOldString(ecmaSpace, false) | ||
| 51 | NotECMASpaceClass = getCharSetFromOldString(ecmaSpace, true) | ||
| 52 | ECMADigitClass = getCharSetFromOldString(ecmaDigit, false) | ||
| 53 | NotECMADigitClass = getCharSetFromOldString(ecmaDigit, true) | ||
| 54 | |||
| 55 | WordClass = getCharSetFromCategoryString(false, false, wordCategoryText) | ||
| 56 | NotWordClass = getCharSetFromCategoryString(true, false, wordCategoryText) | ||
| 57 | SpaceClass = getCharSetFromCategoryString(false, false, spaceCategoryText) | ||
| 58 | NotSpaceClass = getCharSetFromCategoryString(true, false, spaceCategoryText) | ||
| 59 | DigitClass = getCharSetFromCategoryString(false, false, "Nd") | ||
| 60 | NotDigitClass = getCharSetFromCategoryString(false, true, "Nd") | ||
| 61 | |||
| 62 | RE2SpaceClass = getCharSetFromOldString(re2Space, false) | ||
| 63 | NotRE2SpaceClass = getCharSetFromOldString(re2Space, true) | ||
| 64 | ) | ||
| 65 | |||
| 66 | var unicodeCategories = func() map[string]*unicode.RangeTable { | ||
| 67 | retVal := make(map[string]*unicode.RangeTable) | ||
| 68 | for k, v := range unicode.Scripts { | ||
| 69 | retVal[k] = v | ||
| 70 | } | ||
| 71 | for k, v := range unicode.Categories { | ||
| 72 | retVal[k] = v | ||
| 73 | } | ||
| 74 | for k, v := range unicode.Properties { | ||
| 75 | retVal[k] = v | ||
| 76 | } | ||
| 77 | return retVal | ||
| 78 | }() | ||
| 79 | |||
| 80 | func getCharSetFromCategoryString(negateSet bool, negateCat bool, cats ...string) func() *CharSet { | ||
| 81 | if negateCat && negateSet { | ||
| 82 | panic("BUG! You should only negate the set OR the category in a constant setup, but not both") | ||
| 83 | } | ||
| 84 | |||
| 85 | c := CharSet{negate: negateSet} | ||
| 86 | |||
| 87 | c.categories = make([]category, len(cats)) | ||
| 88 | for i, cat := range cats { | ||
| 89 | c.categories[i] = category{cat: cat, negate: negateCat} | ||
| 90 | } | ||
| 91 | return func() *CharSet { | ||
| 92 | //make a copy each time | ||
| 93 | local := c | ||
| 94 | //return that address | ||
| 95 | return &local | ||
| 96 | } | ||
| 97 | } | ||
| 98 | |||
| 99 | func getCharSetFromOldString(setText []rune, negate bool) func() *CharSet { | ||
| 100 | c := CharSet{} | ||
| 101 | if len(setText) > 0 { | ||
| 102 | fillFirst := false | ||
| 103 | l := len(setText) | ||
| 104 | if negate { | ||
| 105 | if setText[0] == 0 { | ||
| 106 | setText = setText[1:] | ||
| 107 | } else { | ||
| 108 | l++ | ||
| 109 | fillFirst = true | ||
| 110 | } | ||
| 111 | } | ||
| 112 | |||
| 113 | if l%2 == 0 { | ||
| 114 | c.ranges = make([]singleRange, l/2) | ||
| 115 | } else { | ||
| 116 | c.ranges = make([]singleRange, l/2+1) | ||
| 117 | } | ||
| 118 | |||
| 119 | first := true | ||
| 120 | if fillFirst { | ||
| 121 | c.ranges[0] = singleRange{first: 0} | ||
| 122 | first = false | ||
| 123 | } | ||
| 124 | |||
| 125 | i := 0 | ||
| 126 | for _, r := range setText { | ||
| 127 | if first { | ||
| 128 | // lower bound in a new range | ||
| 129 | c.ranges[i] = singleRange{first: r} | ||
| 130 | first = false | ||
| 131 | } else { | ||
| 132 | c.ranges[i].last = r - 1 | ||
| 133 | i++ | ||
| 134 | first = true | ||
| 135 | } | ||
| 136 | } | ||
| 137 | if !first { | ||
| 138 | c.ranges[i].last = utf8.MaxRune | ||
| 139 | } | ||
| 140 | } | ||
| 141 | |||
| 142 | return func() *CharSet { | ||
| 143 | local := c | ||
| 144 | return &local | ||
| 145 | } | ||
| 146 | } | ||
| 147 | |||
| 148 | // Copy makes a deep copy to prevent accidental mutation of a set | ||
| 149 | func (c CharSet) Copy() CharSet { | ||
| 150 | ret := CharSet{ | ||
| 151 | anything: c.anything, | ||
| 152 | negate: c.negate, | ||
| 153 | } | ||
| 154 | |||
| 155 | ret.ranges = append(ret.ranges, c.ranges...) | ||
| 156 | ret.categories = append(ret.categories, c.categories...) | ||
| 157 | |||
| 158 | if c.sub != nil { | ||
| 159 | sub := c.sub.Copy() | ||
| 160 | ret.sub = &sub | ||
| 161 | } | ||
| 162 | |||
| 163 | return ret | ||
| 164 | } | ||
| 165 | |||
| 166 | // gets a human-readable description for a set string | ||
| 167 | func (c CharSet) String() string { | ||
| 168 | buf := &bytes.Buffer{} | ||
| 169 | buf.WriteRune('[') | ||
| 170 | |||
| 171 | if c.IsNegated() { | ||
| 172 | buf.WriteRune('^') | ||
| 173 | } | ||
| 174 | |||
| 175 | for _, r := range c.ranges { | ||
| 176 | |||
| 177 | buf.WriteString(CharDescription(r.first)) | ||
| 178 | if r.first != r.last { | ||
| 179 | if r.last-r.first != 1 { | ||
| 180 | //groups that are 1 char apart skip the dash | ||
| 181 | buf.WriteRune('-') | ||
| 182 | } | ||
| 183 | buf.WriteString(CharDescription(r.last)) | ||
| 184 | } | ||
| 185 | } | ||
| 186 | |||
| 187 | for _, c := range c.categories { | ||
| 188 | buf.WriteString(c.String()) | ||
| 189 | } | ||
| 190 | |||
| 191 | if c.sub != nil { | ||
| 192 | buf.WriteRune('-') | ||
| 193 | buf.WriteString(c.sub.String()) | ||
| 194 | } | ||
| 195 | |||
| 196 | buf.WriteRune(']') | ||
| 197 | |||
| 198 | return buf.String() | ||
| 199 | } | ||
| 200 | |||
| 201 | // mapHashFill converts a charset into a buffer for use in maps | ||
| 202 | func (c CharSet) mapHashFill(buf *bytes.Buffer) { | ||
| 203 | if c.negate { | ||
| 204 | buf.WriteByte(0) | ||
| 205 | } else { | ||
| 206 | buf.WriteByte(1) | ||
| 207 | } | ||
| 208 | |||
| 209 | binary.Write(buf, binary.LittleEndian, len(c.ranges)) | ||
| 210 | binary.Write(buf, binary.LittleEndian, len(c.categories)) | ||
| 211 | for _, r := range c.ranges { | ||
| 212 | buf.WriteRune(r.first) | ||
| 213 | buf.WriteRune(r.last) | ||
| 214 | } | ||
| 215 | for _, ct := range c.categories { | ||
| 216 | buf.WriteString(ct.cat) | ||
| 217 | if ct.negate { | ||
| 218 | buf.WriteByte(1) | ||
| 219 | } else { | ||
| 220 | buf.WriteByte(0) | ||
| 221 | } | ||
| 222 | } | ||
| 223 | |||
| 224 | if c.sub != nil { | ||
| 225 | c.sub.mapHashFill(buf) | ||
| 226 | } | ||
| 227 | } | ||
| 228 | |||
| 229 | // CharIn returns true if the rune is in our character set (either ranges or categories). | ||
| 230 | // It handles negations and subtracted sub-charsets. | ||
| 231 | func (c CharSet) CharIn(ch rune) bool { | ||
| 232 | val := false | ||
| 233 | // in s && !s.subtracted | ||
| 234 | |||
| 235 | //check ranges | ||
| 236 | for _, r := range c.ranges { | ||
| 237 | if ch < r.first { | ||
| 238 | continue | ||
| 239 | } | ||
| 240 | if ch <= r.last { | ||
| 241 | val = true | ||
| 242 | break | ||
| 243 | } | ||
| 244 | } | ||
| 245 | |||
| 246 | //check categories if we haven't already found a range | ||
| 247 | if !val && len(c.categories) > 0 { | ||
| 248 | for _, ct := range c.categories { | ||
| 249 | // special categories...then unicode | ||
| 250 | if ct.cat == spaceCategoryText { | ||
| 251 | if unicode.IsSpace(ch) { | ||
| 252 | // we found a space so we're done | ||
| 253 | // negate means this is a "bad" thing | ||
| 254 | val = !ct.negate | ||
| 255 | break | ||
| 256 | } else if ct.negate { | ||
| 257 | val = true | ||
| 258 | break | ||
| 259 | } | ||
| 260 | } else if ct.cat == wordCategoryText { | ||
| 261 | if IsWordChar(ch) { | ||
| 262 | val = !ct.negate | ||
| 263 | break | ||
| 264 | } else if ct.negate { | ||
| 265 | val = true | ||
| 266 | break | ||
| 267 | } | ||
| 268 | } else if unicode.Is(unicodeCategories[ct.cat], ch) { | ||
| 269 | // if we're in this unicode category then we're done | ||
| 270 | // if negate=true on this category then we "failed" our test | ||
| 271 | // otherwise we're good that we found it | ||
| 272 | val = !ct.negate | ||
| 273 | break | ||
| 274 | } else if ct.negate { | ||
| 275 | val = true | ||
| 276 | break | ||
| 277 | } | ||
| 278 | } | ||
| 279 | } | ||
| 280 | |||
| 281 | // negate the whole char set | ||
| 282 | if c.negate { | ||
| 283 | val = !val | ||
| 284 | } | ||
| 285 | |||
| 286 | // get subtracted recurse | ||
| 287 | if val && c.sub != nil { | ||
| 288 | val = !c.sub.CharIn(ch) | ||
| 289 | } | ||
| 290 | |||
| 291 | //log.Printf("Char '%v' in %v == %v", string(ch), c.String(), val) | ||
| 292 | return val | ||
| 293 | } | ||
| 294 | |||
| 295 | func (c category) String() string { | ||
| 296 | switch c.cat { | ||
| 297 | case spaceCategoryText: | ||
| 298 | if c.negate { | ||
| 299 | return "\\S" | ||
| 300 | } | ||
| 301 | return "\\s" | ||
| 302 | case wordCategoryText: | ||
| 303 | if c.negate { | ||
| 304 | return "\\W" | ||
| 305 | } | ||
| 306 | return "\\w" | ||
| 307 | } | ||
| 308 | if _, ok := unicodeCategories[c.cat]; ok { | ||
| 309 | |||
| 310 | if c.negate { | ||
| 311 | return "\\P{" + c.cat + "}" | ||
| 312 | } | ||
| 313 | return "\\p{" + c.cat + "}" | ||
| 314 | } | ||
| 315 | return "Unknown category: " + c.cat | ||
| 316 | } | ||
| 317 | |||
| 318 | // CharDescription Produces a human-readable description for a single character. | ||
| 319 | func CharDescription(ch rune) string { | ||
| 320 | /*if ch == '\\' { | ||
| 321 | return "\\\\" | ||
| 322 | } | ||
| 323 | |||
| 324 | if ch > ' ' && ch <= '~' { | ||
| 325 | return string(ch) | ||
| 326 | } else if ch == '\n' { | ||
| 327 | return "\\n" | ||
| 328 | } else if ch == ' ' { | ||
| 329 | return "\\ " | ||
| 330 | }*/ | ||
| 331 | |||
| 332 | b := &bytes.Buffer{} | ||
| 333 | escape(b, ch, false) //fmt.Sprintf("%U", ch) | ||
| 334 | return b.String() | ||
| 335 | } | ||
| 336 | |||
| 337 | // According to UTS#18 Unicode Regular Expressions (http://www.unicode.org/reports/tr18/) | ||
| 338 | // RL 1.4 Simple Word Boundaries The class of <word_character> includes all Alphabetic | ||
| 339 | // values from the Unicode character database, from UnicodeData.txt [UData], plus the U+200C | ||
| 340 | // ZERO WIDTH NON-JOINER and U+200D ZERO WIDTH JOINER. | ||
| 341 | func IsWordChar(r rune) bool { | ||
| 342 | //"L", "Mn", "Nd", "Pc" | ||
| 343 | return unicode.In(r, | ||
| 344 | unicode.Categories["L"], unicode.Categories["Mn"], | ||
| 345 | unicode.Categories["Nd"], unicode.Categories["Pc"]) || r == '\u200D' || r == '\u200C' | ||
| 346 | //return 'A' <= r && r <= 'Z' || 'a' <= r && r <= 'z' || '0' <= r && r <= '9' || r == '_' | ||
| 347 | } | ||
| 348 | |||
| 349 | func IsECMAWordChar(r rune) bool { | ||
| 350 | return unicode.In(r, | ||
| 351 | unicode.Categories["L"], unicode.Categories["Mn"], | ||
| 352 | unicode.Categories["Nd"], unicode.Categories["Pc"]) | ||
| 353 | |||
| 354 | //return 'A' <= r && r <= 'Z' || 'a' <= r && r <= 'z' || '0' <= r && r <= '9' || r == '_' | ||
| 355 | } | ||
| 356 | |||
| 357 | // SingletonChar will return the char from the first range without validation. | ||
| 358 | // It assumes you have checked for IsSingleton or IsSingletonInverse and will panic given bad input | ||
| 359 | func (c CharSet) SingletonChar() rune { | ||
| 360 | return c.ranges[0].first | ||
| 361 | } | ||
| 362 | |||
| 363 | func (c CharSet) IsSingleton() bool { | ||
| 364 | return !c.negate && //negated is multiple chars | ||
| 365 | len(c.categories) == 0 && len(c.ranges) == 1 && // multiple ranges and unicode classes represent multiple chars | ||
| 366 | c.sub == nil && // subtraction means we've got multiple chars | ||
| 367 | c.ranges[0].first == c.ranges[0].last // first and last equal means we're just 1 char | ||
| 368 | } | ||
| 369 | |||
| 370 | func (c CharSet) IsSingletonInverse() bool { | ||
| 371 | return c.negate && //same as above, but requires negated | ||
| 372 | len(c.categories) == 0 && len(c.ranges) == 1 && // multiple ranges and unicode classes represent multiple chars | ||
| 373 | c.sub == nil && // subtraction means we've got multiple chars | ||
| 374 | c.ranges[0].first == c.ranges[0].last // first and last equal means we're just 1 char | ||
| 375 | } | ||
| 376 | |||
| 377 | func (c CharSet) IsMergeable() bool { | ||
| 378 | return !c.IsNegated() && !c.HasSubtraction() | ||
| 379 | } | ||
| 380 | |||
| 381 | func (c CharSet) IsNegated() bool { | ||
| 382 | return c.negate | ||
| 383 | } | ||
| 384 | |||
| 385 | func (c CharSet) HasSubtraction() bool { | ||
| 386 | return c.sub != nil | ||
| 387 | } | ||
| 388 | |||
| 389 | func (c CharSet) IsEmpty() bool { | ||
| 390 | return len(c.ranges) == 0 && len(c.categories) == 0 && c.sub == nil | ||
| 391 | } | ||
| 392 | |||
| 393 | func (c *CharSet) addDigit(ecma, negate bool, pattern string) { | ||
| 394 | if ecma { | ||
| 395 | if negate { | ||
| 396 | c.addRanges(NotECMADigitClass().ranges) | ||
| 397 | } else { | ||
| 398 | c.addRanges(ECMADigitClass().ranges) | ||
| 399 | } | ||
| 400 | } else { | ||
| 401 | c.addCategories(category{cat: "Nd", negate: negate}) | ||
| 402 | } | ||
| 403 | } | ||
| 404 | |||
| 405 | func (c *CharSet) addChar(ch rune) { | ||
| 406 | c.addRange(ch, ch) | ||
| 407 | } | ||
| 408 | |||
| 409 | func (c *CharSet) addSpace(ecma, re2, negate bool) { | ||
| 410 | if ecma { | ||
| 411 | if negate { | ||
| 412 | c.addRanges(NotECMASpaceClass().ranges) | ||
| 413 | } else { | ||
| 414 | c.addRanges(ECMASpaceClass().ranges) | ||
| 415 | } | ||
| 416 | } else if re2 { | ||
| 417 | if negate { | ||
| 418 | c.addRanges(NotRE2SpaceClass().ranges) | ||
| 419 | } else { | ||
| 420 | c.addRanges(RE2SpaceClass().ranges) | ||
| 421 | } | ||
| 422 | } else { | ||
| 423 | c.addCategories(category{cat: spaceCategoryText, negate: negate}) | ||
| 424 | } | ||
| 425 | } | ||
| 426 | |||
| 427 | func (c *CharSet) addWord(ecma, negate bool) { | ||
| 428 | if ecma { | ||
| 429 | if negate { | ||
| 430 | c.addRanges(NotECMAWordClass().ranges) | ||
| 431 | } else { | ||
| 432 | c.addRanges(ECMAWordClass().ranges) | ||
| 433 | } | ||
| 434 | } else { | ||
| 435 | c.addCategories(category{cat: wordCategoryText, negate: negate}) | ||
| 436 | } | ||
| 437 | } | ||
| 438 | |||
| 439 | // Add set ranges and categories into ours -- no deduping or anything | ||
| 440 | func (c *CharSet) addSet(set CharSet) { | ||
| 441 | if c.anything { | ||
| 442 | return | ||
| 443 | } | ||
| 444 | if set.anything { | ||
| 445 | c.makeAnything() | ||
| 446 | return | ||
| 447 | } | ||
| 448 | // just append here to prevent double-canon | ||
| 449 | c.ranges = append(c.ranges, set.ranges...) | ||
| 450 | c.addCategories(set.categories...) | ||
| 451 | c.canonicalize() | ||
| 452 | } | ||
| 453 | |||
| 454 | func (c *CharSet) makeAnything() { | ||
| 455 | c.anything = true | ||
| 456 | c.categories = []category{} | ||
| 457 | c.ranges = AnyClass().ranges | ||
| 458 | } | ||
| 459 | |||
| 460 | func (c *CharSet) addCategories(cats ...category) { | ||
| 461 | // don't add dupes and remove positive+negative | ||
| 462 | if c.anything { | ||
| 463 | // if we've had a previous positive+negative group then | ||
| 464 | // just return, we're as broad as we can get | ||
| 465 | return | ||
| 466 | } | ||
| 467 | |||
| 468 | for _, ct := range cats { | ||
| 469 | found := false | ||
| 470 | for _, ct2 := range c.categories { | ||
| 471 | if ct.cat == ct2.cat { | ||
| 472 | if ct.negate != ct2.negate { | ||
| 473 | // oposite negations...this mean we just | ||
| 474 | // take us as anything and move on | ||
| 475 | c.makeAnything() | ||
| 476 | return | ||
| 477 | } | ||
| 478 | found = true | ||
| 479 | break | ||
| 480 | } | ||
| 481 | } | ||
| 482 | |||
| 483 | if !found { | ||
| 484 | c.categories = append(c.categories, ct) | ||
| 485 | } | ||
| 486 | } | ||
| 487 | } | ||
| 488 | |||
| 489 | // Merges new ranges to our own | ||
| 490 | func (c *CharSet) addRanges(ranges []singleRange) { | ||
| 491 | if c.anything { | ||
| 492 | return | ||
| 493 | } | ||
| 494 | c.ranges = append(c.ranges, ranges...) | ||
| 495 | c.canonicalize() | ||
| 496 | } | ||
| 497 | |||
| 498 | // Merges everything but the new ranges into our own | ||
| 499 | func (c *CharSet) addNegativeRanges(ranges []singleRange) { | ||
| 500 | if c.anything { | ||
| 501 | return | ||
| 502 | } | ||
| 503 | |||
| 504 | var hi rune | ||
| 505 | |||
| 506 | // convert incoming ranges into opposites, assume they are in order | ||
| 507 | for _, r := range ranges { | ||
| 508 | if hi < r.first { | ||
| 509 | c.ranges = append(c.ranges, singleRange{hi, r.first - 1}) | ||
| 510 | } | ||
| 511 | hi = r.last + 1 | ||
| 512 | } | ||
| 513 | |||
| 514 | if hi < utf8.MaxRune { | ||
| 515 | c.ranges = append(c.ranges, singleRange{hi, utf8.MaxRune}) | ||
| 516 | } | ||
| 517 | |||
| 518 | c.canonicalize() | ||
| 519 | } | ||
| 520 | |||
| 521 | func isValidUnicodeCat(catName string) bool { | ||
| 522 | _, ok := unicodeCategories[catName] | ||
| 523 | return ok | ||
| 524 | } | ||
| 525 | |||
| 526 | func (c *CharSet) addCategory(categoryName string, negate, caseInsensitive bool, pattern string) { | ||
| 527 | if !isValidUnicodeCat(categoryName) { | ||
| 528 | // unknown unicode category, script, or property "blah" | ||
| 529 | panic(fmt.Errorf("Unknown unicode category, script, or property '%v'", categoryName)) | ||
| 530 | |||
| 531 | } | ||
| 532 | |||
| 533 | if caseInsensitive && (categoryName == "Ll" || categoryName == "Lu" || categoryName == "Lt") { | ||
| 534 | // when RegexOptions.IgnoreCase is specified then {Ll} {Lu} and {Lt} cases should all match | ||
| 535 | c.addCategories( | ||
| 536 | category{cat: "Ll", negate: negate}, | ||
| 537 | category{cat: "Lu", negate: negate}, | ||
| 538 | category{cat: "Lt", negate: negate}) | ||
| 539 | } | ||
| 540 | c.addCategories(category{cat: categoryName, negate: negate}) | ||
| 541 | } | ||
| 542 | |||
| 543 | func (c *CharSet) addSubtraction(sub *CharSet) { | ||
| 544 | c.sub = sub | ||
| 545 | } | ||
| 546 | |||
| 547 | func (c *CharSet) addRange(chMin, chMax rune) { | ||
| 548 | c.ranges = append(c.ranges, singleRange{first: chMin, last: chMax}) | ||
| 549 | c.canonicalize() | ||
| 550 | } | ||
| 551 | |||
| 552 | func (c *CharSet) addNamedASCII(name string, negate bool) bool { | ||
| 553 | var rs []singleRange | ||
| 554 | |||
| 555 | switch name { | ||
| 556 | case "alnum": | ||
| 557 | rs = []singleRange{singleRange{'0', '9'}, singleRange{'A', 'Z'}, singleRange{'a', 'z'}} | ||
| 558 | case "alpha": | ||
| 559 | rs = []singleRange{singleRange{'A', 'Z'}, singleRange{'a', 'z'}} | ||
| 560 | case "ascii": | ||
| 561 | rs = []singleRange{singleRange{0, 0x7f}} | ||
| 562 | case "blank": | ||
| 563 | rs = []singleRange{singleRange{'\t', '\t'}, singleRange{' ', ' '}} | ||
| 564 | case "cntrl": | ||
| 565 | rs = []singleRange{singleRange{0, 0x1f}, singleRange{0x7f, 0x7f}} | ||
| 566 | case "digit": | ||
| 567 | c.addDigit(false, negate, "") | ||
| 568 | case "graph": | ||
| 569 | rs = []singleRange{singleRange{'!', '~'}} | ||
| 570 | case "lower": | ||
| 571 | rs = []singleRange{singleRange{'a', 'z'}} | ||
| 572 | case "print": | ||
| 573 | rs = []singleRange{singleRange{' ', '~'}} | ||
| 574 | case "punct": //[!-/:-@[-`{-~] | ||
| 575 | rs = []singleRange{singleRange{'!', '/'}, singleRange{':', '@'}, singleRange{'[', '`'}, singleRange{'{', '~'}} | ||
| 576 | case "space": | ||
| 577 | c.addSpace(true, false, negate) | ||
| 578 | case "upper": | ||
| 579 | rs = []singleRange{singleRange{'A', 'Z'}} | ||
| 580 | case "word": | ||
| 581 | c.addWord(true, negate) | ||
| 582 | case "xdigit": | ||
| 583 | rs = []singleRange{singleRange{'0', '9'}, singleRange{'A', 'F'}, singleRange{'a', 'f'}} | ||
| 584 | default: | ||
| 585 | return false | ||
| 586 | } | ||
| 587 | |||
| 588 | if len(rs) > 0 { | ||
| 589 | if negate { | ||
| 590 | c.addNegativeRanges(rs) | ||
| 591 | } else { | ||
| 592 | c.addRanges(rs) | ||
| 593 | } | ||
| 594 | } | ||
| 595 | |||
| 596 | return true | ||
| 597 | } | ||
| 598 | |||
| 599 | type singleRangeSorter []singleRange | ||
| 600 | |||
| 601 | func (p singleRangeSorter) Len() int { return len(p) } | ||
| 602 | func (p singleRangeSorter) Less(i, j int) bool { return p[i].first < p[j].first } | ||
| 603 | func (p singleRangeSorter) Swap(i, j int) { p[i], p[j] = p[j], p[i] } | ||
| 604 | |||
| 605 | // Logic to reduce a character class to a unique, sorted form. | ||
| 606 | func (c *CharSet) canonicalize() { | ||
| 607 | var i, j int | ||
| 608 | var last rune | ||
| 609 | |||
| 610 | // | ||
| 611 | // Find and eliminate overlapping or abutting ranges | ||
| 612 | // | ||
| 613 | |||
| 614 | if len(c.ranges) > 1 { | ||
| 615 | sort.Sort(singleRangeSorter(c.ranges)) | ||
| 616 | |||
| 617 | done := false | ||
| 618 | |||
| 619 | for i, j = 1, 0; ; i++ { | ||
| 620 | for last = c.ranges[j].last; ; i++ { | ||
| 621 | if i == len(c.ranges) || last == utf8.MaxRune { | ||
| 622 | done = true | ||
| 623 | break | ||
| 624 | } | ||
| 625 | |||
| 626 | CurrentRange := c.ranges[i] | ||
| 627 | if CurrentRange.first > last+1 { | ||
| 628 | break | ||
| 629 | } | ||
| 630 | |||
| 631 | if last < CurrentRange.last { | ||
| 632 | last = CurrentRange.last | ||
| 633 | } | ||
| 634 | } | ||
| 635 | |||
| 636 | c.ranges[j] = singleRange{first: c.ranges[j].first, last: last} | ||
| 637 | |||
| 638 | j++ | ||
| 639 | |||
| 640 | if done { | ||
| 641 | break | ||
| 642 | } | ||
| 643 | |||
| 644 | if j < i { | ||
| 645 | c.ranges[j] = c.ranges[i] | ||
| 646 | } | ||
| 647 | } | ||
| 648 | |||
| 649 | c.ranges = append(c.ranges[:j], c.ranges[len(c.ranges):]...) | ||
| 650 | } | ||
| 651 | } | ||
| 652 | |||
| 653 | // Adds to the class any lowercase versions of characters already | ||
| 654 | // in the class. Used for case-insensitivity. | ||
| 655 | func (c *CharSet) addLowercase() { | ||
| 656 | if c.anything { | ||
| 657 | return | ||
| 658 | } | ||
| 659 | toAdd := []singleRange{} | ||
| 660 | for i := 0; i < len(c.ranges); i++ { | ||
| 661 | r := c.ranges[i] | ||
| 662 | if r.first == r.last { | ||
| 663 | lower := unicode.ToLower(r.first) | ||
| 664 | c.ranges[i] = singleRange{first: lower, last: lower} | ||
| 665 | } else { | ||
| 666 | toAdd = append(toAdd, r) | ||
| 667 | } | ||
| 668 | } | ||
| 669 | |||
| 670 | for _, r := range toAdd { | ||
| 671 | c.addLowercaseRange(r.first, r.last) | ||
| 672 | } | ||
| 673 | c.canonicalize() | ||
| 674 | } | ||
| 675 | |||
| 676 | /************************************************************************** | ||
| 677 | Let U be the set of Unicode character values and let L be the lowercase | ||
| 678 | function, mapping from U to U. To perform case insensitive matching of | ||
| 679 | character sets, we need to be able to map an interval I in U, say | ||
| 680 | |||
| 681 | I = [chMin, chMax] = { ch : chMin <= ch <= chMax } | ||
| 682 | |||
| 683 | to a set A such that A contains L(I) and A is contained in the union of | ||
| 684 | I and L(I). | ||
| 685 | |||
| 686 | The table below partitions U into intervals on which L is non-decreasing. | ||
| 687 | Thus, for any interval J = [a, b] contained in one of these intervals, | ||
| 688 | L(J) is contained in [L(a), L(b)]. | ||
| 689 | |||
| 690 | It is also true that for any such J, [L(a), L(b)] is contained in the | ||
| 691 | union of J and L(J). This does not follow from L being non-decreasing on | ||
| 692 | these intervals. It follows from the nature of the L on each interval. | ||
| 693 | On each interval, L has one of the following forms: | ||
| 694 | |||
| 695 | (1) L(ch) = constant (LowercaseSet) | ||
| 696 | (2) L(ch) = ch + offset (LowercaseAdd) | ||
| 697 | (3) L(ch) = ch | 1 (LowercaseBor) | ||
| 698 | (4) L(ch) = ch + (ch & 1) (LowercaseBad) | ||
| 699 | |||
| 700 | It is easy to verify that for any of these forms [L(a), L(b)] is | ||
| 701 | contained in the union of [a, b] and L([a, b]). | ||
| 702 | ***************************************************************************/ | ||
| 703 | |||
| 704 | const ( | ||
| 705 | LowercaseSet = 0 // Set to arg. | ||
| 706 | LowercaseAdd = 1 // Add arg. | ||
| 707 | LowercaseBor = 2 // Bitwise or with 1. | ||
| 708 | LowercaseBad = 3 // Bitwise and with 1 and add original. | ||
| 709 | ) | ||
| 710 | |||
| 711 | type lcMap struct { | ||
| 712 | chMin, chMax rune | ||
| 713 | op, data int32 | ||
| 714 | } | ||
| 715 | |||
| 716 | var lcTable = []lcMap{ | ||
| 717 | lcMap{'\u0041', '\u005A', LowercaseAdd, 32}, | ||
| 718 | lcMap{'\u00C0', '\u00DE', LowercaseAdd, 32}, | ||
| 719 | lcMap{'\u0100', '\u012E', LowercaseBor, 0}, | ||
| 720 | lcMap{'\u0130', '\u0130', LowercaseSet, 0x0069}, | ||
| 721 | lcMap{'\u0132', '\u0136', LowercaseBor, 0}, | ||
| 722 | lcMap{'\u0139', '\u0147', LowercaseBad, 0}, | ||
| 723 | lcMap{'\u014A', '\u0176', LowercaseBor, 0}, | ||
| 724 | lcMap{'\u0178', '\u0178', LowercaseSet, 0x00FF}, | ||
| 725 | lcMap{'\u0179', '\u017D', LowercaseBad, 0}, | ||
| 726 | lcMap{'\u0181', '\u0181', LowercaseSet, 0x0253}, | ||
| 727 | lcMap{'\u0182', '\u0184', LowercaseBor, 0}, | ||
| 728 | lcMap{'\u0186', '\u0186', LowercaseSet, 0x0254}, | ||
| 729 | lcMap{'\u0187', '\u0187', LowercaseSet, 0x0188}, | ||
| 730 | lcMap{'\u0189', '\u018A', LowercaseAdd, 205}, | ||
| 731 | lcMap{'\u018B', '\u018B', LowercaseSet, 0x018C}, | ||
| 732 | lcMap{'\u018E', '\u018E', LowercaseSet, 0x01DD}, | ||
| 733 | lcMap{'\u018F', '\u018F', LowercaseSet, 0x0259}, | ||
| 734 | lcMap{'\u0190', '\u0190', LowercaseSet, 0x025B}, | ||
| 735 | lcMap{'\u0191', '\u0191', LowercaseSet, 0x0192}, | ||
| 736 | lcMap{'\u0193', '\u0193', LowercaseSet, 0x0260}, | ||
| 737 | lcMap{'\u0194', '\u0194', LowercaseSet, 0x0263}, | ||
| 738 | lcMap{'\u0196', '\u0196', LowercaseSet, 0x0269}, | ||
| 739 | lcMap{'\u0197', '\u0197', LowercaseSet, 0x0268}, | ||
| 740 | lcMap{'\u0198', '\u0198', LowercaseSet, 0x0199}, | ||
| 741 | lcMap{'\u019C', '\u019C', LowercaseSet, 0x026F}, | ||
| 742 | lcMap{'\u019D', '\u019D', LowercaseSet, 0x0272}, | ||
| 743 | lcMap{'\u019F', '\u019F', LowercaseSet, 0x0275}, | ||
| 744 | lcMap{'\u01A0', '\u01A4', LowercaseBor, 0}, | ||
| 745 | lcMap{'\u01A7', '\u01A7', LowercaseSet, 0x01A8}, | ||
| 746 | lcMap{'\u01A9', '\u01A9', LowercaseSet, 0x0283}, | ||
| 747 | lcMap{'\u01AC', '\u01AC', LowercaseSet, 0x01AD}, | ||
| 748 | lcMap{'\u01AE', '\u01AE', LowercaseSet, 0x0288}, | ||
| 749 | lcMap{'\u01AF', '\u01AF', LowercaseSet, 0x01B0}, | ||
| 750 | lcMap{'\u01B1', '\u01B2', LowercaseAdd, 217}, | ||
| 751 | lcMap{'\u01B3', '\u01B5', LowercaseBad, 0}, | ||
| 752 | lcMap{'\u01B7', '\u01B7', LowercaseSet, 0x0292}, | ||
| 753 | lcMap{'\u01B8', '\u01B8', LowercaseSet, 0x01B9}, | ||
| 754 | lcMap{'\u01BC', '\u01BC', LowercaseSet, 0x01BD}, | ||
| 755 | lcMap{'\u01C4', '\u01C5', LowercaseSet, 0x01C6}, | ||
| 756 | lcMap{'\u01C7', '\u01C8', LowercaseSet, 0x01C9}, | ||
| 757 | lcMap{'\u01CA', '\u01CB', LowercaseSet, 0x01CC}, | ||
| 758 | lcMap{'\u01CD', '\u01DB', LowercaseBad, 0}, | ||
| 759 | lcMap{'\u01DE', '\u01EE', LowercaseBor, 0}, | ||
| 760 | lcMap{'\u01F1', '\u01F2', LowercaseSet, 0x01F3}, | ||
| 761 | lcMap{'\u01F4', '\u01F4', LowercaseSet, 0x01F5}, | ||
| 762 | lcMap{'\u01FA', '\u0216', LowercaseBor, 0}, | ||
| 763 | lcMap{'\u0386', '\u0386', LowercaseSet, 0x03AC}, | ||
| 764 | lcMap{'\u0388', '\u038A', LowercaseAdd, 37}, | ||
| 765 | lcMap{'\u038C', '\u038C', LowercaseSet, 0x03CC}, | ||
| 766 | lcMap{'\u038E', '\u038F', LowercaseAdd, 63}, | ||
| 767 | lcMap{'\u0391', '\u03AB', LowercaseAdd, 32}, | ||
| 768 | lcMap{'\u03E2', '\u03EE', LowercaseBor, 0}, | ||
| 769 | lcMap{'\u0401', '\u040F', LowercaseAdd, 80}, | ||
| 770 | lcMap{'\u0410', '\u042F', LowercaseAdd, 32}, | ||
| 771 | lcMap{'\u0460', '\u0480', LowercaseBor, 0}, | ||
| 772 | lcMap{'\u0490', '\u04BE', LowercaseBor, 0}, | ||
| 773 | lcMap{'\u04C1', '\u04C3', LowercaseBad, 0}, | ||
| 774 | lcMap{'\u04C7', '\u04C7', LowercaseSet, 0x04C8}, | ||
| 775 | lcMap{'\u04CB', '\u04CB', LowercaseSet, 0x04CC}, | ||
| 776 | lcMap{'\u04D0', '\u04EA', LowercaseBor, 0}, | ||
| 777 | lcMap{'\u04EE', '\u04F4', LowercaseBor, 0}, | ||
| 778 | lcMap{'\u04F8', '\u04F8', LowercaseSet, 0x04F9}, | ||
| 779 | lcMap{'\u0531', '\u0556', LowercaseAdd, 48}, | ||
| 780 | lcMap{'\u10A0', '\u10C5', LowercaseAdd, 48}, | ||
| 781 | lcMap{'\u1E00', '\u1EF8', LowercaseBor, 0}, | ||
| 782 | lcMap{'\u1F08', '\u1F0F', LowercaseAdd, -8}, | ||
| 783 | lcMap{'\u1F18', '\u1F1F', LowercaseAdd, -8}, | ||
| 784 | lcMap{'\u1F28', '\u1F2F', LowercaseAdd, -8}, | ||
| 785 | lcMap{'\u1F38', '\u1F3F', LowercaseAdd, -8}, | ||
| 786 | lcMap{'\u1F48', '\u1F4D', LowercaseAdd, -8}, | ||
| 787 | lcMap{'\u1F59', '\u1F59', LowercaseSet, 0x1F51}, | ||
| 788 | lcMap{'\u1F5B', '\u1F5B', LowercaseSet, 0x1F53}, | ||
| 789 | lcMap{'\u1F5D', '\u1F5D', LowercaseSet, 0x1F55}, | ||
| 790 | lcMap{'\u1F5F', '\u1F5F', LowercaseSet, 0x1F57}, | ||
| 791 | lcMap{'\u1F68', '\u1F6F', LowercaseAdd, -8}, | ||
| 792 | lcMap{'\u1F88', '\u1F8F', LowercaseAdd, -8}, | ||
| 793 | lcMap{'\u1F98', '\u1F9F', LowercaseAdd, -8}, | ||
| 794 | lcMap{'\u1FA8', '\u1FAF', LowercaseAdd, -8}, | ||
| 795 | lcMap{'\u1FB8', '\u1FB9', LowercaseAdd, -8}, | ||
| 796 | lcMap{'\u1FBA', '\u1FBB', LowercaseAdd, -74}, | ||
| 797 | lcMap{'\u1FBC', '\u1FBC', LowercaseSet, 0x1FB3}, | ||
| 798 | lcMap{'\u1FC8', '\u1FCB', LowercaseAdd, -86}, | ||
| 799 | lcMap{'\u1FCC', '\u1FCC', LowercaseSet, 0x1FC3}, | ||
| 800 | lcMap{'\u1FD8', '\u1FD9', LowercaseAdd, -8}, | ||
| 801 | lcMap{'\u1FDA', '\u1FDB', LowercaseAdd, -100}, | ||
| 802 | lcMap{'\u1FE8', '\u1FE9', LowercaseAdd, -8}, | ||
| 803 | lcMap{'\u1FEA', '\u1FEB', LowercaseAdd, -112}, | ||
| 804 | lcMap{'\u1FEC', '\u1FEC', LowercaseSet, 0x1FE5}, | ||
| 805 | lcMap{'\u1FF8', '\u1FF9', LowercaseAdd, -128}, | ||
| 806 | lcMap{'\u1FFA', '\u1FFB', LowercaseAdd, -126}, | ||
| 807 | lcMap{'\u1FFC', '\u1FFC', LowercaseSet, 0x1FF3}, | ||
| 808 | lcMap{'\u2160', '\u216F', LowercaseAdd, 16}, | ||
| 809 | lcMap{'\u24B6', '\u24D0', LowercaseAdd, 26}, | ||
| 810 | lcMap{'\uFF21', '\uFF3A', LowercaseAdd, 32}, | ||
| 811 | } | ||
| 812 | |||
| 813 | func (c *CharSet) addLowercaseRange(chMin, chMax rune) { | ||
| 814 | var i, iMax, iMid int | ||
| 815 | var chMinT, chMaxT rune | ||
| 816 | var lc lcMap | ||
| 817 | |||
| 818 | for i, iMax = 0, len(lcTable); i < iMax; { | ||
| 819 | iMid = (i + iMax) / 2 | ||
| 820 | if lcTable[iMid].chMax < chMin { | ||
| 821 | i = iMid + 1 | ||
| 822 | } else { | ||
| 823 | iMax = iMid | ||
| 824 | } | ||
| 825 | } | ||
| 826 | |||
| 827 | for ; i < len(lcTable); i++ { | ||
| 828 | lc = lcTable[i] | ||
| 829 | if lc.chMin > chMax { | ||
| 830 | return | ||
| 831 | } | ||
| 832 | chMinT = lc.chMin | ||
| 833 | if chMinT < chMin { | ||
| 834 | chMinT = chMin | ||
| 835 | } | ||
| 836 | |||
| 837 | chMaxT = lc.chMax | ||
| 838 | if chMaxT > chMax { | ||
| 839 | chMaxT = chMax | ||
| 840 | } | ||
| 841 | |||
| 842 | switch lc.op { | ||
| 843 | case LowercaseSet: | ||
| 844 | chMinT = rune(lc.data) | ||
| 845 | chMaxT = rune(lc.data) | ||
| 846 | break | ||
| 847 | case LowercaseAdd: | ||
| 848 | chMinT += lc.data | ||
| 849 | chMaxT += lc.data | ||
| 850 | break | ||
| 851 | case LowercaseBor: | ||
| 852 | chMinT |= 1 | ||
| 853 | chMaxT |= 1 | ||
| 854 | break | ||
| 855 | case LowercaseBad: | ||
| 856 | chMinT += (chMinT & 1) | ||
| 857 | chMaxT += (chMaxT & 1) | ||
| 858 | break | ||
| 859 | } | ||
| 860 | |||
| 861 | if chMinT < chMin || chMaxT > chMax { | ||
| 862 | c.addRange(chMinT, chMaxT) | ||
| 863 | } | ||
| 864 | } | ||
| 865 | } | ||
diff --git a/vendor/github.com/dlclark/regexp2/syntax/code.go b/vendor/github.com/dlclark/regexp2/syntax/code.go new file mode 100644 index 0000000..686e822 --- /dev/null +++ b/vendor/github.com/dlclark/regexp2/syntax/code.go | |||
| @@ -0,0 +1,274 @@ | |||
| 1 | package syntax | ||
| 2 | |||
| 3 | import ( | ||
| 4 | "bytes" | ||
| 5 | "fmt" | ||
| 6 | "math" | ||
| 7 | ) | ||
| 8 | |||
| 9 | // similar to prog.go in the go regex package...also with comment 'may not belong in this package' | ||
| 10 | |||
| 11 | // File provides operator constants for use by the Builder and the Machine. | ||
| 12 | |||
| 13 | // Implementation notes: | ||
| 14 | // | ||
| 15 | // Regexps are built into RegexCodes, which contain an operation array, | ||
| 16 | // a string table, and some constants. | ||
| 17 | // | ||
| 18 | // Each operation is one of the codes below, followed by the integer | ||
| 19 | // operands specified for each op. | ||
| 20 | // | ||
| 21 | // Strings and sets are indices into a string table. | ||
| 22 | |||
| 23 | type InstOp int | ||
| 24 | |||
| 25 | const ( | ||
| 26 | // lef/back operands description | ||
| 27 | |||
| 28 | Onerep InstOp = 0 // lef,back char,min,max a {n} | ||
| 29 | Notonerep = 1 // lef,back char,min,max .{n} | ||
| 30 | Setrep = 2 // lef,back set,min,max [\d]{n} | ||
| 31 | |||
| 32 | Oneloop = 3 // lef,back char,min,max a {,n} | ||
| 33 | Notoneloop = 4 // lef,back char,min,max .{,n} | ||
| 34 | Setloop = 5 // lef,back set,min,max [\d]{,n} | ||
| 35 | |||
| 36 | Onelazy = 6 // lef,back char,min,max a {,n}? | ||
| 37 | Notonelazy = 7 // lef,back char,min,max .{,n}? | ||
| 38 | Setlazy = 8 // lef,back set,min,max [\d]{,n}? | ||
| 39 | |||
| 40 | One = 9 // lef char a | ||
| 41 | Notone = 10 // lef char [^a] | ||
| 42 | Set = 11 // lef set [a-z\s] \w \s \d | ||
| 43 | |||
| 44 | Multi = 12 // lef string abcd | ||
| 45 | Ref = 13 // lef group \# | ||
| 46 | |||
| 47 | Bol = 14 // ^ | ||
| 48 | Eol = 15 // $ | ||
| 49 | Boundary = 16 // \b | ||
| 50 | Nonboundary = 17 // \B | ||
| 51 | Beginning = 18 // \A | ||
| 52 | Start = 19 // \G | ||
| 53 | EndZ = 20 // \Z | ||
| 54 | End = 21 // \Z | ||
| 55 | |||
| 56 | Nothing = 22 // Reject! | ||
| 57 | |||
| 58 | // Primitive control structures | ||
| 59 | |||
| 60 | Lazybranch = 23 // back jump straight first | ||
| 61 | Branchmark = 24 // back jump branch first for loop | ||
| 62 | Lazybranchmark = 25 // back jump straight first for loop | ||
| 63 | Nullcount = 26 // back val set counter, null mark | ||
| 64 | Setcount = 27 // back val set counter, make mark | ||
| 65 | Branchcount = 28 // back jump,limit branch++ if zero<=c<limit | ||
| 66 | Lazybranchcount = 29 // back jump,limit same, but straight first | ||
| 67 | Nullmark = 30 // back save position | ||
| 68 | Setmark = 31 // back save position | ||
| 69 | Capturemark = 32 // back group define group | ||
| 70 | Getmark = 33 // back recall position | ||
| 71 | Setjump = 34 // back save backtrack state | ||
| 72 | Backjump = 35 // zap back to saved state | ||
| 73 | Forejump = 36 // zap backtracking state | ||
| 74 | Testref = 37 // backtrack if ref undefined | ||
| 75 | Goto = 38 // jump just go | ||
| 76 | |||
| 77 | Prune = 39 // prune it baby | ||
| 78 | Stop = 40 // done! | ||
| 79 | |||
| 80 | ECMABoundary = 41 // \b | ||
| 81 | NonECMABoundary = 42 // \B | ||
| 82 | |||
| 83 | // Modifiers for alternate modes | ||
| 84 | |||
| 85 | Mask = 63 // Mask to get unmodified ordinary operator | ||
| 86 | Rtl = 64 // bit to indicate that we're reverse scanning. | ||
| 87 | Back = 128 // bit to indicate that we're backtracking. | ||
| 88 | Back2 = 256 // bit to indicate that we're backtracking on a second branch. | ||
| 89 | Ci = 512 // bit to indicate that we're case-insensitive. | ||
| 90 | ) | ||
| 91 | |||
| 92 | type Code struct { | ||
| 93 | Codes []int // the code | ||
| 94 | Strings [][]rune // string table | ||
| 95 | Sets []*CharSet //character set table | ||
| 96 | TrackCount int // how many instructions use backtracking | ||
| 97 | Caps map[int]int // mapping of user group numbers -> impl group slots | ||
| 98 | Capsize int // number of impl group slots | ||
| 99 | FcPrefix *Prefix // the set of candidate first characters (may be null) | ||
| 100 | BmPrefix *BmPrefix // the fixed prefix string as a Boyer-Moore machine (may be null) | ||
| 101 | Anchors AnchorLoc // the set of zero-length start anchors (RegexFCD.Bol, etc) | ||
| 102 | RightToLeft bool // true if right to left | ||
| 103 | } | ||
| 104 | |||
| 105 | func opcodeBacktracks(op InstOp) bool { | ||
| 106 | op &= Mask | ||
| 107 | |||
| 108 | switch op { | ||
| 109 | case Oneloop, Notoneloop, Setloop, Onelazy, Notonelazy, Setlazy, Lazybranch, Branchmark, Lazybranchmark, | ||
| 110 | Nullcount, Setcount, Branchcount, Lazybranchcount, Setmark, Capturemark, Getmark, Setjump, Backjump, | ||
| 111 | Forejump, Goto: | ||
| 112 | return true | ||
| 113 | |||
| 114 | default: | ||
| 115 | return false | ||
| 116 | } | ||
| 117 | } | ||
| 118 | |||
| 119 | func opcodeSize(op InstOp) int { | ||
| 120 | op &= Mask | ||
| 121 | |||
| 122 | switch op { | ||
| 123 | case Nothing, Bol, Eol, Boundary, Nonboundary, ECMABoundary, NonECMABoundary, Beginning, Start, EndZ, | ||
| 124 | End, Nullmark, Setmark, Getmark, Setjump, Backjump, Forejump, Stop: | ||
| 125 | return 1 | ||
| 126 | |||
| 127 | case One, Notone, Multi, Ref, Testref, Goto, Nullcount, Setcount, Lazybranch, Branchmark, Lazybranchmark, | ||
| 128 | Prune, Set: | ||
| 129 | return 2 | ||
| 130 | |||
| 131 | case Capturemark, Branchcount, Lazybranchcount, Onerep, Notonerep, Oneloop, Notoneloop, Onelazy, Notonelazy, | ||
| 132 | Setlazy, Setrep, Setloop: | ||
| 133 | return 3 | ||
| 134 | |||
| 135 | default: | ||
| 136 | panic(fmt.Errorf("Unexpected op code: %v", op)) | ||
| 137 | } | ||
| 138 | } | ||
| 139 | |||
| 140 | var codeStr = []string{ | ||
| 141 | "Onerep", "Notonerep", "Setrep", | ||
| 142 | "Oneloop", "Notoneloop", "Setloop", | ||
| 143 | "Onelazy", "Notonelazy", "Setlazy", | ||
| 144 | "One", "Notone", "Set", | ||
| 145 | "Multi", "Ref", | ||
| 146 | "Bol", "Eol", "Boundary", "Nonboundary", "Beginning", "Start", "EndZ", "End", | ||
| 147 | "Nothing", | ||
| 148 | "Lazybranch", "Branchmark", "Lazybranchmark", | ||
| 149 | "Nullcount", "Setcount", "Branchcount", "Lazybranchcount", | ||
| 150 | "Nullmark", "Setmark", "Capturemark", "Getmark", | ||
| 151 | "Setjump", "Backjump", "Forejump", "Testref", "Goto", | ||
| 152 | "Prune", "Stop", | ||
| 153 | "ECMABoundary", "NonECMABoundary", | ||
| 154 | } | ||
| 155 | |||
| 156 | func operatorDescription(op InstOp) string { | ||
| 157 | desc := codeStr[op&Mask] | ||
| 158 | if (op & Ci) != 0 { | ||
| 159 | desc += "-Ci" | ||
| 160 | } | ||
| 161 | if (op & Rtl) != 0 { | ||
| 162 | desc += "-Rtl" | ||
| 163 | } | ||
| 164 | if (op & Back) != 0 { | ||
| 165 | desc += "-Back" | ||
| 166 | } | ||
| 167 | if (op & Back2) != 0 { | ||
| 168 | desc += "-Back2" | ||
| 169 | } | ||
| 170 | |||
| 171 | return desc | ||
| 172 | } | ||
| 173 | |||
| 174 | // OpcodeDescription is a humman readable string of the specific offset | ||
| 175 | func (c *Code) OpcodeDescription(offset int) string { | ||
| 176 | buf := &bytes.Buffer{} | ||
| 177 | |||
| 178 | op := InstOp(c.Codes[offset]) | ||
| 179 | fmt.Fprintf(buf, "%06d ", offset) | ||
| 180 | |||
| 181 | if opcodeBacktracks(op & Mask) { | ||
| 182 | buf.WriteString("*") | ||
| 183 | } else { | ||
| 184 | buf.WriteString(" ") | ||
| 185 | } | ||
| 186 | buf.WriteString(operatorDescription(op)) | ||
| 187 | buf.WriteString("(") | ||
| 188 | op &= Mask | ||
| 189 | |||
| 190 | switch op { | ||
| 191 | case One, Notone, Onerep, Notonerep, Oneloop, Notoneloop, Onelazy, Notonelazy: | ||
| 192 | buf.WriteString("Ch = ") | ||
| 193 | buf.WriteString(CharDescription(rune(c.Codes[offset+1]))) | ||
| 194 | |||
| 195 | case Set, Setrep, Setloop, Setlazy: | ||
| 196 | buf.WriteString("Set = ") | ||
| 197 | buf.WriteString(c.Sets[c.Codes[offset+1]].String()) | ||
| 198 | |||
| 199 | case Multi: | ||
| 200 | fmt.Fprintf(buf, "String = %s", string(c.Strings[c.Codes[offset+1]])) | ||
| 201 | |||
| 202 | case Ref, Testref: | ||
| 203 | fmt.Fprintf(buf, "Index = %d", c.Codes[offset+1]) | ||
| 204 | |||
| 205 | case Capturemark: | ||
| 206 | fmt.Fprintf(buf, "Index = %d", c.Codes[offset+1]) | ||
| 207 | if c.Codes[offset+2] != -1 { | ||
| 208 | fmt.Fprintf(buf, ", Unindex = %d", c.Codes[offset+2]) | ||
| 209 | } | ||
| 210 | |||
| 211 | case Nullcount, Setcount: | ||
| 212 | fmt.Fprintf(buf, "Value = %d", c.Codes[offset+1]) | ||
| 213 | |||
| 214 | case Goto, Lazybranch, Branchmark, Lazybranchmark, Branchcount, Lazybranchcount: | ||
| 215 | fmt.Fprintf(buf, "Addr = %d", c.Codes[offset+1]) | ||
| 216 | } | ||
| 217 | |||
| 218 | switch op { | ||
| 219 | case Onerep, Notonerep, Oneloop, Notoneloop, Onelazy, Notonelazy, Setrep, Setloop, Setlazy: | ||
| 220 | buf.WriteString(", Rep = ") | ||
| 221 | if c.Codes[offset+2] == math.MaxInt32 { | ||
| 222 | buf.WriteString("inf") | ||
| 223 | } else { | ||
| 224 | fmt.Fprintf(buf, "%d", c.Codes[offset+2]) | ||
| 225 | } | ||
| 226 | |||
| 227 | case Branchcount, Lazybranchcount: | ||
| 228 | buf.WriteString(", Limit = ") | ||
| 229 | if c.Codes[offset+2] == math.MaxInt32 { | ||
| 230 | buf.WriteString("inf") | ||
| 231 | } else { | ||
| 232 | fmt.Fprintf(buf, "%d", c.Codes[offset+2]) | ||
| 233 | } | ||
| 234 | |||
| 235 | } | ||
| 236 | |||
| 237 | buf.WriteString(")") | ||
| 238 | |||
| 239 | return buf.String() | ||
| 240 | } | ||
| 241 | |||
| 242 | func (c *Code) Dump() string { | ||
| 243 | buf := &bytes.Buffer{} | ||
| 244 | |||
| 245 | if c.RightToLeft { | ||
| 246 | fmt.Fprintln(buf, "Direction: right-to-left") | ||
| 247 | } else { | ||
| 248 | fmt.Fprintln(buf, "Direction: left-to-right") | ||
| 249 | } | ||
| 250 | if c.FcPrefix == nil { | ||
| 251 | fmt.Fprintln(buf, "Firstchars: n/a") | ||
| 252 | } else { | ||
| 253 | fmt.Fprintf(buf, "Firstchars: %v\n", c.FcPrefix.PrefixSet.String()) | ||
| 254 | } | ||
| 255 | |||
| 256 | if c.BmPrefix == nil { | ||
| 257 | fmt.Fprintln(buf, "Prefix: n/a") | ||
| 258 | } else { | ||
| 259 | fmt.Fprintf(buf, "Prefix: %v\n", Escape(c.BmPrefix.String())) | ||
| 260 | } | ||
| 261 | |||
| 262 | fmt.Fprintf(buf, "Anchors: %v\n", c.Anchors) | ||
| 263 | fmt.Fprintln(buf) | ||
| 264 | |||
| 265 | if c.BmPrefix != nil { | ||
| 266 | fmt.Fprintln(buf, "BoyerMoore:") | ||
| 267 | fmt.Fprintln(buf, c.BmPrefix.Dump(" ")) | ||
| 268 | } | ||
| 269 | for i := 0; i < len(c.Codes); i += opcodeSize(InstOp(c.Codes[i])) { | ||
| 270 | fmt.Fprintln(buf, c.OpcodeDescription(i)) | ||
| 271 | } | ||
| 272 | |||
| 273 | return buf.String() | ||
| 274 | } | ||
diff --git a/vendor/github.com/dlclark/regexp2/syntax/escape.go b/vendor/github.com/dlclark/regexp2/syntax/escape.go new file mode 100644 index 0000000..609df10 --- /dev/null +++ b/vendor/github.com/dlclark/regexp2/syntax/escape.go | |||
| @@ -0,0 +1,94 @@ | |||
| 1 | package syntax | ||
| 2 | |||
| 3 | import ( | ||
| 4 | "bytes" | ||
| 5 | "strconv" | ||
| 6 | "strings" | ||
| 7 | "unicode" | ||
| 8 | ) | ||
| 9 | |||
| 10 | func Escape(input string) string { | ||
| 11 | b := &bytes.Buffer{} | ||
| 12 | for _, r := range input { | ||
| 13 | escape(b, r, false) | ||
| 14 | } | ||
| 15 | return b.String() | ||
| 16 | } | ||
| 17 | |||
| 18 | const meta = `\.+*?()|[]{}^$# ` | ||
| 19 | |||
| 20 | func escape(b *bytes.Buffer, r rune, force bool) { | ||
| 21 | if unicode.IsPrint(r) { | ||
| 22 | if strings.IndexRune(meta, r) >= 0 || force { | ||
| 23 | b.WriteRune('\\') | ||
| 24 | } | ||
| 25 | b.WriteRune(r) | ||
| 26 | return | ||
| 27 | } | ||
| 28 | |||
| 29 | switch r { | ||
| 30 | case '\a': | ||
| 31 | b.WriteString(`\a`) | ||
| 32 | case '\f': | ||
| 33 | b.WriteString(`\f`) | ||
| 34 | case '\n': | ||
| 35 | b.WriteString(`\n`) | ||
| 36 | case '\r': | ||
| 37 | b.WriteString(`\r`) | ||
| 38 | case '\t': | ||
| 39 | b.WriteString(`\t`) | ||
| 40 | case '\v': | ||
| 41 | b.WriteString(`\v`) | ||
| 42 | default: | ||
| 43 | if r < 0x100 { | ||
| 44 | b.WriteString(`\x`) | ||
| 45 | s := strconv.FormatInt(int64(r), 16) | ||
| 46 | if len(s) == 1 { | ||
| 47 | b.WriteRune('0') | ||
| 48 | } | ||
| 49 | b.WriteString(s) | ||
| 50 | break | ||
| 51 | } | ||
| 52 | b.WriteString(`\u`) | ||
| 53 | b.WriteString(strconv.FormatInt(int64(r), 16)) | ||
| 54 | } | ||
| 55 | } | ||
| 56 | |||
| 57 | func Unescape(input string) (string, error) { | ||
| 58 | idx := strings.IndexRune(input, '\\') | ||
| 59 | // no slashes means no unescape needed | ||
| 60 | if idx == -1 { | ||
| 61 | return input, nil | ||
| 62 | } | ||
| 63 | |||
| 64 | buf := bytes.NewBufferString(input[:idx]) | ||
| 65 | // get the runes for the rest of the string -- we're going full parser scan on this | ||
| 66 | |||
| 67 | p := parser{} | ||
| 68 | p.setPattern(input[idx+1:]) | ||
| 69 | for { | ||
| 70 | if p.rightMost() { | ||
| 71 | return "", p.getErr(ErrIllegalEndEscape) | ||
| 72 | } | ||
| 73 | r, err := p.scanCharEscape() | ||
| 74 | if err != nil { | ||
| 75 | return "", err | ||
| 76 | } | ||
| 77 | buf.WriteRune(r) | ||
| 78 | // are we done? | ||
| 79 | if p.rightMost() { | ||
| 80 | return buf.String(), nil | ||
| 81 | } | ||
| 82 | |||
| 83 | r = p.moveRightGetChar() | ||
| 84 | for r != '\\' { | ||
| 85 | buf.WriteRune(r) | ||
| 86 | if p.rightMost() { | ||
| 87 | // we're done, no more slashes | ||
| 88 | return buf.String(), nil | ||
| 89 | } | ||
| 90 | // keep scanning until we get another slash | ||
| 91 | r = p.moveRightGetChar() | ||
| 92 | } | ||
| 93 | } | ||
| 94 | } | ||
diff --git a/vendor/github.com/dlclark/regexp2/syntax/fuzz.go b/vendor/github.com/dlclark/regexp2/syntax/fuzz.go new file mode 100644 index 0000000..ee86386 --- /dev/null +++ b/vendor/github.com/dlclark/regexp2/syntax/fuzz.go | |||
| @@ -0,0 +1,20 @@ | |||
| 1 | // +build gofuzz | ||
| 2 | |||
| 3 | package syntax | ||
| 4 | |||
| 5 | // Fuzz is the input point for go-fuzz | ||
| 6 | func Fuzz(data []byte) int { | ||
| 7 | sdata := string(data) | ||
| 8 | tree, err := Parse(sdata, RegexOptions(0)) | ||
| 9 | if err != nil { | ||
| 10 | return 0 | ||
| 11 | } | ||
| 12 | |||
| 13 | // translate it to code | ||
| 14 | _, err = Write(tree) | ||
| 15 | if err != nil { | ||
| 16 | panic(err) | ||
| 17 | } | ||
| 18 | |||
| 19 | return 1 | ||
| 20 | } | ||
diff --git a/vendor/github.com/dlclark/regexp2/syntax/parser.go b/vendor/github.com/dlclark/regexp2/syntax/parser.go new file mode 100644 index 0000000..9dc6e31 --- /dev/null +++ b/vendor/github.com/dlclark/regexp2/syntax/parser.go | |||
| @@ -0,0 +1,2251 @@ | |||
| 1 | package syntax | ||
| 2 | |||
| 3 | import ( | ||
| 4 | "fmt" | ||
| 5 | "math" | ||
| 6 | "os" | ||
| 7 | "sort" | ||
| 8 | "strconv" | ||
| 9 | "unicode" | ||
| 10 | ) | ||
| 11 | |||
| 12 | type RegexOptions int32 | ||
| 13 | |||
| 14 | const ( | ||
| 15 | IgnoreCase RegexOptions = 0x0001 // "i" | ||
| 16 | Multiline = 0x0002 // "m" | ||
| 17 | ExplicitCapture = 0x0004 // "n" | ||
| 18 | Compiled = 0x0008 // "c" | ||
| 19 | Singleline = 0x0010 // "s" | ||
| 20 | IgnorePatternWhitespace = 0x0020 // "x" | ||
| 21 | RightToLeft = 0x0040 // "r" | ||
| 22 | Debug = 0x0080 // "d" | ||
| 23 | ECMAScript = 0x0100 // "e" | ||
| 24 | RE2 = 0x0200 // RE2 compat mode | ||
| 25 | Unicode = 0x0400 // "u" | ||
| 26 | ) | ||
| 27 | |||
| 28 | func optionFromCode(ch rune) RegexOptions { | ||
| 29 | // case-insensitive | ||
| 30 | switch ch { | ||
| 31 | case 'i', 'I': | ||
| 32 | return IgnoreCase | ||
| 33 | case 'r', 'R': | ||
| 34 | return RightToLeft | ||
| 35 | case 'm', 'M': | ||
| 36 | return Multiline | ||
| 37 | case 'n', 'N': | ||
| 38 | return ExplicitCapture | ||
| 39 | case 's', 'S': | ||
| 40 | return Singleline | ||
| 41 | case 'x', 'X': | ||
| 42 | return IgnorePatternWhitespace | ||
| 43 | case 'd', 'D': | ||
| 44 | return Debug | ||
| 45 | case 'e', 'E': | ||
| 46 | return ECMAScript | ||
| 47 | case 'u', 'U': | ||
| 48 | return Unicode | ||
| 49 | default: | ||
| 50 | return 0 | ||
| 51 | } | ||
| 52 | } | ||
| 53 | |||
| 54 | // An Error describes a failure to parse a regular expression | ||
| 55 | // and gives the offending expression. | ||
| 56 | type Error struct { | ||
| 57 | Code ErrorCode | ||
| 58 | Expr string | ||
| 59 | Args []interface{} | ||
| 60 | } | ||
| 61 | |||
| 62 | func (e *Error) Error() string { | ||
| 63 | if len(e.Args) == 0 { | ||
| 64 | return "error parsing regexp: " + e.Code.String() + " in `" + e.Expr + "`" | ||
| 65 | } | ||
| 66 | return "error parsing regexp: " + fmt.Sprintf(e.Code.String(), e.Args...) + " in `" + e.Expr + "`" | ||
| 67 | } | ||
| 68 | |||
| 69 | // An ErrorCode describes a failure to parse a regular expression. | ||
| 70 | type ErrorCode string | ||
| 71 | |||
| 72 | const ( | ||
| 73 | // internal issue | ||
| 74 | ErrInternalError ErrorCode = "regexp/syntax: internal error" | ||
| 75 | // Parser errors | ||
| 76 | ErrUnterminatedComment = "unterminated comment" | ||
| 77 | ErrInvalidCharRange = "invalid character class range" | ||
| 78 | ErrInvalidRepeatSize = "invalid repeat count" | ||
| 79 | ErrInvalidUTF8 = "invalid UTF-8" | ||
| 80 | ErrCaptureGroupOutOfRange = "capture group number out of range" | ||
| 81 | ErrUnexpectedParen = "unexpected )" | ||
| 82 | ErrMissingParen = "missing closing )" | ||
| 83 | ErrMissingBrace = "missing closing }" | ||
| 84 | ErrInvalidRepeatOp = "invalid nested repetition operator" | ||
| 85 | ErrMissingRepeatArgument = "missing argument to repetition operator" | ||
| 86 | ErrConditionalExpression = "illegal conditional (?(...)) expression" | ||
| 87 | ErrTooManyAlternates = "too many | in (?()|)" | ||
| 88 | ErrUnrecognizedGrouping = "unrecognized grouping construct: (%v" | ||
| 89 | ErrInvalidGroupName = "invalid group name: group names must begin with a word character and have a matching terminator" | ||
| 90 | ErrCapNumNotZero = "capture number cannot be zero" | ||
| 91 | ErrUndefinedBackRef = "reference to undefined group number %v" | ||
| 92 | ErrUndefinedNameRef = "reference to undefined group name %v" | ||
| 93 | ErrAlternationCantCapture = "alternation conditions do not capture and cannot be named" | ||
| 94 | ErrAlternationCantHaveComment = "alternation conditions cannot be comments" | ||
| 95 | ErrMalformedReference = "(?(%v) ) malformed" | ||
| 96 | ErrUndefinedReference = "(?(%v) ) reference to undefined group" | ||
| 97 | ErrIllegalEndEscape = "illegal \\ at end of pattern" | ||
| 98 | ErrMalformedSlashP = "malformed \\p{X} character escape" | ||
| 99 | ErrIncompleteSlashP = "incomplete \\p{X} character escape" | ||
| 100 | ErrUnknownSlashP = "unknown unicode category, script, or property '%v'" | ||
| 101 | ErrUnrecognizedEscape = "unrecognized escape sequence \\%v" | ||
| 102 | ErrMissingControl = "missing control character" | ||
| 103 | ErrUnrecognizedControl = "unrecognized control character" | ||
| 104 | ErrTooFewHex = "insufficient hexadecimal digits" | ||
| 105 | ErrInvalidHex = "hex values may not be larger than 0x10FFFF" | ||
| 106 | ErrMalformedNameRef = "malformed \\k<...> named back reference" | ||
| 107 | ErrBadClassInCharRange = "cannot include class \\%v in character range" | ||
| 108 | ErrUnterminatedBracket = "unterminated [] set" | ||
| 109 | ErrSubtractionMustBeLast = "a subtraction must be the last element in a character class" | ||
| 110 | ErrReversedCharRange = "[%c-%c] range in reverse order" | ||
| 111 | ) | ||
| 112 | |||
| 113 | func (e ErrorCode) String() string { | ||
| 114 | return string(e) | ||
| 115 | } | ||
| 116 | |||
| 117 | type parser struct { | ||
| 118 | stack *regexNode | ||
| 119 | group *regexNode | ||
| 120 | alternation *regexNode | ||
| 121 | concatenation *regexNode | ||
| 122 | unit *regexNode | ||
| 123 | |||
| 124 | patternRaw string | ||
| 125 | pattern []rune | ||
| 126 | |||
| 127 | currentPos int | ||
| 128 | specialCase *unicode.SpecialCase | ||
| 129 | |||
| 130 | autocap int | ||
| 131 | capcount int | ||
| 132 | captop int | ||
| 133 | capsize int | ||
| 134 | |||
| 135 | caps map[int]int | ||
| 136 | capnames map[string]int | ||
| 137 | |||
| 138 | capnumlist []int | ||
| 139 | capnamelist []string | ||
| 140 | |||
| 141 | options RegexOptions | ||
| 142 | optionsStack []RegexOptions | ||
| 143 | ignoreNextParen bool | ||
| 144 | } | ||
| 145 | |||
| 146 | const ( | ||
| 147 | maxValueDiv10 int = math.MaxInt32 / 10 | ||
| 148 | maxValueMod10 = math.MaxInt32 % 10 | ||
| 149 | ) | ||
| 150 | |||
| 151 | // Parse converts a regex string into a parse tree | ||
| 152 | func Parse(re string, op RegexOptions) (*RegexTree, error) { | ||
| 153 | p := parser{ | ||
| 154 | options: op, | ||
| 155 | caps: make(map[int]int), | ||
| 156 | } | ||
| 157 | p.setPattern(re) | ||
| 158 | |||
| 159 | if err := p.countCaptures(); err != nil { | ||
| 160 | return nil, err | ||
| 161 | } | ||
| 162 | |||
| 163 | p.reset(op) | ||
| 164 | root, err := p.scanRegex() | ||
| 165 | |||
| 166 | if err != nil { | ||
| 167 | return nil, err | ||
| 168 | } | ||
| 169 | tree := &RegexTree{ | ||
| 170 | root: root, | ||
| 171 | caps: p.caps, | ||
| 172 | capnumlist: p.capnumlist, | ||
| 173 | captop: p.captop, | ||
| 174 | Capnames: p.capnames, | ||
| 175 | Caplist: p.capnamelist, | ||
| 176 | options: op, | ||
| 177 | } | ||
| 178 | |||
| 179 | if tree.options&Debug > 0 { | ||
| 180 | os.Stdout.WriteString(tree.Dump()) | ||
| 181 | } | ||
| 182 | |||
| 183 | return tree, nil | ||
| 184 | } | ||
| 185 | |||
| 186 | func (p *parser) setPattern(pattern string) { | ||
| 187 | p.patternRaw = pattern | ||
| 188 | p.pattern = make([]rune, 0, len(pattern)) | ||
| 189 | |||
| 190 | //populate our rune array to handle utf8 encoding | ||
| 191 | for _, r := range pattern { | ||
| 192 | p.pattern = append(p.pattern, r) | ||
| 193 | } | ||
| 194 | } | ||
| 195 | func (p *parser) getErr(code ErrorCode, args ...interface{}) error { | ||
| 196 | return &Error{Code: code, Expr: p.patternRaw, Args: args} | ||
| 197 | } | ||
| 198 | |||
| 199 | func (p *parser) noteCaptureSlot(i, pos int) { | ||
| 200 | if _, ok := p.caps[i]; !ok { | ||
| 201 | // the rhs of the hashtable isn't used in the parser | ||
| 202 | p.caps[i] = pos | ||
| 203 | p.capcount++ | ||
| 204 | |||
| 205 | if p.captop <= i { | ||
| 206 | if i == math.MaxInt32 { | ||
| 207 | p.captop = i | ||
| 208 | } else { | ||
| 209 | p.captop = i + 1 | ||
| 210 | } | ||
| 211 | } | ||
| 212 | } | ||
| 213 | } | ||
| 214 | |||
| 215 | func (p *parser) noteCaptureName(name string, pos int) { | ||
| 216 | if p.capnames == nil { | ||
| 217 | p.capnames = make(map[string]int) | ||
| 218 | } | ||
| 219 | |||
| 220 | if _, ok := p.capnames[name]; !ok { | ||
| 221 | p.capnames[name] = pos | ||
| 222 | p.capnamelist = append(p.capnamelist, name) | ||
| 223 | } | ||
| 224 | } | ||
| 225 | |||
| 226 | func (p *parser) assignNameSlots() { | ||
| 227 | if p.capnames != nil { | ||
| 228 | for _, name := range p.capnamelist { | ||
| 229 | for p.isCaptureSlot(p.autocap) { | ||
| 230 | p.autocap++ | ||
| 231 | } | ||
| 232 | pos := p.capnames[name] | ||
| 233 | p.capnames[name] = p.autocap | ||
| 234 | p.noteCaptureSlot(p.autocap, pos) | ||
| 235 | |||
| 236 | p.autocap++ | ||
| 237 | } | ||
| 238 | } | ||
| 239 | |||
| 240 | // if the caps array has at least one gap, construct the list of used slots | ||
| 241 | if p.capcount < p.captop { | ||
| 242 | p.capnumlist = make([]int, p.capcount) | ||
| 243 | i := 0 | ||
| 244 | |||
| 245 | for k := range p.caps { | ||
| 246 | p.capnumlist[i] = k | ||
| 247 | i++ | ||
| 248 | } | ||
| 249 | |||
| 250 | sort.Ints(p.capnumlist) | ||
| 251 | } | ||
| 252 | |||
| 253 | // merge capsnumlist into capnamelist | ||
| 254 | if p.capnames != nil || p.capnumlist != nil { | ||
| 255 | var oldcapnamelist []string | ||
| 256 | var next int | ||
| 257 | var k int | ||
| 258 | |||
| 259 | if p.capnames == nil { | ||
| 260 | oldcapnamelist = nil | ||
| 261 | p.capnames = make(map[string]int) | ||
| 262 | p.capnamelist = []string{} | ||
| 263 | next = -1 | ||
| 264 | } else { | ||
| 265 | oldcapnamelist = p.capnamelist | ||
| 266 | p.capnamelist = []string{} | ||
| 267 | next = p.capnames[oldcapnamelist[0]] | ||
| 268 | } | ||
| 269 | |||
| 270 | for i := 0; i < p.capcount; i++ { | ||
| 271 | j := i | ||
| 272 | if p.capnumlist != nil { | ||
| 273 | j = p.capnumlist[i] | ||
| 274 | } | ||
| 275 | |||
| 276 | if next == j { | ||
| 277 | p.capnamelist = append(p.capnamelist, oldcapnamelist[k]) | ||
| 278 | k++ | ||
| 279 | |||
| 280 | if k == len(oldcapnamelist) { | ||
| 281 | next = -1 | ||
| 282 | } else { | ||
| 283 | next = p.capnames[oldcapnamelist[k]] | ||
| 284 | } | ||
| 285 | |||
| 286 | } else { | ||
| 287 | //feature: culture? | ||
| 288 | str := strconv.Itoa(j) | ||
| 289 | p.capnamelist = append(p.capnamelist, str) | ||
| 290 | p.capnames[str] = j | ||
| 291 | } | ||
| 292 | } | ||
| 293 | } | ||
| 294 | } | ||
| 295 | |||
| 296 | func (p *parser) consumeAutocap() int { | ||
| 297 | r := p.autocap | ||
| 298 | p.autocap++ | ||
| 299 | return r | ||
| 300 | } | ||
| 301 | |||
| 302 | // CountCaptures is a prescanner for deducing the slots used for | ||
| 303 | // captures by doing a partial tokenization of the pattern. | ||
| 304 | func (p *parser) countCaptures() error { | ||
| 305 | var ch rune | ||
| 306 | |||
| 307 | p.noteCaptureSlot(0, 0) | ||
| 308 | |||
| 309 | p.autocap = 1 | ||
| 310 | |||
| 311 | for p.charsRight() > 0 { | ||
| 312 | pos := p.textpos() | ||
| 313 | ch = p.moveRightGetChar() | ||
| 314 | switch ch { | ||
| 315 | case '\\': | ||
| 316 | if p.charsRight() > 0 { | ||
| 317 | p.scanBackslash(true) | ||
| 318 | } | ||
| 319 | |||
| 320 | case '#': | ||
| 321 | if p.useOptionX() { | ||
| 322 | p.moveLeft() | ||
| 323 | p.scanBlank() | ||
| 324 | } | ||
| 325 | |||
| 326 | case '[': | ||
| 327 | p.scanCharSet(false, true) | ||
| 328 | |||
| 329 | case ')': | ||
| 330 | if !p.emptyOptionsStack() { | ||
| 331 | p.popOptions() | ||
| 332 | } | ||
| 333 | |||
| 334 | case '(': | ||
| 335 | if p.charsRight() >= 2 && p.rightChar(1) == '#' && p.rightChar(0) == '?' { | ||
| 336 | p.moveLeft() | ||
| 337 | p.scanBlank() | ||
| 338 | } else { | ||
| 339 | p.pushOptions() | ||
| 340 | if p.charsRight() > 0 && p.rightChar(0) == '?' { | ||
| 341 | // we have (?... | ||
| 342 | p.moveRight(1) | ||
| 343 | |||
| 344 | if p.charsRight() > 1 && (p.rightChar(0) == '<' || p.rightChar(0) == '\'') { | ||
| 345 | // named group: (?<... or (?'... | ||
| 346 | |||
| 347 | p.moveRight(1) | ||
| 348 | ch = p.rightChar(0) | ||
| 349 | |||
| 350 | if ch != '0' && IsWordChar(ch) { | ||
| 351 | if ch >= '1' && ch <= '9' { | ||
| 352 | dec, err := p.scanDecimal() | ||
| 353 | if err != nil { | ||
| 354 | return err | ||
| 355 | } | ||
| 356 | p.noteCaptureSlot(dec, pos) | ||
| 357 | } else { | ||
| 358 | p.noteCaptureName(p.scanCapname(), pos) | ||
| 359 | } | ||
| 360 | } | ||
| 361 | } else if p.useRE2() && p.charsRight() > 2 && (p.rightChar(0) == 'P' && p.rightChar(1) == '<') { | ||
| 362 | // RE2-compat (?P<) | ||
| 363 | p.moveRight(2) | ||
| 364 | ch = p.rightChar(0) | ||
| 365 | if IsWordChar(ch) { | ||
| 366 | p.noteCaptureName(p.scanCapname(), pos) | ||
| 367 | } | ||
| 368 | |||
| 369 | } else { | ||
| 370 | // (?... | ||
| 371 | |||
| 372 | // get the options if it's an option construct (?cimsx-cimsx...) | ||
| 373 | p.scanOptions() | ||
| 374 | |||
| 375 | if p.charsRight() > 0 { | ||
| 376 | if p.rightChar(0) == ')' { | ||
| 377 | // (?cimsx-cimsx) | ||
| 378 | p.moveRight(1) | ||
| 379 | p.popKeepOptions() | ||
| 380 | } else if p.rightChar(0) == '(' { | ||
| 381 | // alternation construct: (?(foo)yes|no) | ||
| 382 | // ignore the next paren so we don't capture the condition | ||
| 383 | p.ignoreNextParen = true | ||
| 384 | |||
| 385 | // break from here so we don't reset ignoreNextParen | ||
| 386 | continue | ||
| 387 | } | ||
| 388 | } | ||
| 389 | } | ||
| 390 | } else { | ||
| 391 | if !p.useOptionN() && !p.ignoreNextParen { | ||
| 392 | p.noteCaptureSlot(p.consumeAutocap(), pos) | ||
| 393 | } | ||
| 394 | } | ||
| 395 | } | ||
| 396 | |||
| 397 | p.ignoreNextParen = false | ||
| 398 | |||
| 399 | } | ||
| 400 | } | ||
| 401 | |||
| 402 | p.assignNameSlots() | ||
| 403 | return nil | ||
| 404 | } | ||
| 405 | |||
| 406 | func (p *parser) reset(topopts RegexOptions) { | ||
| 407 | p.currentPos = 0 | ||
| 408 | p.autocap = 1 | ||
| 409 | p.ignoreNextParen = false | ||
| 410 | |||
| 411 | if len(p.optionsStack) > 0 { | ||
| 412 | p.optionsStack = p.optionsStack[:0] | ||
| 413 | } | ||
| 414 | |||
| 415 | p.options = topopts | ||
| 416 | p.stack = nil | ||
| 417 | } | ||
| 418 | |||
| 419 | func (p *parser) scanRegex() (*regexNode, error) { | ||
| 420 | ch := '@' // nonspecial ch, means at beginning | ||
| 421 | isQuant := false | ||
| 422 | |||
| 423 | p.startGroup(newRegexNodeMN(ntCapture, p.options, 0, -1)) | ||
| 424 | |||
| 425 | for p.charsRight() > 0 { | ||
| 426 | wasPrevQuantifier := isQuant | ||
| 427 | isQuant = false | ||
| 428 | |||
| 429 | if err := p.scanBlank(); err != nil { | ||
| 430 | return nil, err | ||
| 431 | } | ||
| 432 | |||
| 433 | startpos := p.textpos() | ||
| 434 | |||
| 435 | // move past all of the normal characters. We'll stop when we hit some kind of control character, | ||
| 436 | // or if IgnorePatternWhiteSpace is on, we'll stop when we see some whitespace. | ||
| 437 | if p.useOptionX() { | ||
| 438 | for p.charsRight() > 0 { | ||
| 439 | ch = p.rightChar(0) | ||
| 440 | //UGLY: clean up, this is ugly | ||
| 441 | if !(!isStopperX(ch) || (ch == '{' && !p.isTrueQuantifier())) { | ||
| 442 | break | ||
| 443 | } | ||
| 444 | p.moveRight(1) | ||
| 445 | } | ||
| 446 | } else { | ||
| 447 | for p.charsRight() > 0 { | ||
| 448 | ch = p.rightChar(0) | ||
| 449 | if !(!isSpecial(ch) || ch == '{' && !p.isTrueQuantifier()) { | ||
| 450 | break | ||
| 451 | } | ||
| 452 | p.moveRight(1) | ||
| 453 | } | ||
| 454 | } | ||
| 455 | |||
| 456 | endpos := p.textpos() | ||
| 457 | |||
| 458 | p.scanBlank() | ||
| 459 | |||
| 460 | if p.charsRight() == 0 { | ||
| 461 | ch = '!' // nonspecial, means at end | ||
| 462 | } else if ch = p.rightChar(0); isSpecial(ch) { | ||
| 463 | isQuant = isQuantifier(ch) | ||
| 464 | p.moveRight(1) | ||
| 465 | } else { | ||
| 466 | ch = ' ' // nonspecial, means at ordinary char | ||
| 467 | } | ||
| 468 | |||
| 469 | if startpos < endpos { | ||
| 470 | cchUnquantified := endpos - startpos | ||
| 471 | if isQuant { | ||
| 472 | cchUnquantified-- | ||
| 473 | } | ||
| 474 | wasPrevQuantifier = false | ||
| 475 | |||
| 476 | if cchUnquantified > 0 { | ||
| 477 | p.addToConcatenate(startpos, cchUnquantified, false) | ||
| 478 | } | ||
| 479 | |||
| 480 | if isQuant { | ||
| 481 | p.addUnitOne(p.charAt(endpos - 1)) | ||
| 482 | } | ||
| 483 | } | ||
| 484 | |||
| 485 | switch ch { | ||
| 486 | case '!': | ||
| 487 | goto BreakOuterScan | ||
| 488 | |||
| 489 | case ' ': | ||
| 490 | goto ContinueOuterScan | ||
| 491 | |||
| 492 | case '[': | ||
| 493 | cc, err := p.scanCharSet(p.useOptionI(), false) | ||
| 494 | if err != nil { | ||
| 495 | return nil, err | ||
| 496 | } | ||
| 497 | p.addUnitSet(cc) | ||
| 498 | |||
| 499 | case '(': | ||
| 500 | p.pushOptions() | ||
| 501 | |||
| 502 | if grouper, err := p.scanGroupOpen(); err != nil { | ||
| 503 | return nil, err | ||
| 504 | } else if grouper == nil { | ||
| 505 | p.popKeepOptions() | ||
| 506 | } else { | ||
| 507 | p.pushGroup() | ||
| 508 | p.startGroup(grouper) | ||
| 509 | } | ||
| 510 | |||
| 511 | continue | ||
| 512 | |||
| 513 | case '|': | ||
| 514 | p.addAlternate() | ||
| 515 | goto ContinueOuterScan | ||
| 516 | |||
| 517 | case ')': | ||
| 518 | if p.emptyStack() { | ||
| 519 | return nil, p.getErr(ErrUnexpectedParen) | ||
| 520 | } | ||
| 521 | |||
| 522 | if err := p.addGroup(); err != nil { | ||
| 523 | return nil, err | ||
| 524 | } | ||
| 525 | if err := p.popGroup(); err != nil { | ||
| 526 | return nil, err | ||
| 527 | } | ||
| 528 | p.popOptions() | ||
| 529 | |||
| 530 | if p.unit == nil { | ||
| 531 | goto ContinueOuterScan | ||
| 532 | } | ||
| 533 | |||
| 534 | case '\\': | ||
| 535 | n, err := p.scanBackslash(false) | ||
| 536 | if err != nil { | ||
| 537 | return nil, err | ||
| 538 | } | ||
| 539 | p.addUnitNode(n) | ||
| 540 | |||
| 541 | case '^': | ||
| 542 | if p.useOptionM() { | ||
| 543 | p.addUnitType(ntBol) | ||
| 544 | } else { | ||
| 545 | p.addUnitType(ntBeginning) | ||
| 546 | } | ||
| 547 | |||
| 548 | case '$': | ||
| 549 | if p.useOptionM() { | ||
| 550 | p.addUnitType(ntEol) | ||
| 551 | } else { | ||
| 552 | p.addUnitType(ntEndZ) | ||
| 553 | } | ||
| 554 | |||
| 555 | case '.': | ||
| 556 | if p.useOptionE() { | ||
| 557 | p.addUnitSet(ECMAAnyClass()) | ||
| 558 | } else if p.useOptionS() { | ||
| 559 | p.addUnitSet(AnyClass()) | ||
| 560 | } else { | ||
| 561 | p.addUnitNotone('\n') | ||
| 562 | } | ||
| 563 | |||
| 564 | case '{', '*', '+', '?': | ||
| 565 | if p.unit == nil { | ||
| 566 | if wasPrevQuantifier { | ||
| 567 | return nil, p.getErr(ErrInvalidRepeatOp) | ||
| 568 | } else { | ||
| 569 | return nil, p.getErr(ErrMissingRepeatArgument) | ||
| 570 | } | ||
| 571 | } | ||
| 572 | p.moveLeft() | ||
| 573 | |||
| 574 | default: | ||
| 575 | return nil, p.getErr(ErrInternalError) | ||
| 576 | } | ||
| 577 | |||
| 578 | if err := p.scanBlank(); err != nil { | ||
| 579 | return nil, err | ||
| 580 | } | ||
| 581 | |||
| 582 | if p.charsRight() > 0 { | ||
| 583 | isQuant = p.isTrueQuantifier() | ||
| 584 | } | ||
| 585 | if p.charsRight() == 0 || !isQuant { | ||
| 586 | //maintain odd C# assignment order -- not sure if required, could clean up? | ||
| 587 | p.addConcatenate() | ||
| 588 | goto ContinueOuterScan | ||
| 589 | } | ||
| 590 | |||
| 591 | ch = p.moveRightGetChar() | ||
| 592 | |||
| 593 | // Handle quantifiers | ||
| 594 | for p.unit != nil { | ||
| 595 | var min, max int | ||
| 596 | var lazy bool | ||
| 597 | |||
| 598 | switch ch { | ||
| 599 | case '*': | ||
| 600 | min = 0 | ||
| 601 | max = math.MaxInt32 | ||
| 602 | |||
| 603 | case '?': | ||
| 604 | min = 0 | ||
| 605 | max = 1 | ||
| 606 | |||
| 607 | case '+': | ||
| 608 | min = 1 | ||
| 609 | max = math.MaxInt32 | ||
| 610 | |||
| 611 | case '{': | ||
| 612 | { | ||
| 613 | var err error | ||
| 614 | startpos = p.textpos() | ||
| 615 | if min, err = p.scanDecimal(); err != nil { | ||
| 616 | return nil, err | ||
| 617 | } | ||
| 618 | max = min | ||
| 619 | if startpos < p.textpos() { | ||
| 620 | if p.charsRight() > 0 && p.rightChar(0) == ',' { | ||
| 621 | p.moveRight(1) | ||
| 622 | if p.charsRight() == 0 || p.rightChar(0) == '}' { | ||
| 623 | max = math.MaxInt32 | ||
| 624 | } else { | ||
| 625 | if max, err = p.scanDecimal(); err != nil { | ||
| 626 | return nil, err | ||
| 627 | } | ||
| 628 | } | ||
| 629 | } | ||
| 630 | } | ||
| 631 | |||
| 632 | if startpos == p.textpos() || p.charsRight() == 0 || p.moveRightGetChar() != '}' { | ||
| 633 | p.addConcatenate() | ||
| 634 | p.textto(startpos - 1) | ||
| 635 | goto ContinueOuterScan | ||
| 636 | } | ||
| 637 | } | ||
| 638 | |||
| 639 | default: | ||
| 640 | return nil, p.getErr(ErrInternalError) | ||
| 641 | } | ||
| 642 | |||
| 643 | if err := p.scanBlank(); err != nil { | ||
| 644 | return nil, err | ||
| 645 | } | ||
| 646 | |||
| 647 | if p.charsRight() == 0 || p.rightChar(0) != '?' { | ||
| 648 | lazy = false | ||
| 649 | } else { | ||
| 650 | p.moveRight(1) | ||
| 651 | lazy = true | ||
| 652 | } | ||
| 653 | |||
| 654 | if min > max { | ||
| 655 | return nil, p.getErr(ErrInvalidRepeatSize) | ||
| 656 | } | ||
| 657 | |||
| 658 | p.addConcatenate3(lazy, min, max) | ||
| 659 | } | ||
| 660 | |||
| 661 | ContinueOuterScan: | ||
| 662 | } | ||
| 663 | |||
| 664 | BreakOuterScan: | ||
| 665 | ; | ||
| 666 | |||
| 667 | if !p.emptyStack() { | ||
| 668 | return nil, p.getErr(ErrMissingParen) | ||
| 669 | } | ||
| 670 | |||
| 671 | if err := p.addGroup(); err != nil { | ||
| 672 | return nil, err | ||
| 673 | } | ||
| 674 | |||
| 675 | return p.unit, nil | ||
| 676 | |||
| 677 | } | ||
| 678 | |||
| 679 | /* | ||
| 680 | * Simple parsing for replacement patterns | ||
| 681 | */ | ||
| 682 | func (p *parser) scanReplacement() (*regexNode, error) { | ||
| 683 | var c, startpos int | ||
| 684 | |||
| 685 | p.concatenation = newRegexNode(ntConcatenate, p.options) | ||
| 686 | |||
| 687 | for { | ||
| 688 | c = p.charsRight() | ||
| 689 | if c == 0 { | ||
| 690 | break | ||
| 691 | } | ||
| 692 | |||
| 693 | startpos = p.textpos() | ||
| 694 | |||
| 695 | for c > 0 && p.rightChar(0) != '$' { | ||
| 696 | p.moveRight(1) | ||
| 697 | c-- | ||
| 698 | } | ||
| 699 | |||
| 700 | p.addToConcatenate(startpos, p.textpos()-startpos, true) | ||
| 701 | |||
| 702 | if c > 0 { | ||
| 703 | if p.moveRightGetChar() == '$' { | ||
| 704 | n, err := p.scanDollar() | ||
| 705 | if err != nil { | ||
| 706 | return nil, err | ||
| 707 | } | ||
| 708 | p.addUnitNode(n) | ||
| 709 | } | ||
| 710 | p.addConcatenate() | ||
| 711 | } | ||
| 712 | } | ||
| 713 | |||
| 714 | return p.concatenation, nil | ||
| 715 | } | ||
| 716 | |||
| 717 | /* | ||
| 718 | * Scans $ patterns recognized within replacement patterns | ||
| 719 | */ | ||
| 720 | func (p *parser) scanDollar() (*regexNode, error) { | ||
| 721 | if p.charsRight() == 0 { | ||
| 722 | return newRegexNodeCh(ntOne, p.options, '$'), nil | ||
| 723 | } | ||
| 724 | |||
| 725 | ch := p.rightChar(0) | ||
| 726 | angled := false | ||
| 727 | backpos := p.textpos() | ||
| 728 | lastEndPos := backpos | ||
| 729 | |||
| 730 | // Note angle | ||
| 731 | |||
| 732 | if ch == '{' && p.charsRight() > 1 { | ||
| 733 | angled = true | ||
| 734 | p.moveRight(1) | ||
| 735 | ch = p.rightChar(0) | ||
| 736 | } | ||
| 737 | |||
| 738 | // Try to parse backreference: \1 or \{1} or \{cap} | ||
| 739 | |||
| 740 | if ch >= '0' && ch <= '9' { | ||
| 741 | if !angled && p.useOptionE() { | ||
| 742 | capnum := -1 | ||
| 743 | newcapnum := int(ch - '0') | ||
| 744 | p.moveRight(1) | ||
| 745 | if p.isCaptureSlot(newcapnum) { | ||
| 746 | capnum = newcapnum | ||
| 747 | lastEndPos = p.textpos() | ||
| 748 | } | ||
| 749 | |||
| 750 | for p.charsRight() > 0 { | ||
| 751 | ch = p.rightChar(0) | ||
| 752 | if ch < '0' || ch > '9' { | ||
| 753 | break | ||
| 754 | } | ||
| 755 | digit := int(ch - '0') | ||
| 756 | if newcapnum > maxValueDiv10 || (newcapnum == maxValueDiv10 && digit > maxValueMod10) { | ||
| 757 | return nil, p.getErr(ErrCaptureGroupOutOfRange) | ||
| 758 | } | ||
| 759 | |||
| 760 | newcapnum = newcapnum*10 + digit | ||
| 761 | |||
| 762 | p.moveRight(1) | ||
| 763 | if p.isCaptureSlot(newcapnum) { | ||
| 764 | capnum = newcapnum | ||
| 765 | lastEndPos = p.textpos() | ||
| 766 | } | ||
| 767 | } | ||
| 768 | p.textto(lastEndPos) | ||
| 769 | if capnum >= 0 { | ||
| 770 | return newRegexNodeM(ntRef, p.options, capnum), nil | ||
| 771 | } | ||
| 772 | } else { | ||
| 773 | capnum, err := p.scanDecimal() | ||
| 774 | if err != nil { | ||
| 775 | return nil, err | ||
| 776 | } | ||
| 777 | if !angled || p.charsRight() > 0 && p.moveRightGetChar() == '}' { | ||
| 778 | if p.isCaptureSlot(capnum) { | ||
| 779 | return newRegexNodeM(ntRef, p.options, capnum), nil | ||
| 780 | } | ||
| 781 | } | ||
| 782 | } | ||
| 783 | } else if angled && IsWordChar(ch) { | ||
| 784 | capname := p.scanCapname() | ||
| 785 | |||
| 786 | if p.charsRight() > 0 && p.moveRightGetChar() == '}' { | ||
| 787 | if p.isCaptureName(capname) { | ||
| 788 | return newRegexNodeM(ntRef, p.options, p.captureSlotFromName(capname)), nil | ||
| 789 | } | ||
| 790 | } | ||
| 791 | } else if !angled { | ||
| 792 | capnum := 1 | ||
| 793 | |||
| 794 | switch ch { | ||
| 795 | case '$': | ||
| 796 | p.moveRight(1) | ||
| 797 | return newRegexNodeCh(ntOne, p.options, '$'), nil | ||
| 798 | case '&': | ||
| 799 | capnum = 0 | ||
| 800 | case '`': | ||
| 801 | capnum = replaceLeftPortion | ||
| 802 | case '\'': | ||
| 803 | capnum = replaceRightPortion | ||
| 804 | case '+': | ||
| 805 | capnum = replaceLastGroup | ||
| 806 | case '_': | ||
| 807 | capnum = replaceWholeString | ||
| 808 | } | ||
| 809 | |||
| 810 | if capnum != 1 { | ||
| 811 | p.moveRight(1) | ||
| 812 | return newRegexNodeM(ntRef, p.options, capnum), nil | ||
| 813 | } | ||
| 814 | } | ||
| 815 | |||
| 816 | // unrecognized $: literalize | ||
| 817 | |||
| 818 | p.textto(backpos) | ||
| 819 | return newRegexNodeCh(ntOne, p.options, '$'), nil | ||
| 820 | } | ||
| 821 | |||
| 822 | // scanGroupOpen scans chars following a '(' (not counting the '('), and returns | ||
| 823 | // a RegexNode for the type of group scanned, or nil if the group | ||
| 824 | // simply changed options (?cimsx-cimsx) or was a comment (#...). | ||
| 825 | func (p *parser) scanGroupOpen() (*regexNode, error) { | ||
| 826 | var ch rune | ||
| 827 | var nt nodeType | ||
| 828 | var err error | ||
| 829 | close := '>' | ||
| 830 | start := p.textpos() | ||
| 831 | |||
| 832 | // just return a RegexNode if we have: | ||
| 833 | // 1. "(" followed by nothing | ||
| 834 | // 2. "(x" where x != ? | ||
| 835 | // 3. "(?)" | ||
| 836 | if p.charsRight() == 0 || p.rightChar(0) != '?' || (p.rightChar(0) == '?' && (p.charsRight() > 1 && p.rightChar(1) == ')')) { | ||
| 837 | if p.useOptionN() || p.ignoreNextParen { | ||
| 838 | p.ignoreNextParen = false | ||
| 839 | return newRegexNode(ntGroup, p.options), nil | ||
| 840 | } | ||
| 841 | return newRegexNodeMN(ntCapture, p.options, p.consumeAutocap(), -1), nil | ||
| 842 | } | ||
| 843 | |||
| 844 | p.moveRight(1) | ||
| 845 | |||
| 846 | for { | ||
| 847 | if p.charsRight() == 0 { | ||
| 848 | break | ||
| 849 | } | ||
| 850 | |||
| 851 | switch ch = p.moveRightGetChar(); ch { | ||
| 852 | case ':': | ||
| 853 | nt = ntGroup | ||
| 854 | |||
| 855 | case '=': | ||
| 856 | p.options &= ^RightToLeft | ||
| 857 | nt = ntRequire | ||
| 858 | |||
| 859 | case '!': | ||
| 860 | p.options &= ^RightToLeft | ||
| 861 | nt = ntPrevent | ||
| 862 | |||
| 863 | case '>': | ||
| 864 | nt = ntGreedy | ||
| 865 | |||
| 866 | case '\'': | ||
| 867 | close = '\'' | ||
| 868 | fallthrough | ||
| 869 | |||
| 870 | case '<': | ||
| 871 | if p.charsRight() == 0 { | ||
| 872 | goto BreakRecognize | ||
| 873 | } | ||
| 874 | |||
| 875 | switch ch = p.moveRightGetChar(); ch { | ||
| 876 | case '=': | ||
| 877 | if close == '\'' { | ||
| 878 | goto BreakRecognize | ||
| 879 | } | ||
| 880 | |||
| 881 | p.options |= RightToLeft | ||
| 882 | nt = ntRequire | ||
| 883 | |||
| 884 | case '!': | ||
| 885 | if close == '\'' { | ||
| 886 | goto BreakRecognize | ||
| 887 | } | ||
| 888 | |||
| 889 | p.options |= RightToLeft | ||
| 890 | nt = ntPrevent | ||
| 891 | |||
| 892 | default: | ||
| 893 | p.moveLeft() | ||
| 894 | capnum := -1 | ||
| 895 | uncapnum := -1 | ||
| 896 | proceed := false | ||
| 897 | |||
| 898 | // grab part before - | ||
| 899 | |||
| 900 | if ch >= '0' && ch <= '9' { | ||
| 901 | if capnum, err = p.scanDecimal(); err != nil { | ||
| 902 | return nil, err | ||
| 903 | } | ||
| 904 | |||
| 905 | if !p.isCaptureSlot(capnum) { | ||
| 906 | capnum = -1 | ||
| 907 | } | ||
| 908 | |||
| 909 | // check if we have bogus characters after the number | ||
| 910 | if p.charsRight() > 0 && !(p.rightChar(0) == close || p.rightChar(0) == '-') { | ||
| 911 | return nil, p.getErr(ErrInvalidGroupName) | ||
| 912 | } | ||
| 913 | if capnum == 0 { | ||
| 914 | return nil, p.getErr(ErrCapNumNotZero) | ||
| 915 | } | ||
| 916 | } else if IsWordChar(ch) { | ||
| 917 | capname := p.scanCapname() | ||
| 918 | |||
| 919 | if p.isCaptureName(capname) { | ||
| 920 | capnum = p.captureSlotFromName(capname) | ||
| 921 | } | ||
| 922 | |||
| 923 | // check if we have bogus character after the name | ||
| 924 | if p.charsRight() > 0 && !(p.rightChar(0) == close || p.rightChar(0) == '-') { | ||
| 925 | return nil, p.getErr(ErrInvalidGroupName) | ||
| 926 | } | ||
| 927 | } else if ch == '-' { | ||
| 928 | proceed = true | ||
| 929 | } else { | ||
| 930 | // bad group name - starts with something other than a word character and isn't a number | ||
| 931 | return nil, p.getErr(ErrInvalidGroupName) | ||
| 932 | } | ||
| 933 | |||
| 934 | // grab part after - if any | ||
| 935 | |||
| 936 | if (capnum != -1 || proceed == true) && p.charsRight() > 0 && p.rightChar(0) == '-' { | ||
| 937 | p.moveRight(1) | ||
| 938 | |||
| 939 | //no more chars left, no closing char, etc | ||
| 940 | if p.charsRight() == 0 { | ||
| 941 | return nil, p.getErr(ErrInvalidGroupName) | ||
| 942 | } | ||
| 943 | |||
| 944 | ch = p.rightChar(0) | ||
| 945 | if ch >= '0' && ch <= '9' { | ||
| 946 | if uncapnum, err = p.scanDecimal(); err != nil { | ||
| 947 | return nil, err | ||
| 948 | } | ||
| 949 | |||
| 950 | if !p.isCaptureSlot(uncapnum) { | ||
| 951 | return nil, p.getErr(ErrUndefinedBackRef, uncapnum) | ||
| 952 | } | ||
| 953 | |||
| 954 | // check if we have bogus characters after the number | ||
| 955 | if p.charsRight() > 0 && p.rightChar(0) != close { | ||
| 956 | return nil, p.getErr(ErrInvalidGroupName) | ||
| 957 | } | ||
| 958 | } else if IsWordChar(ch) { | ||
| 959 | uncapname := p.scanCapname() | ||
| 960 | |||
| 961 | if !p.isCaptureName(uncapname) { | ||
| 962 | return nil, p.getErr(ErrUndefinedNameRef, uncapname) | ||
| 963 | } | ||
| 964 | uncapnum = p.captureSlotFromName(uncapname) | ||
| 965 | |||
| 966 | // check if we have bogus character after the name | ||
| 967 | if p.charsRight() > 0 && p.rightChar(0) != close { | ||
| 968 | return nil, p.getErr(ErrInvalidGroupName) | ||
| 969 | } | ||
| 970 | } else { | ||
| 971 | // bad group name - starts with something other than a word character and isn't a number | ||
| 972 | return nil, p.getErr(ErrInvalidGroupName) | ||
| 973 | } | ||
| 974 | } | ||
| 975 | |||
| 976 | // actually make the node | ||
| 977 | |||
| 978 | if (capnum != -1 || uncapnum != -1) && p.charsRight() > 0 && p.moveRightGetChar() == close { | ||
| 979 | return newRegexNodeMN(ntCapture, p.options, capnum, uncapnum), nil | ||
| 980 | } | ||
| 981 | goto BreakRecognize | ||
| 982 | } | ||
| 983 | |||
| 984 | case '(': | ||
| 985 | // alternation construct (?(...) | ) | ||
| 986 | |||
| 987 | parenPos := p.textpos() | ||
| 988 | if p.charsRight() > 0 { | ||
| 989 | ch = p.rightChar(0) | ||
| 990 | |||
| 991 | // check if the alternation condition is a backref | ||
| 992 | if ch >= '0' && ch <= '9' { | ||
| 993 | var capnum int | ||
| 994 | if capnum, err = p.scanDecimal(); err != nil { | ||
| 995 | return nil, err | ||
| 996 | } | ||
| 997 | if p.charsRight() > 0 && p.moveRightGetChar() == ')' { | ||
| 998 | if p.isCaptureSlot(capnum) { | ||
| 999 | return newRegexNodeM(ntTestref, p.options, capnum), nil | ||
| 1000 | } | ||
| 1001 | return nil, p.getErr(ErrUndefinedReference, capnum) | ||
| 1002 | } | ||
| 1003 | |||
| 1004 | return nil, p.getErr(ErrMalformedReference, capnum) | ||
| 1005 | |||
| 1006 | } else if IsWordChar(ch) { | ||
| 1007 | capname := p.scanCapname() | ||
| 1008 | |||
| 1009 | if p.isCaptureName(capname) && p.charsRight() > 0 && p.moveRightGetChar() == ')' { | ||
| 1010 | return newRegexNodeM(ntTestref, p.options, p.captureSlotFromName(capname)), nil | ||
| 1011 | } | ||
| 1012 | } | ||
| 1013 | } | ||
| 1014 | // not a backref | ||
| 1015 | nt = ntTestgroup | ||
| 1016 | p.textto(parenPos - 1) // jump to the start of the parentheses | ||
| 1017 | p.ignoreNextParen = true // but make sure we don't try to capture the insides | ||
| 1018 | |||
| 1019 | charsRight := p.charsRight() | ||
| 1020 | if charsRight >= 3 && p.rightChar(1) == '?' { | ||
| 1021 | rightchar2 := p.rightChar(2) | ||
| 1022 | // disallow comments in the condition | ||
| 1023 | if rightchar2 == '#' { | ||
| 1024 | return nil, p.getErr(ErrAlternationCantHaveComment) | ||
| 1025 | } | ||
| 1026 | |||
| 1027 | // disallow named capture group (?<..>..) in the condition | ||
| 1028 | if rightchar2 == '\'' { | ||
| 1029 | return nil, p.getErr(ErrAlternationCantCapture) | ||
| 1030 | } | ||
| 1031 | |||
| 1032 | if charsRight >= 4 && (rightchar2 == '<' && p.rightChar(3) != '!' && p.rightChar(3) != '=') { | ||
| 1033 | return nil, p.getErr(ErrAlternationCantCapture) | ||
| 1034 | } | ||
| 1035 | } | ||
| 1036 | |||
| 1037 | case 'P': | ||
| 1038 | if p.useRE2() { | ||
| 1039 | // support for P<name> syntax | ||
| 1040 | if p.charsRight() < 3 { | ||
| 1041 | goto BreakRecognize | ||
| 1042 | } | ||
| 1043 | |||
| 1044 | ch = p.moveRightGetChar() | ||
| 1045 | if ch != '<' { | ||
| 1046 | goto BreakRecognize | ||
| 1047 | } | ||
| 1048 | |||
| 1049 | ch = p.moveRightGetChar() | ||
| 1050 | p.moveLeft() | ||
| 1051 | |||
| 1052 | if IsWordChar(ch) { | ||
| 1053 | capnum := -1 | ||
| 1054 | capname := p.scanCapname() | ||
| 1055 | |||
| 1056 | if p.isCaptureName(capname) { | ||
| 1057 | capnum = p.captureSlotFromName(capname) | ||
| 1058 | } | ||
| 1059 | |||
| 1060 | // check if we have bogus character after the name | ||
| 1061 | if p.charsRight() > 0 && p.rightChar(0) != '>' { | ||
| 1062 | return nil, p.getErr(ErrInvalidGroupName) | ||
| 1063 | } | ||
| 1064 | |||
| 1065 | // actually make the node | ||
| 1066 | |||
| 1067 | if capnum != -1 && p.charsRight() > 0 && p.moveRightGetChar() == '>' { | ||
| 1068 | return newRegexNodeMN(ntCapture, p.options, capnum, -1), nil | ||
| 1069 | } | ||
| 1070 | goto BreakRecognize | ||
| 1071 | |||
| 1072 | } else { | ||
| 1073 | // bad group name - starts with something other than a word character and isn't a number | ||
| 1074 | return nil, p.getErr(ErrInvalidGroupName) | ||
| 1075 | } | ||
| 1076 | } | ||
| 1077 | // if we're not using RE2 compat mode then | ||
| 1078 | // we just behave like normal | ||
| 1079 | fallthrough | ||
| 1080 | |||
| 1081 | default: | ||
| 1082 | p.moveLeft() | ||
| 1083 | |||
| 1084 | nt = ntGroup | ||
| 1085 | // disallow options in the children of a testgroup node | ||
| 1086 | if p.group.t != ntTestgroup { | ||
| 1087 | p.scanOptions() | ||
| 1088 | } | ||
| 1089 | if p.charsRight() == 0 { | ||
| 1090 | goto BreakRecognize | ||
| 1091 | } | ||
| 1092 | |||
| 1093 | if ch = p.moveRightGetChar(); ch == ')' { | ||
| 1094 | return nil, nil | ||
| 1095 | } | ||
| 1096 | |||
| 1097 | if ch != ':' { | ||
| 1098 | goto BreakRecognize | ||
| 1099 | } | ||
| 1100 | |||
| 1101 | } | ||
| 1102 | |||
| 1103 | return newRegexNode(nt, p.options), nil | ||
| 1104 | } | ||
| 1105 | |||
| 1106 | BreakRecognize: | ||
| 1107 | |||
| 1108 | // break Recognize comes here | ||
| 1109 | |||
| 1110 | return nil, p.getErr(ErrUnrecognizedGrouping, string(p.pattern[start:p.textpos()])) | ||
| 1111 | } | ||
| 1112 | |||
| 1113 | // scans backslash specials and basics | ||
| 1114 | func (p *parser) scanBackslash(scanOnly bool) (*regexNode, error) { | ||
| 1115 | |||
| 1116 | if p.charsRight() == 0 { | ||
| 1117 | return nil, p.getErr(ErrIllegalEndEscape) | ||
| 1118 | } | ||
| 1119 | |||
| 1120 | switch ch := p.rightChar(0); ch { | ||
| 1121 | case 'b', 'B', 'A', 'G', 'Z', 'z': | ||
| 1122 | p.moveRight(1) | ||
| 1123 | return newRegexNode(p.typeFromCode(ch), p.options), nil | ||
| 1124 | |||
| 1125 | case 'w': | ||
| 1126 | p.moveRight(1) | ||
| 1127 | if p.useOptionE() || p.useRE2() { | ||
| 1128 | return newRegexNodeSet(ntSet, p.options, ECMAWordClass()), nil | ||
| 1129 | } | ||
| 1130 | return newRegexNodeSet(ntSet, p.options, WordClass()), nil | ||
| 1131 | |||
| 1132 | case 'W': | ||
| 1133 | p.moveRight(1) | ||
| 1134 | if p.useOptionE() || p.useRE2() { | ||
| 1135 | return newRegexNodeSet(ntSet, p.options, NotECMAWordClass()), nil | ||
| 1136 | } | ||
| 1137 | return newRegexNodeSet(ntSet, p.options, NotWordClass()), nil | ||
| 1138 | |||
| 1139 | case 's': | ||
| 1140 | p.moveRight(1) | ||
| 1141 | if p.useOptionE() { | ||
| 1142 | return newRegexNodeSet(ntSet, p.options, ECMASpaceClass()), nil | ||
| 1143 | } else if p.useRE2() { | ||
| 1144 | return newRegexNodeSet(ntSet, p.options, RE2SpaceClass()), nil | ||
| 1145 | } | ||
| 1146 | return newRegexNodeSet(ntSet, p.options, SpaceClass()), nil | ||
| 1147 | |||
| 1148 | case 'S': | ||
| 1149 | p.moveRight(1) | ||
| 1150 | if p.useOptionE() { | ||
| 1151 | return newRegexNodeSet(ntSet, p.options, NotECMASpaceClass()), nil | ||
| 1152 | } else if p.useRE2() { | ||
| 1153 | return newRegexNodeSet(ntSet, p.options, NotRE2SpaceClass()), nil | ||
| 1154 | } | ||
| 1155 | return newRegexNodeSet(ntSet, p.options, NotSpaceClass()), nil | ||
| 1156 | |||
| 1157 | case 'd': | ||
| 1158 | p.moveRight(1) | ||
| 1159 | if p.useOptionE() || p.useRE2() { | ||
| 1160 | return newRegexNodeSet(ntSet, p.options, ECMADigitClass()), nil | ||
| 1161 | } | ||
| 1162 | return newRegexNodeSet(ntSet, p.options, DigitClass()), nil | ||
| 1163 | |||
| 1164 | case 'D': | ||
| 1165 | p.moveRight(1) | ||
| 1166 | if p.useOptionE() || p.useRE2() { | ||
| 1167 | return newRegexNodeSet(ntSet, p.options, NotECMADigitClass()), nil | ||
| 1168 | } | ||
| 1169 | return newRegexNodeSet(ntSet, p.options, NotDigitClass()), nil | ||
| 1170 | |||
| 1171 | case 'p', 'P': | ||
| 1172 | p.moveRight(1) | ||
| 1173 | prop, err := p.parseProperty() | ||
| 1174 | if err != nil { | ||
| 1175 | return nil, err | ||
| 1176 | } | ||
| 1177 | cc := &CharSet{} | ||
| 1178 | cc.addCategory(prop, (ch != 'p'), p.useOptionI(), p.patternRaw) | ||
| 1179 | if p.useOptionI() { | ||
| 1180 | cc.addLowercase() | ||
| 1181 | } | ||
| 1182 | |||
| 1183 | return newRegexNodeSet(ntSet, p.options, cc), nil | ||
| 1184 | |||
| 1185 | default: | ||
| 1186 | return p.scanBasicBackslash(scanOnly) | ||
| 1187 | } | ||
| 1188 | } | ||
| 1189 | |||
| 1190 | // Scans \-style backreferences and character escapes | ||
| 1191 | func (p *parser) scanBasicBackslash(scanOnly bool) (*regexNode, error) { | ||
| 1192 | if p.charsRight() == 0 { | ||
| 1193 | return nil, p.getErr(ErrIllegalEndEscape) | ||
| 1194 | } | ||
| 1195 | angled := false | ||
| 1196 | k := false | ||
| 1197 | close := '\x00' | ||
| 1198 | |||
| 1199 | backpos := p.textpos() | ||
| 1200 | ch := p.rightChar(0) | ||
| 1201 | |||
| 1202 | // Allow \k<foo> instead of \<foo>, which is now deprecated. | ||
| 1203 | |||
| 1204 | // According to ECMAScript specification, \k<name> is only parsed as a named group reference if | ||
| 1205 | // there is at least one group name in the regexp. | ||
| 1206 | // See https://www.ecma-international.org/ecma-262/#sec-isvalidregularexpressionliteral, step 7. | ||
| 1207 | // Note, during the first (scanOnly) run we may not have all group names scanned, but that's ok. | ||
| 1208 | if ch == 'k' && (!p.useOptionE() || len(p.capnames) > 0) { | ||
| 1209 | if p.charsRight() >= 2 { | ||
| 1210 | p.moveRight(1) | ||
| 1211 | ch = p.moveRightGetChar() | ||
| 1212 | |||
| 1213 | if ch == '<' || (!p.useOptionE() && ch == '\'') { // No support for \k'name' in ECMAScript | ||
| 1214 | angled = true | ||
| 1215 | if ch == '\'' { | ||
| 1216 | close = '\'' | ||
| 1217 | } else { | ||
| 1218 | close = '>' | ||
| 1219 | } | ||
| 1220 | } | ||
| 1221 | } | ||
| 1222 | |||
| 1223 | if !angled || p.charsRight() <= 0 { | ||
| 1224 | return nil, p.getErr(ErrMalformedNameRef) | ||
| 1225 | } | ||
| 1226 | |||
| 1227 | ch = p.rightChar(0) | ||
| 1228 | k = true | ||
| 1229 | |||
| 1230 | } else if !p.useOptionE() && (ch == '<' || ch == '\'') && p.charsRight() > 1 { // Note angle without \g | ||
| 1231 | angled = true | ||
| 1232 | if ch == '\'' { | ||
| 1233 | close = '\'' | ||
| 1234 | } else { | ||
| 1235 | close = '>' | ||
| 1236 | } | ||
| 1237 | |||
| 1238 | p.moveRight(1) | ||
| 1239 | ch = p.rightChar(0) | ||
| 1240 | } | ||
| 1241 | |||
| 1242 | // Try to parse backreference: \<1> or \<cap> | ||
| 1243 | |||
| 1244 | if angled && ch >= '0' && ch <= '9' { | ||
| 1245 | capnum, err := p.scanDecimal() | ||
| 1246 | if err != nil { | ||
| 1247 | return nil, err | ||
| 1248 | } | ||
| 1249 | |||
| 1250 | if p.charsRight() > 0 && p.moveRightGetChar() == close { | ||
| 1251 | if p.isCaptureSlot(capnum) { | ||
| 1252 | return newRegexNodeM(ntRef, p.options, capnum), nil | ||
| 1253 | } | ||
| 1254 | return nil, p.getErr(ErrUndefinedBackRef, capnum) | ||
| 1255 | } | ||
| 1256 | } else if !angled && ch >= '1' && ch <= '9' { // Try to parse backreference or octal: \1 | ||
| 1257 | capnum, err := p.scanDecimal() | ||
| 1258 | if err != nil { | ||
| 1259 | return nil, err | ||
| 1260 | } | ||
| 1261 | |||
| 1262 | if scanOnly { | ||
| 1263 | return nil, nil | ||
| 1264 | } | ||
| 1265 | |||
| 1266 | if p.isCaptureSlot(capnum) { | ||
| 1267 | return newRegexNodeM(ntRef, p.options, capnum), nil | ||
| 1268 | } | ||
| 1269 | if capnum <= 9 && !p.useOptionE() { | ||
| 1270 | return nil, p.getErr(ErrUndefinedBackRef, capnum) | ||
| 1271 | } | ||
| 1272 | |||
| 1273 | } else if angled { | ||
| 1274 | capname := p.scanCapname() | ||
| 1275 | |||
| 1276 | if capname != "" && p.charsRight() > 0 && p.moveRightGetChar() == close { | ||
| 1277 | |||
| 1278 | if scanOnly { | ||
| 1279 | return nil, nil | ||
| 1280 | } | ||
| 1281 | |||
| 1282 | if p.isCaptureName(capname) { | ||
| 1283 | return newRegexNodeM(ntRef, p.options, p.captureSlotFromName(capname)), nil | ||
| 1284 | } | ||
| 1285 | return nil, p.getErr(ErrUndefinedNameRef, capname) | ||
| 1286 | } else { | ||
| 1287 | if k { | ||
| 1288 | return nil, p.getErr(ErrMalformedNameRef) | ||
| 1289 | } | ||
| 1290 | } | ||
| 1291 | } | ||
| 1292 | |||
| 1293 | // Not backreference: must be char code | ||
| 1294 | |||
| 1295 | p.textto(backpos) | ||
| 1296 | ch, err := p.scanCharEscape() | ||
| 1297 | if err != nil { | ||
| 1298 | return nil, err | ||
| 1299 | } | ||
| 1300 | |||
| 1301 | if scanOnly { | ||
| 1302 | return nil, nil | ||
| 1303 | } | ||
| 1304 | |||
| 1305 | if p.useOptionI() { | ||
| 1306 | ch = unicode.ToLower(ch) | ||
| 1307 | } | ||
| 1308 | |||
| 1309 | return newRegexNodeCh(ntOne, p.options, ch), nil | ||
| 1310 | } | ||
| 1311 | |||
| 1312 | // Scans X for \p{X} or \P{X} | ||
| 1313 | func (p *parser) parseProperty() (string, error) { | ||
| 1314 | if p.charsRight() < 3 { | ||
| 1315 | return "", p.getErr(ErrIncompleteSlashP) | ||
| 1316 | } | ||
| 1317 | ch := p.moveRightGetChar() | ||
| 1318 | if ch != '{' { | ||
| 1319 | return "", p.getErr(ErrMalformedSlashP) | ||
| 1320 | } | ||
| 1321 | |||
| 1322 | startpos := p.textpos() | ||
| 1323 | for p.charsRight() > 0 { | ||
| 1324 | ch = p.moveRightGetChar() | ||
| 1325 | if !(IsWordChar(ch) || ch == '-') { | ||
| 1326 | p.moveLeft() | ||
| 1327 | break | ||
| 1328 | } | ||
| 1329 | } | ||
| 1330 | capname := string(p.pattern[startpos:p.textpos()]) | ||
| 1331 | |||
| 1332 | if p.charsRight() == 0 || p.moveRightGetChar() != '}' { | ||
| 1333 | return "", p.getErr(ErrIncompleteSlashP) | ||
| 1334 | } | ||
| 1335 | |||
| 1336 | if !isValidUnicodeCat(capname) { | ||
| 1337 | return "", p.getErr(ErrUnknownSlashP, capname) | ||
| 1338 | } | ||
| 1339 | |||
| 1340 | return capname, nil | ||
| 1341 | } | ||
| 1342 | |||
| 1343 | // Returns ReNode type for zero-length assertions with a \ code. | ||
| 1344 | func (p *parser) typeFromCode(ch rune) nodeType { | ||
| 1345 | switch ch { | ||
| 1346 | case 'b': | ||
| 1347 | if p.useOptionE() { | ||
| 1348 | return ntECMABoundary | ||
| 1349 | } | ||
| 1350 | return ntBoundary | ||
| 1351 | case 'B': | ||
| 1352 | if p.useOptionE() { | ||
| 1353 | return ntNonECMABoundary | ||
| 1354 | } | ||
| 1355 | return ntNonboundary | ||
| 1356 | case 'A': | ||
| 1357 | return ntBeginning | ||
| 1358 | case 'G': | ||
| 1359 | return ntStart | ||
| 1360 | case 'Z': | ||
| 1361 | return ntEndZ | ||
| 1362 | case 'z': | ||
| 1363 | return ntEnd | ||
| 1364 | default: | ||
| 1365 | return ntNothing | ||
| 1366 | } | ||
| 1367 | } | ||
| 1368 | |||
| 1369 | // Scans whitespace or x-mode comments. | ||
| 1370 | func (p *parser) scanBlank() error { | ||
| 1371 | if p.useOptionX() { | ||
| 1372 | for { | ||
| 1373 | for p.charsRight() > 0 && isSpace(p.rightChar(0)) { | ||
| 1374 | p.moveRight(1) | ||
| 1375 | } | ||
| 1376 | |||
| 1377 | if p.charsRight() == 0 { | ||
| 1378 | break | ||
| 1379 | } | ||
| 1380 | |||
| 1381 | if p.rightChar(0) == '#' { | ||
| 1382 | for p.charsRight() > 0 && p.rightChar(0) != '\n' { | ||
| 1383 | p.moveRight(1) | ||
| 1384 | } | ||
| 1385 | } else if p.charsRight() >= 3 && p.rightChar(2) == '#' && | ||
| 1386 | p.rightChar(1) == '?' && p.rightChar(0) == '(' { | ||
| 1387 | for p.charsRight() > 0 && p.rightChar(0) != ')' { | ||
| 1388 | p.moveRight(1) | ||
| 1389 | } | ||
| 1390 | if p.charsRight() == 0 { | ||
| 1391 | return p.getErr(ErrUnterminatedComment) | ||
| 1392 | } | ||
| 1393 | p.moveRight(1) | ||
| 1394 | } else { | ||
| 1395 | break | ||
| 1396 | } | ||
| 1397 | } | ||
| 1398 | } else { | ||
| 1399 | for { | ||
| 1400 | if p.charsRight() < 3 || p.rightChar(2) != '#' || | ||
| 1401 | p.rightChar(1) != '?' || p.rightChar(0) != '(' { | ||
| 1402 | return nil | ||
| 1403 | } | ||
| 1404 | |||
| 1405 | for p.charsRight() > 0 && p.rightChar(0) != ')' { | ||
| 1406 | p.moveRight(1) | ||
| 1407 | } | ||
| 1408 | if p.charsRight() == 0 { | ||
| 1409 | return p.getErr(ErrUnterminatedComment) | ||
| 1410 | } | ||
| 1411 | p.moveRight(1) | ||
| 1412 | } | ||
| 1413 | } | ||
| 1414 | return nil | ||
| 1415 | } | ||
| 1416 | |||
| 1417 | func (p *parser) scanCapname() string { | ||
| 1418 | startpos := p.textpos() | ||
| 1419 | |||
| 1420 | for p.charsRight() > 0 { | ||
| 1421 | if !IsWordChar(p.moveRightGetChar()) { | ||
| 1422 | p.moveLeft() | ||
| 1423 | break | ||
| 1424 | } | ||
| 1425 | } | ||
| 1426 | |||
| 1427 | return string(p.pattern[startpos:p.textpos()]) | ||
| 1428 | } | ||
| 1429 | |||
| 1430 | //Scans contents of [] (not including []'s), and converts to a set. | ||
| 1431 | func (p *parser) scanCharSet(caseInsensitive, scanOnly bool) (*CharSet, error) { | ||
| 1432 | ch := '\x00' | ||
| 1433 | chPrev := '\x00' | ||
| 1434 | inRange := false | ||
| 1435 | firstChar := true | ||
| 1436 | closed := false | ||
| 1437 | |||
| 1438 | var cc *CharSet | ||
| 1439 | if !scanOnly { | ||
| 1440 | cc = &CharSet{} | ||
| 1441 | } | ||
| 1442 | |||
| 1443 | if p.charsRight() > 0 && p.rightChar(0) == '^' { | ||
| 1444 | p.moveRight(1) | ||
| 1445 | if !scanOnly { | ||
| 1446 | cc.negate = true | ||
| 1447 | } | ||
| 1448 | } | ||
| 1449 | |||
| 1450 | for ; p.charsRight() > 0; firstChar = false { | ||
| 1451 | fTranslatedChar := false | ||
| 1452 | ch = p.moveRightGetChar() | ||
| 1453 | if ch == ']' { | ||
| 1454 | if !firstChar { | ||
| 1455 | closed = true | ||
| 1456 | break | ||
| 1457 | } else if p.useOptionE() { | ||
| 1458 | if !scanOnly { | ||
| 1459 | cc.addRanges(NoneClass().ranges) | ||
| 1460 | } | ||
| 1461 | closed = true | ||
| 1462 | break | ||
| 1463 | } | ||
| 1464 | |||
| 1465 | } else if ch == '\\' && p.charsRight() > 0 { | ||
| 1466 | switch ch = p.moveRightGetChar(); ch { | ||
| 1467 | case 'D', 'd': | ||
| 1468 | if !scanOnly { | ||
| 1469 | if inRange { | ||
| 1470 | return nil, p.getErr(ErrBadClassInCharRange, ch) | ||
| 1471 | } | ||
| 1472 | cc.addDigit(p.useOptionE() || p.useRE2(), ch == 'D', p.patternRaw) | ||
| 1473 | } | ||
| 1474 | continue | ||
| 1475 | |||
| 1476 | case 'S', 's': | ||
| 1477 | if !scanOnly { | ||
| 1478 | if inRange { | ||
| 1479 | return nil, p.getErr(ErrBadClassInCharRange, ch) | ||
| 1480 | } | ||
| 1481 | cc.addSpace(p.useOptionE(), p.useRE2(), ch == 'S') | ||
| 1482 | } | ||
| 1483 | continue | ||
| 1484 | |||
| 1485 | case 'W', 'w': | ||
| 1486 | if !scanOnly { | ||
| 1487 | if inRange { | ||
| 1488 | return nil, p.getErr(ErrBadClassInCharRange, ch) | ||
| 1489 | } | ||
| 1490 | |||
| 1491 | cc.addWord(p.useOptionE() || p.useRE2(), ch == 'W') | ||
| 1492 | } | ||
| 1493 | continue | ||
| 1494 | |||
| 1495 | case 'p', 'P': | ||
| 1496 | if !scanOnly { | ||
| 1497 | if inRange { | ||
| 1498 | return nil, p.getErr(ErrBadClassInCharRange, ch) | ||
| 1499 | } | ||
| 1500 | prop, err := p.parseProperty() | ||
| 1501 | if err != nil { | ||
| 1502 | return nil, err | ||
| 1503 | } | ||
| 1504 | cc.addCategory(prop, (ch != 'p'), caseInsensitive, p.patternRaw) | ||
| 1505 | } else { | ||
| 1506 | p.parseProperty() | ||
| 1507 | } | ||
| 1508 | |||
| 1509 | continue | ||
| 1510 | |||
| 1511 | case '-': | ||
| 1512 | if !scanOnly { | ||
| 1513 | cc.addRange(ch, ch) | ||
| 1514 | } | ||
| 1515 | continue | ||
| 1516 | |||
| 1517 | default: | ||
| 1518 | p.moveLeft() | ||
| 1519 | var err error | ||
| 1520 | ch, err = p.scanCharEscape() // non-literal character | ||
| 1521 | if err != nil { | ||
| 1522 | return nil, err | ||
| 1523 | } | ||
| 1524 | fTranslatedChar = true | ||
| 1525 | break // this break will only break out of the switch | ||
| 1526 | } | ||
| 1527 | } else if ch == '[' { | ||
| 1528 | // This is code for Posix style properties - [:Ll:] or [:IsTibetan:]. | ||
| 1529 | // It currently doesn't do anything other than skip the whole thing! | ||
| 1530 | if p.charsRight() > 0 && p.rightChar(0) == ':' && !inRange { | ||
| 1531 | savePos := p.textpos() | ||
| 1532 | |||
| 1533 | p.moveRight(1) | ||
| 1534 | negate := false | ||
| 1535 | if p.charsRight() > 1 && p.rightChar(0) == '^' { | ||
| 1536 | negate = true | ||
| 1537 | p.moveRight(1) | ||
| 1538 | } | ||
| 1539 | |||
| 1540 | nm := p.scanCapname() // snag the name | ||
| 1541 | if !scanOnly && p.useRE2() { | ||
| 1542 | // look up the name since these are valid for RE2 | ||
| 1543 | // add the group based on the name | ||
| 1544 | if ok := cc.addNamedASCII(nm, negate); !ok { | ||
| 1545 | return nil, p.getErr(ErrInvalidCharRange) | ||
| 1546 | } | ||
| 1547 | } | ||
| 1548 | if p.charsRight() < 2 || p.moveRightGetChar() != ':' || p.moveRightGetChar() != ']' { | ||
| 1549 | p.textto(savePos) | ||
| 1550 | } else if p.useRE2() { | ||
| 1551 | // move on | ||
| 1552 | continue | ||
| 1553 | } | ||
| 1554 | } | ||
| 1555 | } | ||
| 1556 | |||
| 1557 | if inRange { | ||
| 1558 | inRange = false | ||
| 1559 | if !scanOnly { | ||
| 1560 | if ch == '[' && !fTranslatedChar && !firstChar { | ||
| 1561 | // We thought we were in a range, but we're actually starting a subtraction. | ||
| 1562 | // In that case, we'll add chPrev to our char class, skip the opening [, and | ||
| 1563 | // scan the new character class recursively. | ||
| 1564 | cc.addChar(chPrev) | ||
| 1565 | sub, err := p.scanCharSet(caseInsensitive, false) | ||
| 1566 | if err != nil { | ||
| 1567 | return nil, err | ||
| 1568 | } | ||
| 1569 | cc.addSubtraction(sub) | ||
| 1570 | |||
| 1571 | if p.charsRight() > 0 && p.rightChar(0) != ']' { | ||
| 1572 | return nil, p.getErr(ErrSubtractionMustBeLast) | ||
| 1573 | } | ||
| 1574 | } else { | ||
| 1575 | // a regular range, like a-z | ||
| 1576 | if chPrev > ch { | ||
| 1577 | return nil, p.getErr(ErrReversedCharRange, chPrev, ch) | ||
| 1578 | } | ||
| 1579 | cc.addRange(chPrev, ch) | ||
| 1580 | } | ||
| 1581 | } | ||
| 1582 | } else if p.charsRight() >= 2 && p.rightChar(0) == '-' && p.rightChar(1) != ']' { | ||
| 1583 | // this could be the start of a range | ||
| 1584 | chPrev = ch | ||
| 1585 | inRange = true | ||
| 1586 | p.moveRight(1) | ||
| 1587 | } else if p.charsRight() >= 1 && ch == '-' && !fTranslatedChar && p.rightChar(0) == '[' && !firstChar { | ||
| 1588 | // we aren't in a range, and now there is a subtraction. Usually this happens | ||
| 1589 | // only when a subtraction follows a range, like [a-z-[b]] | ||
| 1590 | if !scanOnly { | ||
| 1591 | p.moveRight(1) | ||
| 1592 | sub, err := p.scanCharSet(caseInsensitive, false) | ||
| 1593 | if err != nil { | ||
| 1594 | return nil, err | ||
| 1595 | } | ||
| 1596 | cc.addSubtraction(sub) | ||
| 1597 | |||
| 1598 | if p.charsRight() > 0 && p.rightChar(0) != ']' { | ||
| 1599 | return nil, p.getErr(ErrSubtractionMustBeLast) | ||
| 1600 | } | ||
| 1601 | } else { | ||
| 1602 | p.moveRight(1) | ||
| 1603 | p.scanCharSet(caseInsensitive, true) | ||
| 1604 | } | ||
| 1605 | } else { | ||
| 1606 | if !scanOnly { | ||
| 1607 | cc.addRange(ch, ch) | ||
| 1608 | } | ||
| 1609 | } | ||
| 1610 | } | ||
| 1611 | |||
| 1612 | if !closed { | ||
| 1613 | return nil, p.getErr(ErrUnterminatedBracket) | ||
| 1614 | } | ||
| 1615 | |||
| 1616 | if !scanOnly && caseInsensitive { | ||
| 1617 | cc.addLowercase() | ||
| 1618 | } | ||
| 1619 | |||
| 1620 | return cc, nil | ||
| 1621 | } | ||
| 1622 | |||
| 1623 | // Scans any number of decimal digits (pegs value at 2^31-1 if too large) | ||
| 1624 | func (p *parser) scanDecimal() (int, error) { | ||
| 1625 | i := 0 | ||
| 1626 | var d int | ||
| 1627 | |||
| 1628 | for p.charsRight() > 0 { | ||
| 1629 | d = int(p.rightChar(0) - '0') | ||
| 1630 | if d < 0 || d > 9 { | ||
| 1631 | break | ||
| 1632 | } | ||
| 1633 | p.moveRight(1) | ||
| 1634 | |||
| 1635 | if i > maxValueDiv10 || (i == maxValueDiv10 && d > maxValueMod10) { | ||
| 1636 | return 0, p.getErr(ErrCaptureGroupOutOfRange) | ||
| 1637 | } | ||
| 1638 | |||
| 1639 | i *= 10 | ||
| 1640 | i += d | ||
| 1641 | } | ||
| 1642 | |||
| 1643 | return int(i), nil | ||
| 1644 | } | ||
| 1645 | |||
| 1646 | // Returns true for options allowed only at the top level | ||
| 1647 | func isOnlyTopOption(option RegexOptions) bool { | ||
| 1648 | return option == RightToLeft || option == ECMAScript || option == RE2 | ||
| 1649 | } | ||
| 1650 | |||
| 1651 | // Scans cimsx-cimsx option string, stops at the first unrecognized char. | ||
| 1652 | func (p *parser) scanOptions() { | ||
| 1653 | |||
| 1654 | for off := false; p.charsRight() > 0; p.moveRight(1) { | ||
| 1655 | ch := p.rightChar(0) | ||
| 1656 | |||
| 1657 | if ch == '-' { | ||
| 1658 | off = true | ||
| 1659 | } else if ch == '+' { | ||
| 1660 | off = false | ||
| 1661 | } else { | ||
| 1662 | option := optionFromCode(ch) | ||
| 1663 | if option == 0 || isOnlyTopOption(option) { | ||
| 1664 | return | ||
| 1665 | } | ||
| 1666 | |||
| 1667 | if off { | ||
| 1668 | p.options &= ^option | ||
| 1669 | } else { | ||
| 1670 | p.options |= option | ||
| 1671 | } | ||
| 1672 | } | ||
| 1673 | } | ||
| 1674 | } | ||
| 1675 | |||
| 1676 | // Scans \ code for escape codes that map to single unicode chars. | ||
| 1677 | func (p *parser) scanCharEscape() (r rune, err error) { | ||
| 1678 | |||
| 1679 | ch := p.moveRightGetChar() | ||
| 1680 | |||
| 1681 | if ch >= '0' && ch <= '7' { | ||
| 1682 | p.moveLeft() | ||
| 1683 | return p.scanOctal(), nil | ||
| 1684 | } | ||
| 1685 | |||
| 1686 | pos := p.textpos() | ||
| 1687 | |||
| 1688 | switch ch { | ||
| 1689 | case 'x': | ||
| 1690 | // support for \x{HEX} syntax from Perl and PCRE | ||
| 1691 | if p.charsRight() > 0 && p.rightChar(0) == '{' { | ||
| 1692 | if p.useOptionE() { | ||
| 1693 | return ch, nil | ||
| 1694 | } | ||
| 1695 | p.moveRight(1) | ||
| 1696 | return p.scanHexUntilBrace() | ||
| 1697 | } else { | ||
| 1698 | r, err = p.scanHex(2) | ||
| 1699 | } | ||
| 1700 | case 'u': | ||
| 1701 | // ECMAscript suppot \u{HEX} only if `u` is also set | ||
| 1702 | if p.useOptionE() && p.useOptionU() && p.charsRight() > 0 && p.rightChar(0) == '{' { | ||
| 1703 | p.moveRight(1) | ||
| 1704 | return p.scanHexUntilBrace() | ||
| 1705 | } else { | ||
| 1706 | r, err = p.scanHex(4) | ||
| 1707 | } | ||
| 1708 | case 'a': | ||
| 1709 | return '\u0007', nil | ||
| 1710 | case 'b': | ||
| 1711 | return '\b', nil | ||
| 1712 | case 'e': | ||
| 1713 | return '\u001B', nil | ||
| 1714 | case 'f': | ||
| 1715 | return '\f', nil | ||
| 1716 | case 'n': | ||
| 1717 | return '\n', nil | ||
| 1718 | case 'r': | ||
| 1719 | return '\r', nil | ||
| 1720 | case 't': | ||
| 1721 | return '\t', nil | ||
| 1722 | case 'v': | ||
| 1723 | return '\u000B', nil | ||
| 1724 | case 'c': | ||
| 1725 | r, err = p.scanControl() | ||
| 1726 | default: | ||
| 1727 | if !p.useOptionE() && !p.useRE2() && IsWordChar(ch) { | ||
| 1728 | return 0, p.getErr(ErrUnrecognizedEscape, string(ch)) | ||
| 1729 | } | ||
| 1730 | return ch, nil | ||
| 1731 | } | ||
| 1732 | if err != nil && p.useOptionE() { | ||
| 1733 | p.textto(pos) | ||
| 1734 | return ch, nil | ||
| 1735 | } | ||
| 1736 | return | ||
| 1737 | } | ||
| 1738 | |||
| 1739 | // Grabs and converts an ascii control character | ||
| 1740 | func (p *parser) scanControl() (rune, error) { | ||
| 1741 | if p.charsRight() <= 0 { | ||
| 1742 | return 0, p.getErr(ErrMissingControl) | ||
| 1743 | } | ||
| 1744 | |||
| 1745 | ch := p.moveRightGetChar() | ||
| 1746 | |||
| 1747 | // \ca interpreted as \cA | ||
| 1748 | |||
| 1749 | if ch >= 'a' && ch <= 'z' { | ||
| 1750 | ch = (ch - ('a' - 'A')) | ||
| 1751 | } | ||
| 1752 | ch = (ch - '@') | ||
| 1753 | if ch >= 0 && ch < ' ' { | ||
| 1754 | return ch, nil | ||
| 1755 | } | ||
| 1756 | |||
| 1757 | return 0, p.getErr(ErrUnrecognizedControl) | ||
| 1758 | |||
| 1759 | } | ||
| 1760 | |||
| 1761 | // Scan hex digits until we hit a closing brace. | ||
| 1762 | // Non-hex digits, hex value too large for UTF-8, or running out of chars are errors | ||
| 1763 | func (p *parser) scanHexUntilBrace() (rune, error) { | ||
| 1764 | // PCRE spec reads like unlimited hex digits are allowed, but unicode has a limit | ||
| 1765 | // so we can enforce that | ||
| 1766 | i := 0 | ||
| 1767 | hasContent := false | ||
| 1768 | |||
| 1769 | for p.charsRight() > 0 { | ||
| 1770 | ch := p.moveRightGetChar() | ||
| 1771 | if ch == '}' { | ||
| 1772 | // hit our close brace, we're done here | ||
| 1773 | // prevent \x{} | ||
| 1774 | if !hasContent { | ||
| 1775 | return 0, p.getErr(ErrTooFewHex) | ||
| 1776 | } | ||
| 1777 | return rune(i), nil | ||
| 1778 | } | ||
| 1779 | hasContent = true | ||
| 1780 | // no brace needs to be hex digit | ||
| 1781 | d := hexDigit(ch) | ||
| 1782 | if d < 0 { | ||
| 1783 | return 0, p.getErr(ErrMissingBrace) | ||
| 1784 | } | ||
| 1785 | |||
| 1786 | i *= 0x10 | ||
| 1787 | i += d | ||
| 1788 | |||
| 1789 | if i > unicode.MaxRune { | ||
| 1790 | return 0, p.getErr(ErrInvalidHex) | ||
| 1791 | } | ||
| 1792 | } | ||
| 1793 | |||
| 1794 | // we only make it here if we run out of digits without finding the brace | ||
| 1795 | return 0, p.getErr(ErrMissingBrace) | ||
| 1796 | } | ||
| 1797 | |||
| 1798 | // Scans exactly c hex digits (c=2 for \xFF, c=4 for \uFFFF) | ||
| 1799 | func (p *parser) scanHex(c int) (rune, error) { | ||
| 1800 | |||
| 1801 | i := 0 | ||
| 1802 | |||
| 1803 | if p.charsRight() >= c { | ||
| 1804 | for c > 0 { | ||
| 1805 | d := hexDigit(p.moveRightGetChar()) | ||
| 1806 | if d < 0 { | ||
| 1807 | break | ||
| 1808 | } | ||
| 1809 | i *= 0x10 | ||
| 1810 | i += d | ||
| 1811 | c-- | ||
| 1812 | } | ||
| 1813 | } | ||
| 1814 | |||
| 1815 | if c > 0 { | ||
| 1816 | return 0, p.getErr(ErrTooFewHex) | ||
| 1817 | } | ||
| 1818 | |||
| 1819 | return rune(i), nil | ||
| 1820 | } | ||
| 1821 | |||
| 1822 | // Returns n <= 0xF for a hex digit. | ||
| 1823 | func hexDigit(ch rune) int { | ||
| 1824 | |||
| 1825 | if d := uint(ch - '0'); d <= 9 { | ||
| 1826 | return int(d) | ||
| 1827 | } | ||
| 1828 | |||
| 1829 | if d := uint(ch - 'a'); d <= 5 { | ||
| 1830 | return int(d + 0xa) | ||
| 1831 | } | ||
| 1832 | |||
| 1833 | if d := uint(ch - 'A'); d <= 5 { | ||
| 1834 | return int(d + 0xa) | ||
| 1835 | } | ||
| 1836 | |||
| 1837 | return -1 | ||
| 1838 | } | ||
| 1839 | |||
| 1840 | // Scans up to three octal digits (stops before exceeding 0377). | ||
| 1841 | func (p *parser) scanOctal() rune { | ||
| 1842 | // Consume octal chars only up to 3 digits and value 0377 | ||
| 1843 | |||
| 1844 | c := 3 | ||
| 1845 | |||
| 1846 | if c > p.charsRight() { | ||
| 1847 | c = p.charsRight() | ||
| 1848 | } | ||
| 1849 | |||
| 1850 | //we know the first char is good because the caller had to check | ||
| 1851 | i := 0 | ||
| 1852 | d := int(p.rightChar(0) - '0') | ||
| 1853 | for c > 0 && d <= 7 && d >= 0 { | ||
| 1854 | if i >= 0x20 && p.useOptionE() { | ||
| 1855 | break | ||
| 1856 | } | ||
| 1857 | i *= 8 | ||
| 1858 | i += d | ||
| 1859 | c-- | ||
| 1860 | |||
| 1861 | p.moveRight(1) | ||
| 1862 | if !p.rightMost() { | ||
| 1863 | d = int(p.rightChar(0) - '0') | ||
| 1864 | } | ||
| 1865 | } | ||
| 1866 | |||
| 1867 | // Octal codes only go up to 255. Any larger and the behavior that Perl follows | ||
| 1868 | // is simply to truncate the high bits. | ||
| 1869 | i &= 0xFF | ||
| 1870 | |||
| 1871 | return rune(i) | ||
| 1872 | } | ||
| 1873 | |||
| 1874 | // Returns the current parsing position. | ||
| 1875 | func (p *parser) textpos() int { | ||
| 1876 | return p.currentPos | ||
| 1877 | } | ||
| 1878 | |||
| 1879 | // Zaps to a specific parsing position. | ||
| 1880 | func (p *parser) textto(pos int) { | ||
| 1881 | p.currentPos = pos | ||
| 1882 | } | ||
| 1883 | |||
| 1884 | // Returns the char at the right of the current parsing position and advances to the right. | ||
| 1885 | func (p *parser) moveRightGetChar() rune { | ||
| 1886 | ch := p.pattern[p.currentPos] | ||
| 1887 | p.currentPos++ | ||
| 1888 | return ch | ||
| 1889 | } | ||
| 1890 | |||
| 1891 | // Moves the current position to the right. | ||
| 1892 | func (p *parser) moveRight(i int) { | ||
| 1893 | // default would be 1 | ||
| 1894 | p.currentPos += i | ||
| 1895 | } | ||
| 1896 | |||
| 1897 | // Moves the current parsing position one to the left. | ||
| 1898 | func (p *parser) moveLeft() { | ||
| 1899 | p.currentPos-- | ||
| 1900 | } | ||
| 1901 | |||
| 1902 | // Returns the char left of the current parsing position. | ||
| 1903 | func (p *parser) charAt(i int) rune { | ||
| 1904 | return p.pattern[i] | ||
| 1905 | } | ||
| 1906 | |||
| 1907 | // Returns the char i chars right of the current parsing position. | ||
| 1908 | func (p *parser) rightChar(i int) rune { | ||
| 1909 | // default would be 0 | ||
| 1910 | return p.pattern[p.currentPos+i] | ||
| 1911 | } | ||
| 1912 | |||
| 1913 | // Number of characters to the right of the current parsing position. | ||
| 1914 | func (p *parser) charsRight() int { | ||
| 1915 | return len(p.pattern) - p.currentPos | ||
| 1916 | } | ||
| 1917 | |||
| 1918 | func (p *parser) rightMost() bool { | ||
| 1919 | return p.currentPos == len(p.pattern) | ||
| 1920 | } | ||
| 1921 | |||
| 1922 | // Looks up the slot number for a given name | ||
| 1923 | func (p *parser) captureSlotFromName(capname string) int { | ||
| 1924 | return p.capnames[capname] | ||
| 1925 | } | ||
| 1926 | |||
| 1927 | // True if the capture slot was noted | ||
| 1928 | func (p *parser) isCaptureSlot(i int) bool { | ||
| 1929 | if p.caps != nil { | ||
| 1930 | _, ok := p.caps[i] | ||
| 1931 | return ok | ||
| 1932 | } | ||
| 1933 | |||
| 1934 | return (i >= 0 && i < p.capsize) | ||
| 1935 | } | ||
| 1936 | |||
| 1937 | // Looks up the slot number for a given name | ||
| 1938 | func (p *parser) isCaptureName(capname string) bool { | ||
| 1939 | if p.capnames == nil { | ||
| 1940 | return false | ||
| 1941 | } | ||
| 1942 | |||
| 1943 | _, ok := p.capnames[capname] | ||
| 1944 | return ok | ||
| 1945 | } | ||
| 1946 | |||
| 1947 | // option shortcuts | ||
| 1948 | |||
| 1949 | // True if N option disabling '(' autocapture is on. | ||
| 1950 | func (p *parser) useOptionN() bool { | ||
| 1951 | return (p.options & ExplicitCapture) != 0 | ||
| 1952 | } | ||
| 1953 | |||
| 1954 | // True if I option enabling case-insensitivity is on. | ||
| 1955 | func (p *parser) useOptionI() bool { | ||
| 1956 | return (p.options & IgnoreCase) != 0 | ||
| 1957 | } | ||
| 1958 | |||
| 1959 | // True if M option altering meaning of $ and ^ is on. | ||
| 1960 | func (p *parser) useOptionM() bool { | ||
| 1961 | return (p.options & Multiline) != 0 | ||
| 1962 | } | ||
| 1963 | |||
| 1964 | // True if S option altering meaning of . is on. | ||
| 1965 | func (p *parser) useOptionS() bool { | ||
| 1966 | return (p.options & Singleline) != 0 | ||
| 1967 | } | ||
| 1968 | |||
| 1969 | // True if X option enabling whitespace/comment mode is on. | ||
| 1970 | func (p *parser) useOptionX() bool { | ||
| 1971 | return (p.options & IgnorePatternWhitespace) != 0 | ||
| 1972 | } | ||
| 1973 | |||
| 1974 | // True if E option enabling ECMAScript behavior on. | ||
| 1975 | func (p *parser) useOptionE() bool { | ||
| 1976 | return (p.options & ECMAScript) != 0 | ||
| 1977 | } | ||
| 1978 | |||
| 1979 | // true to use RE2 compatibility parsing behavior. | ||
| 1980 | func (p *parser) useRE2() bool { | ||
| 1981 | return (p.options & RE2) != 0 | ||
| 1982 | } | ||
| 1983 | |||
| 1984 | // True if U option enabling ECMAScript's Unicode behavior on. | ||
| 1985 | func (p *parser) useOptionU() bool { | ||
| 1986 | return (p.options & Unicode) != 0 | ||
| 1987 | } | ||
| 1988 | |||
| 1989 | // True if options stack is empty. | ||
| 1990 | func (p *parser) emptyOptionsStack() bool { | ||
| 1991 | return len(p.optionsStack) == 0 | ||
| 1992 | } | ||
| 1993 | |||
| 1994 | // Finish the current quantifiable (when a quantifier is not found or is not possible) | ||
| 1995 | func (p *parser) addConcatenate() { | ||
| 1996 | // The first (| inside a Testgroup group goes directly to the group | ||
| 1997 | p.concatenation.addChild(p.unit) | ||
| 1998 | p.unit = nil | ||
| 1999 | } | ||
| 2000 | |||
| 2001 | // Finish the current quantifiable (when a quantifier is found) | ||
| 2002 | func (p *parser) addConcatenate3(lazy bool, min, max int) { | ||
| 2003 | p.concatenation.addChild(p.unit.makeQuantifier(lazy, min, max)) | ||
| 2004 | p.unit = nil | ||
| 2005 | } | ||
| 2006 | |||
| 2007 | // Sets the current unit to a single char node | ||
| 2008 | func (p *parser) addUnitOne(ch rune) { | ||
| 2009 | if p.useOptionI() { | ||
| 2010 | ch = unicode.ToLower(ch) | ||
| 2011 | } | ||
| 2012 | |||
| 2013 | p.unit = newRegexNodeCh(ntOne, p.options, ch) | ||
| 2014 | } | ||
| 2015 | |||
| 2016 | // Sets the current unit to a single inverse-char node | ||
| 2017 | func (p *parser) addUnitNotone(ch rune) { | ||
| 2018 | if p.useOptionI() { | ||
| 2019 | ch = unicode.ToLower(ch) | ||
| 2020 | } | ||
| 2021 | |||
| 2022 | p.unit = newRegexNodeCh(ntNotone, p.options, ch) | ||
| 2023 | } | ||
| 2024 | |||
| 2025 | // Sets the current unit to a single set node | ||
| 2026 | func (p *parser) addUnitSet(set *CharSet) { | ||
| 2027 | p.unit = newRegexNodeSet(ntSet, p.options, set) | ||
| 2028 | } | ||
| 2029 | |||
| 2030 | // Sets the current unit to a subtree | ||
| 2031 | func (p *parser) addUnitNode(node *regexNode) { | ||
| 2032 | p.unit = node | ||
| 2033 | } | ||
| 2034 | |||
| 2035 | // Sets the current unit to an assertion of the specified type | ||
| 2036 | func (p *parser) addUnitType(t nodeType) { | ||
| 2037 | p.unit = newRegexNode(t, p.options) | ||
| 2038 | } | ||
| 2039 | |||
| 2040 | // Finish the current group (in response to a ')' or end) | ||
| 2041 | func (p *parser) addGroup() error { | ||
| 2042 | if p.group.t == ntTestgroup || p.group.t == ntTestref { | ||
| 2043 | p.group.addChild(p.concatenation.reverseLeft()) | ||
| 2044 | if (p.group.t == ntTestref && len(p.group.children) > 2) || len(p.group.children) > 3 { | ||
| 2045 | return p.getErr(ErrTooManyAlternates) | ||
| 2046 | } | ||
| 2047 | } else { | ||
| 2048 | p.alternation.addChild(p.concatenation.reverseLeft()) | ||
| 2049 | p.group.addChild(p.alternation) | ||
| 2050 | } | ||
| 2051 | |||
| 2052 | p.unit = p.group | ||
| 2053 | return nil | ||
| 2054 | } | ||
| 2055 | |||
| 2056 | // Pops the option stack, but keeps the current options unchanged. | ||
| 2057 | func (p *parser) popKeepOptions() { | ||
| 2058 | lastIdx := len(p.optionsStack) - 1 | ||
| 2059 | p.optionsStack = p.optionsStack[:lastIdx] | ||
| 2060 | } | ||
| 2061 | |||
| 2062 | // Recalls options from the stack. | ||
| 2063 | func (p *parser) popOptions() { | ||
| 2064 | lastIdx := len(p.optionsStack) - 1 | ||
| 2065 | // get the last item on the stack and then remove it by reslicing | ||
| 2066 | p.options = p.optionsStack[lastIdx] | ||
| 2067 | p.optionsStack = p.optionsStack[:lastIdx] | ||
| 2068 | } | ||
| 2069 | |||
| 2070 | // Saves options on a stack. | ||
| 2071 | func (p *parser) pushOptions() { | ||
| 2072 | p.optionsStack = append(p.optionsStack, p.options) | ||
| 2073 | } | ||
| 2074 | |||
| 2075 | // Add a string to the last concatenate. | ||
| 2076 | func (p *parser) addToConcatenate(pos, cch int, isReplacement bool) { | ||
| 2077 | var node *regexNode | ||
| 2078 | |||
| 2079 | if cch == 0 { | ||
| 2080 | return | ||
| 2081 | } | ||
| 2082 | |||
| 2083 | if cch > 1 { | ||
| 2084 | str := make([]rune, cch) | ||
| 2085 | copy(str, p.pattern[pos:pos+cch]) | ||
| 2086 | |||
| 2087 | if p.useOptionI() && !isReplacement { | ||
| 2088 | // We do the ToLower character by character for consistency. With surrogate chars, doing | ||
| 2089 | // a ToLower on the entire string could actually change the surrogate pair. This is more correct | ||
| 2090 | // linguistically, but since Regex doesn't support surrogates, it's more important to be | ||
| 2091 | // consistent. | ||
| 2092 | for i := 0; i < len(str); i++ { | ||
| 2093 | str[i] = unicode.ToLower(str[i]) | ||
| 2094 | } | ||
| 2095 | } | ||
| 2096 | |||
| 2097 | node = newRegexNodeStr(ntMulti, p.options, str) | ||
| 2098 | } else { | ||
| 2099 | ch := p.charAt(pos) | ||
| 2100 | |||
| 2101 | if p.useOptionI() && !isReplacement { | ||
| 2102 | ch = unicode.ToLower(ch) | ||
| 2103 | } | ||
| 2104 | |||
| 2105 | node = newRegexNodeCh(ntOne, p.options, ch) | ||
| 2106 | } | ||
| 2107 | |||
| 2108 | p.concatenation.addChild(node) | ||
| 2109 | } | ||
| 2110 | |||
| 2111 | // Push the parser state (in response to an open paren) | ||
| 2112 | func (p *parser) pushGroup() { | ||
| 2113 | p.group.next = p.stack | ||
| 2114 | p.alternation.next = p.group | ||
| 2115 | p.concatenation.next = p.alternation | ||
| 2116 | p.stack = p.concatenation | ||
| 2117 | } | ||
| 2118 | |||
| 2119 | // Remember the pushed state (in response to a ')') | ||
| 2120 | func (p *parser) popGroup() error { | ||
| 2121 | p.concatenation = p.stack | ||
| 2122 | p.alternation = p.concatenation.next | ||
| 2123 | p.group = p.alternation.next | ||
| 2124 | p.stack = p.group.next | ||
| 2125 | |||
| 2126 | // The first () inside a Testgroup group goes directly to the group | ||
| 2127 | if p.group.t == ntTestgroup && len(p.group.children) == 0 { | ||
| 2128 | if p.unit == nil { | ||
| 2129 | return p.getErr(ErrConditionalExpression) | ||
| 2130 | } | ||
| 2131 | |||
| 2132 | p.group.addChild(p.unit) | ||
| 2133 | p.unit = nil | ||
| 2134 | } | ||
| 2135 | return nil | ||
| 2136 | } | ||
| 2137 | |||
| 2138 | // True if the group stack is empty. | ||
| 2139 | func (p *parser) emptyStack() bool { | ||
| 2140 | return p.stack == nil | ||
| 2141 | } | ||
| 2142 | |||
| 2143 | // Start a new round for the parser state (in response to an open paren or string start) | ||
| 2144 | func (p *parser) startGroup(openGroup *regexNode) { | ||
| 2145 | p.group = openGroup | ||
| 2146 | p.alternation = newRegexNode(ntAlternate, p.options) | ||
| 2147 | p.concatenation = newRegexNode(ntConcatenate, p.options) | ||
| 2148 | } | ||
| 2149 | |||
| 2150 | // Finish the current concatenation (in response to a |) | ||
| 2151 | func (p *parser) addAlternate() { | ||
| 2152 | // The | parts inside a Testgroup group go directly to the group | ||
| 2153 | |||
| 2154 | if p.group.t == ntTestgroup || p.group.t == ntTestref { | ||
| 2155 | p.group.addChild(p.concatenation.reverseLeft()) | ||
| 2156 | } else { | ||
| 2157 | p.alternation.addChild(p.concatenation.reverseLeft()) | ||
| 2158 | } | ||
| 2159 | |||
| 2160 | p.concatenation = newRegexNode(ntConcatenate, p.options) | ||
| 2161 | } | ||
| 2162 | |||
| 2163 | // For categorizing ascii characters. | ||
| 2164 | |||
| 2165 | const ( | ||
| 2166 | Q byte = 5 // quantifier | ||
| 2167 | S = 4 // ordinary stopper | ||
| 2168 | Z = 3 // ScanBlank stopper | ||
| 2169 | X = 2 // whitespace | ||
| 2170 | E = 1 // should be escaped | ||
| 2171 | ) | ||
| 2172 | |||
| 2173 | var _category = []byte{ | ||
| 2174 | //01 2 3 4 5 6 7 8 9 A B C D E F 0 1 2 3 4 5 6 7 8 9 A B C D E F | ||
| 2175 | 0, 0, 0, 0, 0, 0, 0, 0, 0, X, X, X, X, X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
| 2176 | // ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 5 6 7 8 9 : ; < = > ? | ||
| 2177 | X, 0, 0, Z, S, 0, 0, 0, S, S, Q, Q, 0, 0, S, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, Q, | ||
| 2178 | //@A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^ _ | ||
| 2179 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, S, S, 0, S, 0, | ||
| 2180 | //'a b c d e f g h i j k l m n o p q r s t u v w x y z { | } ~ | ||
| 2181 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, Q, S, 0, 0, 0, | ||
| 2182 | } | ||
| 2183 | |||
| 2184 | func isSpace(ch rune) bool { | ||
| 2185 | return (ch <= ' ' && _category[ch] == X) | ||
| 2186 | } | ||
| 2187 | |||
| 2188 | // Returns true for those characters that terminate a string of ordinary chars. | ||
| 2189 | func isSpecial(ch rune) bool { | ||
| 2190 | return (ch <= '|' && _category[ch] >= S) | ||
| 2191 | } | ||
| 2192 | |||
| 2193 | // Returns true for those characters that terminate a string of ordinary chars. | ||
| 2194 | func isStopperX(ch rune) bool { | ||
| 2195 | return (ch <= '|' && _category[ch] >= X) | ||
| 2196 | } | ||
| 2197 | |||
| 2198 | // Returns true for those characters that begin a quantifier. | ||
| 2199 | func isQuantifier(ch rune) bool { | ||
| 2200 | return (ch <= '{' && _category[ch] >= Q) | ||
| 2201 | } | ||
| 2202 | |||
| 2203 | func (p *parser) isTrueQuantifier() bool { | ||
| 2204 | nChars := p.charsRight() | ||
| 2205 | if nChars == 0 { | ||
| 2206 | return false | ||
| 2207 | } | ||
| 2208 | |||
| 2209 | startpos := p.textpos() | ||
| 2210 | ch := p.charAt(startpos) | ||
| 2211 | if ch != '{' { | ||
| 2212 | return ch <= '{' && _category[ch] >= Q | ||
| 2213 | } | ||
| 2214 | |||
| 2215 | //UGLY: this is ugly -- the original code was ugly too | ||
| 2216 | pos := startpos | ||
| 2217 | for { | ||
| 2218 | nChars-- | ||
| 2219 | if nChars <= 0 { | ||
| 2220 | break | ||
| 2221 | } | ||
| 2222 | pos++ | ||
| 2223 | ch = p.charAt(pos) | ||
| 2224 | if ch < '0' || ch > '9' { | ||
| 2225 | break | ||
| 2226 | } | ||
| 2227 | } | ||
| 2228 | |||
| 2229 | if nChars == 0 || pos-startpos == 1 { | ||
| 2230 | return false | ||
| 2231 | } | ||
| 2232 | if ch == '}' { | ||
| 2233 | return true | ||
| 2234 | } | ||
| 2235 | if ch != ',' { | ||
| 2236 | return false | ||
| 2237 | } | ||
| 2238 | for { | ||
| 2239 | nChars-- | ||
| 2240 | if nChars <= 0 { | ||
| 2241 | break | ||
| 2242 | } | ||
| 2243 | pos++ | ||
| 2244 | ch = p.charAt(pos) | ||
| 2245 | if ch < '0' || ch > '9' { | ||
| 2246 | break | ||
| 2247 | } | ||
| 2248 | } | ||
| 2249 | |||
| 2250 | return nChars > 0 && ch == '}' | ||
| 2251 | } | ||
diff --git a/vendor/github.com/dlclark/regexp2/syntax/prefix.go b/vendor/github.com/dlclark/regexp2/syntax/prefix.go new file mode 100644 index 0000000..f671688 --- /dev/null +++ b/vendor/github.com/dlclark/regexp2/syntax/prefix.go | |||
| @@ -0,0 +1,896 @@ | |||
| 1 | package syntax | ||
| 2 | |||
| 3 | import ( | ||
| 4 | "bytes" | ||
| 5 | "fmt" | ||
| 6 | "strconv" | ||
| 7 | "unicode" | ||
| 8 | "unicode/utf8" | ||
| 9 | ) | ||
| 10 | |||
| 11 | type Prefix struct { | ||
| 12 | PrefixStr []rune | ||
| 13 | PrefixSet CharSet | ||
| 14 | CaseInsensitive bool | ||
| 15 | } | ||
| 16 | |||
| 17 | // It takes a RegexTree and computes the set of chars that can start it. | ||
| 18 | func getFirstCharsPrefix(tree *RegexTree) *Prefix { | ||
| 19 | s := regexFcd{ | ||
| 20 | fcStack: make([]regexFc, 32), | ||
| 21 | intStack: make([]int, 32), | ||
| 22 | } | ||
| 23 | fc := s.regexFCFromRegexTree(tree) | ||
| 24 | |||
| 25 | if fc == nil || fc.nullable || fc.cc.IsEmpty() { | ||
| 26 | return nil | ||
| 27 | } | ||
| 28 | fcSet := fc.getFirstChars() | ||
| 29 | return &Prefix{PrefixSet: fcSet, CaseInsensitive: fc.caseInsensitive} | ||
| 30 | } | ||
| 31 | |||
| 32 | type regexFcd struct { | ||
| 33 | intStack []int | ||
| 34 | intDepth int | ||
| 35 | fcStack []regexFc | ||
| 36 | fcDepth int | ||
| 37 | skipAllChildren bool // don't process any more children at the current level | ||
| 38 | skipchild bool // don't process the current child. | ||
| 39 | failed bool | ||
| 40 | } | ||
| 41 | |||
| 42 | /* | ||
| 43 | * The main FC computation. It does a shortcutted depth-first walk | ||
| 44 | * through the tree and calls CalculateFC to emits code before | ||
| 45 | * and after each child of an interior node, and at each leaf. | ||
| 46 | */ | ||
| 47 | func (s *regexFcd) regexFCFromRegexTree(tree *RegexTree) *regexFc { | ||
| 48 | curNode := tree.root | ||
| 49 | curChild := 0 | ||
| 50 | |||
| 51 | for { | ||
| 52 | if len(curNode.children) == 0 { | ||
| 53 | // This is a leaf node | ||
| 54 | s.calculateFC(curNode.t, curNode, 0) | ||
| 55 | } else if curChild < len(curNode.children) && !s.skipAllChildren { | ||
| 56 | // This is an interior node, and we have more children to analyze | ||
| 57 | s.calculateFC(curNode.t|beforeChild, curNode, curChild) | ||
| 58 | |||
| 59 | if !s.skipchild { | ||
| 60 | curNode = curNode.children[curChild] | ||
| 61 | // this stack is how we get a depth first walk of the tree. | ||
| 62 | s.pushInt(curChild) | ||
| 63 | curChild = 0 | ||
| 64 | } else { | ||
| 65 | curChild++ | ||
| 66 | s.skipchild = false | ||
| 67 | } | ||
| 68 | continue | ||
| 69 | } | ||
| 70 | |||
| 71 | // This is an interior node where we've finished analyzing all the children, or | ||
| 72 | // the end of a leaf node. | ||
| 73 | s.skipAllChildren = false | ||
| 74 | |||
| 75 | if s.intIsEmpty() { | ||
| 76 | break | ||
| 77 | } | ||
| 78 | |||
| 79 | curChild = s.popInt() | ||
| 80 | curNode = curNode.next | ||
| 81 | |||
| 82 | s.calculateFC(curNode.t|afterChild, curNode, curChild) | ||
| 83 | if s.failed { | ||
| 84 | return nil | ||
| 85 | } | ||
| 86 | |||
| 87 | curChild++ | ||
| 88 | } | ||
| 89 | |||
| 90 | if s.fcIsEmpty() { | ||
| 91 | return nil | ||
| 92 | } | ||
| 93 | |||
| 94 | return s.popFC() | ||
| 95 | } | ||
| 96 | |||
| 97 | // To avoid recursion, we use a simple integer stack. | ||
| 98 | // This is the push. | ||
| 99 | func (s *regexFcd) pushInt(I int) { | ||
| 100 | if s.intDepth >= len(s.intStack) { | ||
| 101 | expanded := make([]int, s.intDepth*2) | ||
| 102 | copy(expanded, s.intStack) | ||
| 103 | s.intStack = expanded | ||
| 104 | } | ||
| 105 | |||
| 106 | s.intStack[s.intDepth] = I | ||
| 107 | s.intDepth++ | ||
| 108 | } | ||
| 109 | |||
| 110 | // True if the stack is empty. | ||
| 111 | func (s *regexFcd) intIsEmpty() bool { | ||
| 112 | return s.intDepth == 0 | ||
| 113 | } | ||
| 114 | |||
| 115 | // This is the pop. | ||
| 116 | func (s *regexFcd) popInt() int { | ||
| 117 | s.intDepth-- | ||
| 118 | return s.intStack[s.intDepth] | ||
| 119 | } | ||
| 120 | |||
| 121 | // We also use a stack of RegexFC objects. | ||
| 122 | // This is the push. | ||
| 123 | func (s *regexFcd) pushFC(fc regexFc) { | ||
| 124 | if s.fcDepth >= len(s.fcStack) { | ||
| 125 | expanded := make([]regexFc, s.fcDepth*2) | ||
| 126 | copy(expanded, s.fcStack) | ||
| 127 | s.fcStack = expanded | ||
| 128 | } | ||
| 129 | |||
| 130 | s.fcStack[s.fcDepth] = fc | ||
| 131 | s.fcDepth++ | ||
| 132 | } | ||
| 133 | |||
| 134 | // True if the stack is empty. | ||
| 135 | func (s *regexFcd) fcIsEmpty() bool { | ||
| 136 | return s.fcDepth == 0 | ||
| 137 | } | ||
| 138 | |||
| 139 | // This is the pop. | ||
| 140 | func (s *regexFcd) popFC() *regexFc { | ||
| 141 | s.fcDepth-- | ||
| 142 | return &s.fcStack[s.fcDepth] | ||
| 143 | } | ||
| 144 | |||
| 145 | // This is the top. | ||
| 146 | func (s *regexFcd) topFC() *regexFc { | ||
| 147 | return &s.fcStack[s.fcDepth-1] | ||
| 148 | } | ||
| 149 | |||
| 150 | // Called in Beforechild to prevent further processing of the current child | ||
| 151 | func (s *regexFcd) skipChild() { | ||
| 152 | s.skipchild = true | ||
| 153 | } | ||
| 154 | |||
| 155 | // FC computation and shortcut cases for each node type | ||
| 156 | func (s *regexFcd) calculateFC(nt nodeType, node *regexNode, CurIndex int) { | ||
| 157 | //fmt.Printf("NodeType: %v, CurIndex: %v, Desc: %v\n", nt, CurIndex, node.description()) | ||
| 158 | ci := false | ||
| 159 | rtl := false | ||
| 160 | |||
| 161 | if nt <= ntRef { | ||
| 162 | if (node.options & IgnoreCase) != 0 { | ||
| 163 | ci = true | ||
| 164 | } | ||
| 165 | if (node.options & RightToLeft) != 0 { | ||
| 166 | rtl = true | ||
| 167 | } | ||
| 168 | } | ||
| 169 | |||
| 170 | switch nt { | ||
| 171 | case ntConcatenate | beforeChild, ntAlternate | beforeChild, ntTestref | beforeChild, ntLoop | beforeChild, ntLazyloop | beforeChild: | ||
| 172 | break | ||
| 173 | |||
| 174 | case ntTestgroup | beforeChild: | ||
| 175 | if CurIndex == 0 { | ||
| 176 | s.skipChild() | ||
| 177 | } | ||
| 178 | break | ||
| 179 | |||
| 180 | case ntEmpty: | ||
| 181 | s.pushFC(regexFc{nullable: true}) | ||
| 182 | break | ||
| 183 | |||
| 184 | case ntConcatenate | afterChild: | ||
| 185 | if CurIndex != 0 { | ||
| 186 | child := s.popFC() | ||
| 187 | cumul := s.topFC() | ||
| 188 | |||
| 189 | s.failed = !cumul.addFC(*child, true) | ||
| 190 | } | ||
| 191 | |||
| 192 | fc := s.topFC() | ||
| 193 | if !fc.nullable { | ||
| 194 | s.skipAllChildren = true | ||
| 195 | } | ||
| 196 | break | ||
| 197 | |||
| 198 | case ntTestgroup | afterChild: | ||
| 199 | if CurIndex > 1 { | ||
| 200 | child := s.popFC() | ||
| 201 | cumul := s.topFC() | ||
| 202 | |||
| 203 | s.failed = !cumul.addFC(*child, false) | ||
| 204 | } | ||
| 205 | break | ||
| 206 | |||
| 207 | case ntAlternate | afterChild, ntTestref | afterChild: | ||
| 208 | if CurIndex != 0 { | ||
| 209 | child := s.popFC() | ||
| 210 | cumul := s.topFC() | ||
| 211 | |||
| 212 | s.failed = !cumul.addFC(*child, false) | ||
| 213 | } | ||
| 214 | break | ||
| 215 | |||
| 216 | case ntLoop | afterChild, ntLazyloop | afterChild: | ||
| 217 | if node.m == 0 { | ||
| 218 | fc := s.topFC() | ||
| 219 | fc.nullable = true | ||
| 220 | } | ||
| 221 | break | ||
| 222 | |||
| 223 | case ntGroup | beforeChild, ntGroup | afterChild, ntCapture | beforeChild, ntCapture | afterChild, ntGreedy | beforeChild, ntGreedy | afterChild: | ||
| 224 | break | ||
| 225 | |||
| 226 | case ntRequire | beforeChild, ntPrevent | beforeChild: | ||
| 227 | s.skipChild() | ||
| 228 | s.pushFC(regexFc{nullable: true}) | ||
| 229 | break | ||
| 230 | |||
| 231 | case ntRequire | afterChild, ntPrevent | afterChild: | ||
| 232 | break | ||
| 233 | |||
| 234 | case ntOne, ntNotone: | ||
| 235 | s.pushFC(newRegexFc(node.ch, nt == ntNotone, false, ci)) | ||
| 236 | break | ||
| 237 | |||
| 238 | case ntOneloop, ntOnelazy: | ||
| 239 | s.pushFC(newRegexFc(node.ch, false, node.m == 0, ci)) | ||
| 240 | break | ||
| 241 | |||
| 242 | case ntNotoneloop, ntNotonelazy: | ||
| 243 | s.pushFC(newRegexFc(node.ch, true, node.m == 0, ci)) | ||
| 244 | break | ||
| 245 | |||
| 246 | case ntMulti: | ||
| 247 | if len(node.str) == 0 { | ||
| 248 | s.pushFC(regexFc{nullable: true}) | ||
| 249 | } else if !rtl { | ||
| 250 | s.pushFC(newRegexFc(node.str[0], false, false, ci)) | ||
| 251 | } else { | ||
| 252 | s.pushFC(newRegexFc(node.str[len(node.str)-1], false, false, ci)) | ||
| 253 | } | ||
| 254 | break | ||
| 255 | |||
| 256 | case ntSet: | ||
| 257 | s.pushFC(regexFc{cc: node.set.Copy(), nullable: false, caseInsensitive: ci}) | ||
| 258 | break | ||
| 259 | |||
| 260 | case ntSetloop, ntSetlazy: | ||
| 261 | s.pushFC(regexFc{cc: node.set.Copy(), nullable: node.m == 0, caseInsensitive: ci}) | ||
| 262 | break | ||
| 263 | |||
| 264 | case ntRef: | ||
| 265 | s.pushFC(regexFc{cc: *AnyClass(), nullable: true, caseInsensitive: false}) | ||
| 266 | break | ||
| 267 | |||
| 268 | case ntNothing, ntBol, ntEol, ntBoundary, ntNonboundary, ntECMABoundary, ntNonECMABoundary, ntBeginning, ntStart, ntEndZ, ntEnd: | ||
| 269 | s.pushFC(regexFc{nullable: true}) | ||
| 270 | break | ||
| 271 | |||
| 272 | default: | ||
| 273 | panic(fmt.Sprintf("unexpected op code: %v", nt)) | ||
| 274 | } | ||
| 275 | } | ||
| 276 | |||
| 277 | type regexFc struct { | ||
| 278 | cc CharSet | ||
| 279 | nullable bool | ||
| 280 | caseInsensitive bool | ||
| 281 | } | ||
| 282 | |||
| 283 | func newRegexFc(ch rune, not, nullable, caseInsensitive bool) regexFc { | ||
| 284 | r := regexFc{ | ||
| 285 | caseInsensitive: caseInsensitive, | ||
| 286 | nullable: nullable, | ||
| 287 | } | ||
| 288 | if not { | ||
| 289 | if ch > 0 { | ||
| 290 | r.cc.addRange('\x00', ch-1) | ||
| 291 | } | ||
| 292 | if ch < 0xFFFF { | ||
| 293 | r.cc.addRange(ch+1, utf8.MaxRune) | ||
| 294 | } | ||
| 295 | } else { | ||
| 296 | r.cc.addRange(ch, ch) | ||
| 297 | } | ||
| 298 | return r | ||
| 299 | } | ||
| 300 | |||
| 301 | func (r *regexFc) getFirstChars() CharSet { | ||
| 302 | if r.caseInsensitive { | ||
| 303 | r.cc.addLowercase() | ||
| 304 | } | ||
| 305 | |||
| 306 | return r.cc | ||
| 307 | } | ||
| 308 | |||
| 309 | func (r *regexFc) addFC(fc regexFc, concatenate bool) bool { | ||
| 310 | if !r.cc.IsMergeable() || !fc.cc.IsMergeable() { | ||
| 311 | return false | ||
| 312 | } | ||
| 313 | |||
| 314 | if concatenate { | ||
| 315 | if !r.nullable { | ||
| 316 | return true | ||
| 317 | } | ||
| 318 | |||
| 319 | if !fc.nullable { | ||
| 320 | r.nullable = false | ||
| 321 | } | ||
| 322 | } else { | ||
| 323 | if fc.nullable { | ||
| 324 | r.nullable = true | ||
| 325 | } | ||
| 326 | } | ||
| 327 | |||
| 328 | r.caseInsensitive = r.caseInsensitive || fc.caseInsensitive | ||
| 329 | r.cc.addSet(fc.cc) | ||
| 330 | |||
| 331 | return true | ||
| 332 | } | ||
| 333 | |||
| 334 | // This is a related computation: it takes a RegexTree and computes the | ||
| 335 | // leading substring if it sees one. It's quite trivial and gives up easily. | ||
| 336 | func getPrefix(tree *RegexTree) *Prefix { | ||
| 337 | var concatNode *regexNode | ||
| 338 | nextChild := 0 | ||
| 339 | |||
| 340 | curNode := tree.root | ||
| 341 | |||
| 342 | for { | ||
| 343 | switch curNode.t { | ||
| 344 | case ntConcatenate: | ||
| 345 | if len(curNode.children) > 0 { | ||
| 346 | concatNode = curNode | ||
| 347 | nextChild = 0 | ||
| 348 | } | ||
| 349 | |||
| 350 | case ntGreedy, ntCapture: | ||
| 351 | curNode = curNode.children[0] | ||
| 352 | concatNode = nil | ||
| 353 | continue | ||
| 354 | |||
| 355 | case ntOneloop, ntOnelazy: | ||
| 356 | if curNode.m > 0 { | ||
| 357 | return &Prefix{ | ||
| 358 | PrefixStr: repeat(curNode.ch, curNode.m), | ||
| 359 | CaseInsensitive: (curNode.options & IgnoreCase) != 0, | ||
| 360 | } | ||
| 361 | } | ||
| 362 | return nil | ||
| 363 | |||
| 364 | case ntOne: | ||
| 365 | return &Prefix{ | ||
| 366 | PrefixStr: []rune{curNode.ch}, | ||
| 367 | CaseInsensitive: (curNode.options & IgnoreCase) != 0, | ||
| 368 | } | ||
| 369 | |||
| 370 | case ntMulti: | ||
| 371 | return &Prefix{ | ||
| 372 | PrefixStr: curNode.str, | ||
| 373 | CaseInsensitive: (curNode.options & IgnoreCase) != 0, | ||
| 374 | } | ||
| 375 | |||
| 376 | case ntBol, ntEol, ntBoundary, ntECMABoundary, ntBeginning, ntStart, | ||
| 377 | ntEndZ, ntEnd, ntEmpty, ntRequire, ntPrevent: | ||
| 378 | |||
| 379 | default: | ||
| 380 | return nil | ||
| 381 | } | ||
| 382 | |||
| 383 | if concatNode == nil || nextChild >= len(concatNode.children) { | ||
| 384 | return nil | ||
| 385 | } | ||
| 386 | |||
| 387 | curNode = concatNode.children[nextChild] | ||
| 388 | nextChild++ | ||
| 389 | } | ||
| 390 | } | ||
| 391 | |||
| 392 | // repeat the rune r, c times... up to the max of MaxPrefixSize | ||
| 393 | func repeat(r rune, c int) []rune { | ||
| 394 | if c > MaxPrefixSize { | ||
| 395 | c = MaxPrefixSize | ||
| 396 | } | ||
| 397 | |||
| 398 | ret := make([]rune, c) | ||
| 399 | |||
| 400 | // binary growth using copy for speed | ||
| 401 | ret[0] = r | ||
| 402 | bp := 1 | ||
| 403 | for bp < len(ret) { | ||
| 404 | copy(ret[bp:], ret[:bp]) | ||
| 405 | bp *= 2 | ||
| 406 | } | ||
| 407 | |||
| 408 | return ret | ||
| 409 | } | ||
| 410 | |||
| 411 | // BmPrefix precomputes the Boyer-Moore | ||
| 412 | // tables for fast string scanning. These tables allow | ||
| 413 | // you to scan for the first occurrence of a string within | ||
| 414 | // a large body of text without examining every character. | ||
| 415 | // The performance of the heuristic depends on the actual | ||
| 416 | // string and the text being searched, but usually, the longer | ||
| 417 | // the string that is being searched for, the fewer characters | ||
| 418 | // need to be examined. | ||
| 419 | type BmPrefix struct { | ||
| 420 | positive []int | ||
| 421 | negativeASCII []int | ||
| 422 | negativeUnicode [][]int | ||
| 423 | pattern []rune | ||
| 424 | lowASCII rune | ||
| 425 | highASCII rune | ||
| 426 | rightToLeft bool | ||
| 427 | caseInsensitive bool | ||
| 428 | } | ||
| 429 | |||
| 430 | func newBmPrefix(pattern []rune, caseInsensitive, rightToLeft bool) *BmPrefix { | ||
| 431 | |||
| 432 | b := &BmPrefix{ | ||
| 433 | rightToLeft: rightToLeft, | ||
| 434 | caseInsensitive: caseInsensitive, | ||
| 435 | pattern: pattern, | ||
| 436 | } | ||
| 437 | |||
| 438 | if caseInsensitive { | ||
| 439 | for i := 0; i < len(b.pattern); i++ { | ||
| 440 | // We do the ToLower character by character for consistency. With surrogate chars, doing | ||
| 441 | // a ToLower on the entire string could actually change the surrogate pair. This is more correct | ||
| 442 | // linguistically, but since Regex doesn't support surrogates, it's more important to be | ||
| 443 | // consistent. | ||
| 444 | |||
| 445 | b.pattern[i] = unicode.ToLower(b.pattern[i]) | ||
| 446 | } | ||
| 447 | } | ||
| 448 | |||
| 449 | var beforefirst, last, bump int | ||
| 450 | var scan, match int | ||
| 451 | |||
| 452 | if !rightToLeft { | ||
| 453 | beforefirst = -1 | ||
| 454 | last = len(b.pattern) - 1 | ||
| 455 | bump = 1 | ||
| 456 | } else { | ||
| 457 | beforefirst = len(b.pattern) | ||
| 458 | last = 0 | ||
| 459 | bump = -1 | ||
| 460 | } | ||
| 461 | |||
| 462 | // PART I - the good-suffix shift table | ||
| 463 | // | ||
| 464 | // compute the positive requirement: | ||
| 465 | // if char "i" is the first one from the right that doesn't match, | ||
| 466 | // then we know the matcher can advance by _positive[i]. | ||
| 467 | // | ||
| 468 | // This algorithm is a simplified variant of the standard | ||
| 469 | // Boyer-Moore good suffix calculation. | ||
| 470 | |||
| 471 | b.positive = make([]int, len(b.pattern)) | ||
| 472 | |||
| 473 | examine := last | ||
| 474 | ch := b.pattern[examine] | ||
| 475 | b.positive[examine] = bump | ||
| 476 | examine -= bump | ||
| 477 | |||
| 478 | Outerloop: | ||
| 479 | for { | ||
| 480 | // find an internal char (examine) that matches the tail | ||
| 481 | |||
| 482 | for { | ||
| 483 | if examine == beforefirst { | ||
| 484 | break Outerloop | ||
| 485 | } | ||
| 486 | if b.pattern[examine] == ch { | ||
| 487 | break | ||
| 488 | } | ||
| 489 | examine -= bump | ||
| 490 | } | ||
| 491 | |||
| 492 | match = last | ||
| 493 | scan = examine | ||
| 494 | |||
| 495 | // find the length of the match | ||
| 496 | for { | ||
| 497 | if scan == beforefirst || b.pattern[match] != b.pattern[scan] { | ||
| 498 | // at the end of the match, note the difference in _positive | ||
| 499 | // this is not the length of the match, but the distance from the internal match | ||
| 500 | // to the tail suffix. | ||
| 501 | if b.positive[match] == 0 { | ||
| 502 | b.positive[match] = match - scan | ||
| 503 | } | ||
| 504 | |||
| 505 | // System.Diagnostics.Debug.WriteLine("Set positive[" + match + "] to " + (match - scan)); | ||
| 506 | |||
| 507 | break | ||
| 508 | } | ||
| 509 | |||
| 510 | scan -= bump | ||
| 511 | match -= bump | ||
| 512 | } | ||
| 513 | |||
| 514 | examine -= bump | ||
| 515 | } | ||
| 516 | |||
| 517 | match = last - bump | ||
| 518 | |||
| 519 | // scan for the chars for which there are no shifts that yield a different candidate | ||
| 520 | |||
| 521 | // The inside of the if statement used to say | ||
| 522 | // "_positive[match] = last - beforefirst;" | ||
| 523 | // This is slightly less aggressive in how much we skip, but at worst it | ||
| 524 | // should mean a little more work rather than skipping a potential match. | ||
| 525 | for match != beforefirst { | ||
| 526 | if b.positive[match] == 0 { | ||
| 527 | b.positive[match] = bump | ||
| 528 | } | ||
| 529 | |||
| 530 | match -= bump | ||
| 531 | } | ||
| 532 | |||
| 533 | // PART II - the bad-character shift table | ||
| 534 | // | ||
| 535 | // compute the negative requirement: | ||
| 536 | // if char "ch" is the reject character when testing position "i", | ||
| 537 | // we can slide up by _negative[ch]; | ||
| 538 | // (_negative[ch] = str.Length - 1 - str.LastIndexOf(ch)) | ||
| 539 | // | ||
| 540 | // the lookup table is divided into ASCII and Unicode portions; | ||
| 541 | // only those parts of the Unicode 16-bit code set that actually | ||
| 542 | // appear in the string are in the table. (Maximum size with | ||
| 543 | // Unicode is 65K; ASCII only case is 512 bytes.) | ||
| 544 | |||
| 545 | b.negativeASCII = make([]int, 128) | ||
| 546 | |||
| 547 | for i := 0; i < len(b.negativeASCII); i++ { | ||
| 548 | b.negativeASCII[i] = last - beforefirst | ||
| 549 | } | ||
| 550 | |||
| 551 | b.lowASCII = 127 | ||
| 552 | b.highASCII = 0 | ||
| 553 | |||
| 554 | for examine = last; examine != beforefirst; examine -= bump { | ||
| 555 | ch = b.pattern[examine] | ||
| 556 | |||
| 557 | switch { | ||
| 558 | case ch < 128: | ||
| 559 | if b.lowASCII > ch { | ||
| 560 | b.lowASCII = ch | ||
| 561 | } | ||
| 562 | |||
| 563 | if b.highASCII < ch { | ||
| 564 | b.highASCII = ch | ||
| 565 | } | ||
| 566 | |||
| 567 | if b.negativeASCII[ch] == last-beforefirst { | ||
| 568 | b.negativeASCII[ch] = last - examine | ||
| 569 | } | ||
| 570 | case ch <= 0xffff: | ||
| 571 | i, j := ch>>8, ch&0xFF | ||
| 572 | |||
| 573 | if b.negativeUnicode == nil { | ||
| 574 | b.negativeUnicode = make([][]int, 256) | ||
| 575 | } | ||
| 576 | |||
| 577 | if b.negativeUnicode[i] == nil { | ||
| 578 | newarray := make([]int, 256) | ||
| 579 | |||
| 580 | for k := 0; k < len(newarray); k++ { | ||
| 581 | newarray[k] = last - beforefirst | ||
| 582 | } | ||
| 583 | |||
| 584 | if i == 0 { | ||
| 585 | copy(newarray, b.negativeASCII) | ||
| 586 | //TODO: this line needed? | ||
| 587 | b.negativeASCII = newarray | ||
| 588 | } | ||
| 589 | |||
| 590 | b.negativeUnicode[i] = newarray | ||
| 591 | } | ||
| 592 | |||
| 593 | if b.negativeUnicode[i][j] == last-beforefirst { | ||
| 594 | b.negativeUnicode[i][j] = last - examine | ||
| 595 | } | ||
| 596 | default: | ||
| 597 | // we can't do the filter because this algo doesn't support | ||
| 598 | // unicode chars >0xffff | ||
| 599 | return nil | ||
| 600 | } | ||
| 601 | } | ||
| 602 | |||
| 603 | return b | ||
| 604 | } | ||
| 605 | |||
| 606 | func (b *BmPrefix) String() string { | ||
| 607 | return string(b.pattern) | ||
| 608 | } | ||
| 609 | |||
| 610 | // Dump returns the contents of the filter as a human readable string | ||
| 611 | func (b *BmPrefix) Dump(indent string) string { | ||
| 612 | buf := &bytes.Buffer{} | ||
| 613 | |||
| 614 | fmt.Fprintf(buf, "%sBM Pattern: %s\n%sPositive: ", indent, string(b.pattern), indent) | ||
| 615 | for i := 0; i < len(b.positive); i++ { | ||
| 616 | buf.WriteString(strconv.Itoa(b.positive[i])) | ||
| 617 | buf.WriteRune(' ') | ||
| 618 | } | ||
| 619 | buf.WriteRune('\n') | ||
| 620 | |||
| 621 | if b.negativeASCII != nil { | ||
| 622 | buf.WriteString(indent) | ||
| 623 | buf.WriteString("Negative table\n") | ||
| 624 | for i := 0; i < len(b.negativeASCII); i++ { | ||
| 625 | if b.negativeASCII[i] != len(b.pattern) { | ||
| 626 | fmt.Fprintf(buf, "%s %s %s\n", indent, Escape(string(rune(i))), strconv.Itoa(b.negativeASCII[i])) | ||
| 627 | } | ||
| 628 | } | ||
| 629 | } | ||
| 630 | |||
| 631 | return buf.String() | ||
| 632 | } | ||
| 633 | |||
| 634 | // Scan uses the Boyer-Moore algorithm to find the first occurrence | ||
| 635 | // of the specified string within text, beginning at index, and | ||
| 636 | // constrained within beglimit and endlimit. | ||
| 637 | // | ||
| 638 | // The direction and case-sensitivity of the match is determined | ||
| 639 | // by the arguments to the RegexBoyerMoore constructor. | ||
| 640 | func (b *BmPrefix) Scan(text []rune, index, beglimit, endlimit int) int { | ||
| 641 | var ( | ||
| 642 | defadv, test, test2 int | ||
| 643 | match, startmatch, endmatch int | ||
| 644 | bump, advance int | ||
| 645 | chTest rune | ||
| 646 | unicodeLookup []int | ||
| 647 | ) | ||
| 648 | |||
| 649 | if !b.rightToLeft { | ||
| 650 | defadv = len(b.pattern) | ||
| 651 | startmatch = len(b.pattern) - 1 | ||
| 652 | endmatch = 0 | ||
| 653 | test = index + defadv - 1 | ||
| 654 | bump = 1 | ||
| 655 | } else { | ||
| 656 | defadv = -len(b.pattern) | ||
| 657 | startmatch = 0 | ||
| 658 | endmatch = -defadv - 1 | ||
| 659 | test = index + defadv | ||
| 660 | bump = -1 | ||
| 661 | } | ||
| 662 | |||
| 663 | chMatch := b.pattern[startmatch] | ||
| 664 | |||
| 665 | for { | ||
| 666 | if test >= endlimit || test < beglimit { | ||
| 667 | return -1 | ||
| 668 | } | ||
| 669 | |||
| 670 | chTest = text[test] | ||
| 671 | |||
| 672 | if b.caseInsensitive { | ||
| 673 | chTest = unicode.ToLower(chTest) | ||
| 674 | } | ||
| 675 | |||
| 676 | if chTest != chMatch { | ||
| 677 | if chTest < 128 { | ||
| 678 | advance = b.negativeASCII[chTest] | ||
| 679 | } else if chTest < 0xffff && len(b.negativeUnicode) > 0 { | ||
| 680 | unicodeLookup = b.negativeUnicode[chTest>>8] | ||
| 681 | if len(unicodeLookup) > 0 { | ||
| 682 | advance = unicodeLookup[chTest&0xFF] | ||
| 683 | } else { | ||
| 684 | advance = defadv | ||
| 685 | } | ||
| 686 | } else { | ||
| 687 | advance = defadv | ||
| 688 | } | ||
| 689 | |||
| 690 | test += advance | ||
| 691 | } else { // if (chTest == chMatch) | ||
| 692 | test2 = test | ||
| 693 | match = startmatch | ||
| 694 | |||
| 695 | for { | ||
| 696 | if match == endmatch { | ||
| 697 | if b.rightToLeft { | ||
| 698 | return test2 + 1 | ||
| 699 | } else { | ||
| 700 | return test2 | ||
| 701 | } | ||
| 702 | } | ||
| 703 | |||
| 704 | match -= bump | ||
| 705 | test2 -= bump | ||
| 706 | |||
| 707 | chTest = text[test2] | ||
| 708 | |||
| 709 | if b.caseInsensitive { | ||
| 710 | chTest = unicode.ToLower(chTest) | ||
| 711 | } | ||
| 712 | |||
| 713 | if chTest != b.pattern[match] { | ||
| 714 | advance = b.positive[match] | ||
| 715 | if chTest < 128 { | ||
| 716 | test2 = (match - startmatch) + b.negativeASCII[chTest] | ||
| 717 | } else if chTest < 0xffff && len(b.negativeUnicode) > 0 { | ||
| 718 | unicodeLookup = b.negativeUnicode[chTest>>8] | ||
| 719 | if len(unicodeLookup) > 0 { | ||
| 720 | test2 = (match - startmatch) + unicodeLookup[chTest&0xFF] | ||
| 721 | } else { | ||
| 722 | test += advance | ||
| 723 | break | ||
| 724 | } | ||
| 725 | } else { | ||
| 726 | test += advance | ||
| 727 | break | ||
| 728 | } | ||
| 729 | |||
| 730 | if b.rightToLeft { | ||
| 731 | if test2 < advance { | ||
| 732 | advance = test2 | ||
| 733 | } | ||
| 734 | } else if test2 > advance { | ||
| 735 | advance = test2 | ||
| 736 | } | ||
| 737 | |||
| 738 | test += advance | ||
| 739 | break | ||
| 740 | } | ||
| 741 | } | ||
| 742 | } | ||
| 743 | } | ||
| 744 | } | ||
| 745 | |||
| 746 | // When a regex is anchored, we can do a quick IsMatch test instead of a Scan | ||
| 747 | func (b *BmPrefix) IsMatch(text []rune, index, beglimit, endlimit int) bool { | ||
| 748 | if !b.rightToLeft { | ||
| 749 | if index < beglimit || endlimit-index < len(b.pattern) { | ||
| 750 | return false | ||
| 751 | } | ||
| 752 | |||
| 753 | return b.matchPattern(text, index) | ||
| 754 | } else { | ||
| 755 | if index > endlimit || index-beglimit < len(b.pattern) { | ||
| 756 | return false | ||
| 757 | } | ||
| 758 | |||
| 759 | return b.matchPattern(text, index-len(b.pattern)) | ||
| 760 | } | ||
| 761 | } | ||
| 762 | |||
| 763 | func (b *BmPrefix) matchPattern(text []rune, index int) bool { | ||
| 764 | if len(text)-index < len(b.pattern) { | ||
| 765 | return false | ||
| 766 | } | ||
| 767 | |||
| 768 | if b.caseInsensitive { | ||
| 769 | for i := 0; i < len(b.pattern); i++ { | ||
| 770 | //Debug.Assert(textinfo.ToLower(_pattern[i]) == _pattern[i], "pattern should be converted to lower case in constructor!"); | ||
| 771 | if unicode.ToLower(text[index+i]) != b.pattern[i] { | ||
| 772 | return false | ||
| 773 | } | ||
| 774 | } | ||
| 775 | return true | ||
| 776 | } else { | ||
| 777 | for i := 0; i < len(b.pattern); i++ { | ||
| 778 | if text[index+i] != b.pattern[i] { | ||
| 779 | return false | ||
| 780 | } | ||
| 781 | } | ||
| 782 | return true | ||
| 783 | } | ||
| 784 | } | ||
| 785 | |||
| 786 | type AnchorLoc int16 | ||
| 787 | |||
| 788 | // where the regex can be pegged | ||
| 789 | const ( | ||
| 790 | AnchorBeginning AnchorLoc = 0x0001 | ||
| 791 | AnchorBol = 0x0002 | ||
| 792 | AnchorStart = 0x0004 | ||
| 793 | AnchorEol = 0x0008 | ||
| 794 | AnchorEndZ = 0x0010 | ||
| 795 | AnchorEnd = 0x0020 | ||
| 796 | AnchorBoundary = 0x0040 | ||
| 797 | AnchorECMABoundary = 0x0080 | ||
| 798 | ) | ||
| 799 | |||
| 800 | func getAnchors(tree *RegexTree) AnchorLoc { | ||
| 801 | |||
| 802 | var concatNode *regexNode | ||
| 803 | nextChild, result := 0, AnchorLoc(0) | ||
| 804 | |||
| 805 | curNode := tree.root | ||
| 806 | |||
| 807 | for { | ||
| 808 | switch curNode.t { | ||
| 809 | case ntConcatenate: | ||
| 810 | if len(curNode.children) > 0 { | ||
| 811 | concatNode = curNode | ||
| 812 | nextChild = 0 | ||
| 813 | } | ||
| 814 | |||
| 815 | case ntGreedy, ntCapture: | ||
| 816 | curNode = curNode.children[0] | ||
| 817 | concatNode = nil | ||
| 818 | continue | ||
| 819 | |||
| 820 | case ntBol, ntEol, ntBoundary, ntECMABoundary, ntBeginning, | ||
| 821 | ntStart, ntEndZ, ntEnd: | ||
| 822 | return result | anchorFromType(curNode.t) | ||
| 823 | |||
| 824 | case ntEmpty, ntRequire, ntPrevent: | ||
| 825 | |||
| 826 | default: | ||
| 827 | return result | ||
| 828 | } | ||
| 829 | |||
| 830 | if concatNode == nil || nextChild >= len(concatNode.children) { | ||
| 831 | return result | ||
| 832 | } | ||
| 833 | |||
| 834 | curNode = concatNode.children[nextChild] | ||
| 835 | nextChild++ | ||
| 836 | } | ||
| 837 | } | ||
| 838 | |||
| 839 | func anchorFromType(t nodeType) AnchorLoc { | ||
| 840 | switch t { | ||
| 841 | case ntBol: | ||
| 842 | return AnchorBol | ||
| 843 | case ntEol: | ||
| 844 | return AnchorEol | ||
| 845 | case ntBoundary: | ||
| 846 | return AnchorBoundary | ||
| 847 | case ntECMABoundary: | ||
| 848 | return AnchorECMABoundary | ||
| 849 | case ntBeginning: | ||
| 850 | return AnchorBeginning | ||
| 851 | case ntStart: | ||
| 852 | return AnchorStart | ||
| 853 | case ntEndZ: | ||
| 854 | return AnchorEndZ | ||
| 855 | case ntEnd: | ||
| 856 | return AnchorEnd | ||
| 857 | default: | ||
| 858 | return 0 | ||
| 859 | } | ||
| 860 | } | ||
| 861 | |||
| 862 | // anchorDescription returns a human-readable description of the anchors | ||
| 863 | func (anchors AnchorLoc) String() string { | ||
| 864 | buf := &bytes.Buffer{} | ||
| 865 | |||
| 866 | if 0 != (anchors & AnchorBeginning) { | ||
| 867 | buf.WriteString(", Beginning") | ||
| 868 | } | ||
| 869 | if 0 != (anchors & AnchorStart) { | ||
| 870 | buf.WriteString(", Start") | ||
| 871 | } | ||
| 872 | if 0 != (anchors & AnchorBol) { | ||
| 873 | buf.WriteString(", Bol") | ||
| 874 | } | ||
| 875 | if 0 != (anchors & AnchorBoundary) { | ||
| 876 | buf.WriteString(", Boundary") | ||
| 877 | } | ||
| 878 | if 0 != (anchors & AnchorECMABoundary) { | ||
| 879 | buf.WriteString(", ECMABoundary") | ||
| 880 | } | ||
| 881 | if 0 != (anchors & AnchorEol) { | ||
| 882 | buf.WriteString(", Eol") | ||
| 883 | } | ||
| 884 | if 0 != (anchors & AnchorEnd) { | ||
| 885 | buf.WriteString(", End") | ||
| 886 | } | ||
| 887 | if 0 != (anchors & AnchorEndZ) { | ||
| 888 | buf.WriteString(", EndZ") | ||
| 889 | } | ||
| 890 | |||
| 891 | // trim off comma | ||
| 892 | if buf.Len() >= 2 { | ||
| 893 | return buf.String()[2:] | ||
| 894 | } | ||
| 895 | return "None" | ||
| 896 | } | ||
diff --git a/vendor/github.com/dlclark/regexp2/syntax/replacerdata.go b/vendor/github.com/dlclark/regexp2/syntax/replacerdata.go new file mode 100644 index 0000000..bcf4d3f --- /dev/null +++ b/vendor/github.com/dlclark/regexp2/syntax/replacerdata.go | |||
| @@ -0,0 +1,87 @@ | |||
| 1 | package syntax | ||
| 2 | |||
| 3 | import ( | ||
| 4 | "bytes" | ||
| 5 | "errors" | ||
| 6 | ) | ||
| 7 | |||
| 8 | type ReplacerData struct { | ||
| 9 | Rep string | ||
| 10 | Strings []string | ||
| 11 | Rules []int | ||
| 12 | } | ||
| 13 | |||
| 14 | const ( | ||
| 15 | replaceSpecials = 4 | ||
| 16 | replaceLeftPortion = -1 | ||
| 17 | replaceRightPortion = -2 | ||
| 18 | replaceLastGroup = -3 | ||
| 19 | replaceWholeString = -4 | ||
| 20 | ) | ||
| 21 | |||
| 22 | //ErrReplacementError is a general error during parsing the replacement text | ||
| 23 | var ErrReplacementError = errors.New("Replacement pattern error.") | ||
| 24 | |||
| 25 | // NewReplacerData will populate a reusable replacer data struct based on the given replacement string | ||
| 26 | // and the capture group data from a regexp | ||
| 27 | func NewReplacerData(rep string, caps map[int]int, capsize int, capnames map[string]int, op RegexOptions) (*ReplacerData, error) { | ||
| 28 | p := parser{ | ||
| 29 | options: op, | ||
| 30 | caps: caps, | ||
| 31 | capsize: capsize, | ||
| 32 | capnames: capnames, | ||
| 33 | } | ||
| 34 | p.setPattern(rep) | ||
| 35 | concat, err := p.scanReplacement() | ||
| 36 | if err != nil { | ||
| 37 | return nil, err | ||
| 38 | } | ||
| 39 | |||
| 40 | if concat.t != ntConcatenate { | ||
| 41 | panic(ErrReplacementError) | ||
| 42 | } | ||
| 43 | |||
| 44 | sb := &bytes.Buffer{} | ||
| 45 | var ( | ||
| 46 | strings []string | ||
| 47 | rules []int | ||
| 48 | ) | ||
| 49 | |||
| 50 | for _, child := range concat.children { | ||
| 51 | switch child.t { | ||
| 52 | case ntMulti: | ||
| 53 | child.writeStrToBuf(sb) | ||
| 54 | |||
| 55 | case ntOne: | ||
| 56 | sb.WriteRune(child.ch) | ||
| 57 | |||
| 58 | case ntRef: | ||
| 59 | if sb.Len() > 0 { | ||
| 60 | rules = append(rules, len(strings)) | ||
| 61 | strings = append(strings, sb.String()) | ||
| 62 | sb.Reset() | ||
| 63 | } | ||
| 64 | slot := child.m | ||
| 65 | |||
| 66 | if len(caps) > 0 && slot >= 0 { | ||
| 67 | slot = caps[slot] | ||
| 68 | } | ||
| 69 | |||
| 70 | rules = append(rules, -replaceSpecials-1-slot) | ||
| 71 | |||
| 72 | default: | ||
| 73 | panic(ErrReplacementError) | ||
| 74 | } | ||
| 75 | } | ||
| 76 | |||
| 77 | if sb.Len() > 0 { | ||
| 78 | rules = append(rules, len(strings)) | ||
| 79 | strings = append(strings, sb.String()) | ||
| 80 | } | ||
| 81 | |||
| 82 | return &ReplacerData{ | ||
| 83 | Rep: rep, | ||
| 84 | Strings: strings, | ||
| 85 | Rules: rules, | ||
| 86 | }, nil | ||
| 87 | } | ||
diff --git a/vendor/github.com/dlclark/regexp2/syntax/tree.go b/vendor/github.com/dlclark/regexp2/syntax/tree.go new file mode 100644 index 0000000..ea28829 --- /dev/null +++ b/vendor/github.com/dlclark/regexp2/syntax/tree.go | |||
| @@ -0,0 +1,654 @@ | |||
| 1 | package syntax | ||
| 2 | |||
| 3 | import ( | ||
| 4 | "bytes" | ||
| 5 | "fmt" | ||
| 6 | "math" | ||
| 7 | "strconv" | ||
| 8 | ) | ||
| 9 | |||
| 10 | type RegexTree struct { | ||
| 11 | root *regexNode | ||
| 12 | caps map[int]int | ||
| 13 | capnumlist []int | ||
| 14 | captop int | ||
| 15 | Capnames map[string]int | ||
| 16 | Caplist []string | ||
| 17 | options RegexOptions | ||
| 18 | } | ||
| 19 | |||
| 20 | // It is built into a parsed tree for a regular expression. | ||
| 21 | |||
| 22 | // Implementation notes: | ||
| 23 | // | ||
| 24 | // Since the node tree is a temporary data structure only used | ||
| 25 | // during compilation of the regexp to integer codes, it's | ||
| 26 | // designed for clarity and convenience rather than | ||
| 27 | // space efficiency. | ||
| 28 | // | ||
| 29 | // RegexNodes are built into a tree, linked by the n.children list. | ||
| 30 | // Each node also has a n.parent and n.ichild member indicating | ||
| 31 | // its parent and which child # it is in its parent's list. | ||
| 32 | // | ||
| 33 | // RegexNodes come in as many types as there are constructs in | ||
| 34 | // a regular expression, for example, "concatenate", "alternate", | ||
| 35 | // "one", "rept", "group". There are also node types for basic | ||
| 36 | // peephole optimizations, e.g., "onerep", "notsetrep", etc. | ||
| 37 | // | ||
| 38 | // Because perl 5 allows "lookback" groups that scan backwards, | ||
| 39 | // each node also gets a "direction". Normally the value of | ||
| 40 | // boolean n.backward = false. | ||
| 41 | // | ||
| 42 | // During parsing, top-level nodes are also stacked onto a parse | ||
| 43 | // stack (a stack of trees). For this purpose we have a n.next | ||
| 44 | // pointer. [Note that to save a few bytes, we could overload the | ||
| 45 | // n.parent pointer instead.] | ||
| 46 | // | ||
| 47 | // On the parse stack, each tree has a "role" - basically, the | ||
| 48 | // nonterminal in the grammar that the parser has currently | ||
| 49 | // assigned to the tree. That code is stored in n.role. | ||
| 50 | // | ||
| 51 | // Finally, some of the different kinds of nodes have data. | ||
| 52 | // Two integers (for the looping constructs) are stored in | ||
| 53 | // n.operands, an an object (either a string or a set) | ||
| 54 | // is stored in n.data | ||
| 55 | type regexNode struct { | ||
| 56 | t nodeType | ||
| 57 | children []*regexNode | ||
| 58 | str []rune | ||
| 59 | set *CharSet | ||
| 60 | ch rune | ||
| 61 | m int | ||
| 62 | n int | ||
| 63 | options RegexOptions | ||
| 64 | next *regexNode | ||
| 65 | } | ||
| 66 | |||
| 67 | type nodeType int32 | ||
| 68 | |||
| 69 | const ( | ||
| 70 | // The following are leaves, and correspond to primitive operations | ||
| 71 | |||
| 72 | ntOnerep nodeType = 0 // lef,back char,min,max a {n} | ||
| 73 | ntNotonerep = 1 // lef,back char,min,max .{n} | ||
| 74 | ntSetrep = 2 // lef,back set,min,max [\d]{n} | ||
| 75 | ntOneloop = 3 // lef,back char,min,max a {,n} | ||
| 76 | ntNotoneloop = 4 // lef,back char,min,max .{,n} | ||
| 77 | ntSetloop = 5 // lef,back set,min,max [\d]{,n} | ||
| 78 | ntOnelazy = 6 // lef,back char,min,max a {,n}? | ||
| 79 | ntNotonelazy = 7 // lef,back char,min,max .{,n}? | ||
| 80 | ntSetlazy = 8 // lef,back set,min,max [\d]{,n}? | ||
| 81 | ntOne = 9 // lef char a | ||
| 82 | ntNotone = 10 // lef char [^a] | ||
| 83 | ntSet = 11 // lef set [a-z\s] \w \s \d | ||
| 84 | ntMulti = 12 // lef string abcd | ||
| 85 | ntRef = 13 // lef group \# | ||
| 86 | ntBol = 14 // ^ | ||
| 87 | ntEol = 15 // $ | ||
| 88 | ntBoundary = 16 // \b | ||
| 89 | ntNonboundary = 17 // \B | ||
| 90 | ntBeginning = 18 // \A | ||
| 91 | ntStart = 19 // \G | ||
| 92 | ntEndZ = 20 // \Z | ||
| 93 | ntEnd = 21 // \Z | ||
| 94 | |||
| 95 | // Interior nodes do not correspond to primitive operations, but | ||
| 96 | // control structures compositing other operations | ||
| 97 | |||
| 98 | // Concat and alternate take n children, and can run forward or backwards | ||
| 99 | |||
| 100 | ntNothing = 22 // [] | ||
| 101 | ntEmpty = 23 // () | ||
| 102 | ntAlternate = 24 // a|b | ||
| 103 | ntConcatenate = 25 // ab | ||
| 104 | ntLoop = 26 // m,x * + ? {,} | ||
| 105 | ntLazyloop = 27 // m,x *? +? ?? {,}? | ||
| 106 | ntCapture = 28 // n () | ||
| 107 | ntGroup = 29 // (?:) | ||
| 108 | ntRequire = 30 // (?=) (?<=) | ||
| 109 | ntPrevent = 31 // (?!) (?<!) | ||
| 110 | ntGreedy = 32 // (?>) (?<) | ||
| 111 | ntTestref = 33 // (?(n) | ) | ||
| 112 | ntTestgroup = 34 // (?(...) | ) | ||
| 113 | |||
| 114 | ntECMABoundary = 41 // \b | ||
| 115 | ntNonECMABoundary = 42 // \B | ||
| 116 | ) | ||
| 117 | |||
| 118 | func newRegexNode(t nodeType, opt RegexOptions) *regexNode { | ||
| 119 | return ®exNode{ | ||
| 120 | t: t, | ||
| 121 | options: opt, | ||
| 122 | } | ||
| 123 | } | ||
| 124 | |||
| 125 | func newRegexNodeCh(t nodeType, opt RegexOptions, ch rune) *regexNode { | ||
| 126 | return ®exNode{ | ||
| 127 | t: t, | ||
| 128 | options: opt, | ||
| 129 | ch: ch, | ||
| 130 | } | ||
| 131 | } | ||
| 132 | |||
| 133 | func newRegexNodeStr(t nodeType, opt RegexOptions, str []rune) *regexNode { | ||
| 134 | return ®exNode{ | ||
| 135 | t: t, | ||
| 136 | options: opt, | ||
| 137 | str: str, | ||
| 138 | } | ||
| 139 | } | ||
| 140 | |||
| 141 | func newRegexNodeSet(t nodeType, opt RegexOptions, set *CharSet) *regexNode { | ||
| 142 | return ®exNode{ | ||
| 143 | t: t, | ||
| 144 | options: opt, | ||
| 145 | set: set, | ||
| 146 | } | ||
| 147 | } | ||
| 148 | |||
| 149 | func newRegexNodeM(t nodeType, opt RegexOptions, m int) *regexNode { | ||
| 150 | return ®exNode{ | ||
| 151 | t: t, | ||
| 152 | options: opt, | ||
| 153 | m: m, | ||
| 154 | } | ||
| 155 | } | ||
| 156 | func newRegexNodeMN(t nodeType, opt RegexOptions, m, n int) *regexNode { | ||
| 157 | return ®exNode{ | ||
| 158 | t: t, | ||
| 159 | options: opt, | ||
| 160 | m: m, | ||
| 161 | n: n, | ||
| 162 | } | ||
| 163 | } | ||
| 164 | |||
| 165 | func (n *regexNode) writeStrToBuf(buf *bytes.Buffer) { | ||
| 166 | for i := 0; i < len(n.str); i++ { | ||
| 167 | buf.WriteRune(n.str[i]) | ||
| 168 | } | ||
| 169 | } | ||
| 170 | |||
| 171 | func (n *regexNode) addChild(child *regexNode) { | ||
| 172 | reduced := child.reduce() | ||
| 173 | n.children = append(n.children, reduced) | ||
| 174 | reduced.next = n | ||
| 175 | } | ||
| 176 | |||
| 177 | func (n *regexNode) insertChildren(afterIndex int, nodes []*regexNode) { | ||
| 178 | newChildren := make([]*regexNode, 0, len(n.children)+len(nodes)) | ||
| 179 | n.children = append(append(append(newChildren, n.children[:afterIndex]...), nodes...), n.children[afterIndex:]...) | ||
| 180 | } | ||
| 181 | |||
| 182 | // removes children including the start but not the end index | ||
| 183 | func (n *regexNode) removeChildren(startIndex, endIndex int) { | ||
| 184 | n.children = append(n.children[:startIndex], n.children[endIndex:]...) | ||
| 185 | } | ||
| 186 | |||
| 187 | // Pass type as OneLazy or OneLoop | ||
| 188 | func (n *regexNode) makeRep(t nodeType, min, max int) { | ||
| 189 | n.t += (t - ntOne) | ||
| 190 | n.m = min | ||
| 191 | n.n = max | ||
| 192 | } | ||
| 193 | |||
| 194 | func (n *regexNode) reduce() *regexNode { | ||
| 195 | switch n.t { | ||
| 196 | case ntAlternate: | ||
| 197 | return n.reduceAlternation() | ||
| 198 | |||
| 199 | case ntConcatenate: | ||
| 200 | return n.reduceConcatenation() | ||
| 201 | |||
| 202 | case ntLoop, ntLazyloop: | ||
| 203 | return n.reduceRep() | ||
| 204 | |||
| 205 | case ntGroup: | ||
| 206 | return n.reduceGroup() | ||
| 207 | |||
| 208 | case ntSet, ntSetloop: | ||
| 209 | return n.reduceSet() | ||
| 210 | |||
| 211 | default: | ||
| 212 | return n | ||
| 213 | } | ||
| 214 | } | ||
| 215 | |||
| 216 | // Basic optimization. Single-letter alternations can be replaced | ||
| 217 | // by faster set specifications, and nested alternations with no | ||
| 218 | // intervening operators can be flattened: | ||
| 219 | // | ||
| 220 | // a|b|c|def|g|h -> [a-c]|def|[gh] | ||
| 221 | // apple|(?:orange|pear)|grape -> apple|orange|pear|grape | ||
| 222 | func (n *regexNode) reduceAlternation() *regexNode { | ||
| 223 | if len(n.children) == 0 { | ||
| 224 | return newRegexNode(ntNothing, n.options) | ||
| 225 | } | ||
| 226 | |||
| 227 | wasLastSet := false | ||
| 228 | lastNodeCannotMerge := false | ||
| 229 | var optionsLast RegexOptions | ||
| 230 | var i, j int | ||
| 231 | |||
| 232 | for i, j = 0, 0; i < len(n.children); i, j = i+1, j+1 { | ||
| 233 | at := n.children[i] | ||
| 234 | |||
| 235 | if j < i { | ||
| 236 | n.children[j] = at | ||
| 237 | } | ||
| 238 | |||
| 239 | for { | ||
| 240 | if at.t == ntAlternate { | ||
| 241 | for k := 0; k < len(at.children); k++ { | ||
| 242 | at.children[k].next = n | ||
| 243 | } | ||
| 244 | n.insertChildren(i+1, at.children) | ||
| 245 | |||
| 246 | j-- | ||
| 247 | } else if at.t == ntSet || at.t == ntOne { | ||
| 248 | // Cannot merge sets if L or I options differ, or if either are negated. | ||
| 249 | optionsAt := at.options & (RightToLeft | IgnoreCase) | ||
| 250 | |||
| 251 | if at.t == ntSet { | ||
| 252 | if !wasLastSet || optionsLast != optionsAt || lastNodeCannotMerge || !at.set.IsMergeable() { | ||
| 253 | wasLastSet = true | ||
| 254 | lastNodeCannotMerge = !at.set.IsMergeable() | ||
| 255 | optionsLast = optionsAt | ||
| 256 | break | ||
| 257 | } | ||
| 258 | } else if !wasLastSet || optionsLast != optionsAt || lastNodeCannotMerge { | ||
| 259 | wasLastSet = true | ||
| 260 | lastNodeCannotMerge = false | ||
| 261 | optionsLast = optionsAt | ||
| 262 | break | ||
| 263 | } | ||
| 264 | |||
| 265 | // The last node was a Set or a One, we're a Set or One and our options are the same. | ||
| 266 | // Merge the two nodes. | ||
| 267 | j-- | ||
| 268 | prev := n.children[j] | ||
| 269 | |||
| 270 | var prevCharClass *CharSet | ||
| 271 | if prev.t == ntOne { | ||
| 272 | prevCharClass = &CharSet{} | ||
| 273 | prevCharClass.addChar(prev.ch) | ||
| 274 | } else { | ||
| 275 | prevCharClass = prev.set | ||
| 276 | } | ||
| 277 | |||
| 278 | if at.t == ntOne { | ||
| 279 | prevCharClass.addChar(at.ch) | ||
| 280 | } else { | ||
| 281 | prevCharClass.addSet(*at.set) | ||
| 282 | } | ||
| 283 | |||
| 284 | prev.t = ntSet | ||
| 285 | prev.set = prevCharClass | ||
| 286 | } else if at.t == ntNothing { | ||
| 287 | j-- | ||
| 288 | } else { | ||
| 289 | wasLastSet = false | ||
| 290 | lastNodeCannotMerge = false | ||
| 291 | } | ||
| 292 | break | ||
| 293 | } | ||
| 294 | } | ||
| 295 | |||
| 296 | if j < i { | ||
| 297 | n.removeChildren(j, i) | ||
| 298 | } | ||
| 299 | |||
| 300 | return n.stripEnation(ntNothing) | ||
| 301 | } | ||
| 302 | |||
| 303 | // Basic optimization. Adjacent strings can be concatenated. | ||
| 304 | // | ||
| 305 | // (?:abc)(?:def) -> abcdef | ||
| 306 | func (n *regexNode) reduceConcatenation() *regexNode { | ||
| 307 | // Eliminate empties and concat adjacent strings/chars | ||
| 308 | |||
| 309 | var optionsLast RegexOptions | ||
| 310 | var optionsAt RegexOptions | ||
| 311 | var i, j int | ||
| 312 | |||
| 313 | if len(n.children) == 0 { | ||
| 314 | return newRegexNode(ntEmpty, n.options) | ||
| 315 | } | ||
| 316 | |||
| 317 | wasLastString := false | ||
| 318 | |||
| 319 | for i, j = 0, 0; i < len(n.children); i, j = i+1, j+1 { | ||
| 320 | var at, prev *regexNode | ||
| 321 | |||
| 322 | at = n.children[i] | ||
| 323 | |||
| 324 | if j < i { | ||
| 325 | n.children[j] = at | ||
| 326 | } | ||
| 327 | |||
| 328 | if at.t == ntConcatenate && | ||
| 329 | ((at.options & RightToLeft) == (n.options & RightToLeft)) { | ||
| 330 | for k := 0; k < len(at.children); k++ { | ||
| 331 | at.children[k].next = n | ||
| 332 | } | ||
| 333 | |||
| 334 | //insert at.children at i+1 index in n.children | ||
| 335 | n.insertChildren(i+1, at.children) | ||
| 336 | |||
| 337 | j-- | ||
| 338 | } else if at.t == ntMulti || at.t == ntOne { | ||
| 339 | // Cannot merge strings if L or I options differ | ||
| 340 | optionsAt = at.options & (RightToLeft | IgnoreCase) | ||
| 341 | |||
| 342 | if !wasLastString || optionsLast != optionsAt { | ||
| 343 | wasLastString = true | ||
| 344 | optionsLast = optionsAt | ||
| 345 | continue | ||
| 346 | } | ||
| 347 | |||
| 348 | j-- | ||
| 349 | prev = n.children[j] | ||
| 350 | |||
| 351 | if prev.t == ntOne { | ||
| 352 | prev.t = ntMulti | ||
| 353 | prev.str = []rune{prev.ch} | ||
| 354 | } | ||
| 355 | |||
| 356 | if (optionsAt & RightToLeft) == 0 { | ||
| 357 | if at.t == ntOne { | ||
| 358 | prev.str = append(prev.str, at.ch) | ||
| 359 | } else { | ||
| 360 | prev.str = append(prev.str, at.str...) | ||
| 361 | } | ||
| 362 | } else { | ||
| 363 | if at.t == ntOne { | ||
| 364 | // insert at the front by expanding our slice, copying the data over, and then setting the value | ||
| 365 | prev.str = append(prev.str, 0) | ||
| 366 | copy(prev.str[1:], prev.str) | ||
| 367 | prev.str[0] = at.ch | ||
| 368 | } else { | ||
| 369 | //insert at the front...this one we'll make a new slice and copy both into it | ||
| 370 | merge := make([]rune, len(prev.str)+len(at.str)) | ||
| 371 | copy(merge, at.str) | ||
| 372 | copy(merge[len(at.str):], prev.str) | ||
| 373 | prev.str = merge | ||
| 374 | } | ||
| 375 | } | ||
| 376 | } else if at.t == ntEmpty { | ||
| 377 | j-- | ||
| 378 | } else { | ||
| 379 | wasLastString = false | ||
| 380 | } | ||
| 381 | } | ||
| 382 | |||
| 383 | if j < i { | ||
| 384 | // remove indices j through i from the children | ||
| 385 | n.removeChildren(j, i) | ||
| 386 | } | ||
| 387 | |||
| 388 | return n.stripEnation(ntEmpty) | ||
| 389 | } | ||
| 390 | |||
| 391 | // Nested repeaters just get multiplied with each other if they're not | ||
| 392 | // too lumpy | ||
| 393 | func (n *regexNode) reduceRep() *regexNode { | ||
| 394 | |||
| 395 | u := n | ||
| 396 | t := n.t | ||
| 397 | min := n.m | ||
| 398 | max := n.n | ||
| 399 | |||
| 400 | for { | ||
| 401 | if len(u.children) == 0 { | ||
| 402 | break | ||
| 403 | } | ||
| 404 | |||
| 405 | child := u.children[0] | ||
| 406 | |||
| 407 | // multiply reps of the same type only | ||
| 408 | if child.t != t { | ||
| 409 | childType := child.t | ||
| 410 | |||
| 411 | if !(childType >= ntOneloop && childType <= ntSetloop && t == ntLoop || | ||
| 412 | childType >= ntOnelazy && childType <= ntSetlazy && t == ntLazyloop) { | ||
| 413 | break | ||
| 414 | } | ||
| 415 | } | ||
| 416 | |||
| 417 | // child can be too lumpy to blur, e.g., (a {100,105}) {3} or (a {2,})? | ||
| 418 | // [but things like (a {2,})+ are not too lumpy...] | ||
| 419 | if u.m == 0 && child.m > 1 || child.n < child.m*2 { | ||
| 420 | break | ||
| 421 | } | ||
| 422 | |||
| 423 | u = child | ||
| 424 | if u.m > 0 { | ||
| 425 | if (math.MaxInt32-1)/u.m < min { | ||
| 426 | u.m = math.MaxInt32 | ||
| 427 | } else { | ||
| 428 | u.m = u.m * min | ||
| 429 | } | ||
| 430 | } | ||
| 431 | if u.n > 0 { | ||
| 432 | if (math.MaxInt32-1)/u.n < max { | ||
| 433 | u.n = math.MaxInt32 | ||
| 434 | } else { | ||
| 435 | u.n = u.n * max | ||
| 436 | } | ||
| 437 | } | ||
| 438 | } | ||
| 439 | |||
| 440 | if math.MaxInt32 == min { | ||
| 441 | return newRegexNode(ntNothing, n.options) | ||
| 442 | } | ||
| 443 | return u | ||
| 444 | |||
| 445 | } | ||
| 446 | |||
| 447 | // Simple optimization. If a concatenation or alternation has only | ||
| 448 | // one child strip out the intermediate node. If it has zero children, | ||
| 449 | // turn it into an empty. | ||
| 450 | func (n *regexNode) stripEnation(emptyType nodeType) *regexNode { | ||
| 451 | switch len(n.children) { | ||
| 452 | case 0: | ||
| 453 | return newRegexNode(emptyType, n.options) | ||
| 454 | case 1: | ||
| 455 | return n.children[0] | ||
| 456 | default: | ||
| 457 | return n | ||
| 458 | } | ||
| 459 | } | ||
| 460 | |||
| 461 | func (n *regexNode) reduceGroup() *regexNode { | ||
| 462 | u := n | ||
| 463 | |||
| 464 | for u.t == ntGroup { | ||
| 465 | u = u.children[0] | ||
| 466 | } | ||
| 467 | |||
| 468 | return u | ||
| 469 | } | ||
| 470 | |||
| 471 | // Simple optimization. If a set is a singleton, an inverse singleton, | ||
| 472 | // or empty, it's transformed accordingly. | ||
| 473 | func (n *regexNode) reduceSet() *regexNode { | ||
| 474 | // Extract empty-set, one and not-one case as special | ||
| 475 | |||
| 476 | if n.set == nil { | ||
| 477 | n.t = ntNothing | ||
| 478 | } else if n.set.IsSingleton() { | ||
| 479 | n.ch = n.set.SingletonChar() | ||
| 480 | n.set = nil | ||
| 481 | n.t += (ntOne - ntSet) | ||
| 482 | } else if n.set.IsSingletonInverse() { | ||
| 483 | n.ch = n.set.SingletonChar() | ||
| 484 | n.set = nil | ||
| 485 | n.t += (ntNotone - ntSet) | ||
| 486 | } | ||
| 487 | |||
| 488 | return n | ||
| 489 | } | ||
| 490 | |||
| 491 | func (n *regexNode) reverseLeft() *regexNode { | ||
| 492 | if n.options&RightToLeft != 0 && n.t == ntConcatenate && len(n.children) > 0 { | ||
| 493 | //reverse children order | ||
| 494 | for left, right := 0, len(n.children)-1; left < right; left, right = left+1, right-1 { | ||
| 495 | n.children[left], n.children[right] = n.children[right], n.children[left] | ||
| 496 | } | ||
| 497 | } | ||
| 498 | |||
| 499 | return n | ||
| 500 | } | ||
| 501 | |||
| 502 | func (n *regexNode) makeQuantifier(lazy bool, min, max int) *regexNode { | ||
| 503 | if min == 0 && max == 0 { | ||
| 504 | return newRegexNode(ntEmpty, n.options) | ||
| 505 | } | ||
| 506 | |||
| 507 | if min == 1 && max == 1 { | ||
| 508 | return n | ||
| 509 | } | ||
| 510 | |||
| 511 | switch n.t { | ||
| 512 | case ntOne, ntNotone, ntSet: | ||
| 513 | if lazy { | ||
| 514 | n.makeRep(Onelazy, min, max) | ||
| 515 | } else { | ||
| 516 | n.makeRep(Oneloop, min, max) | ||
| 517 | } | ||
| 518 | return n | ||
| 519 | |||
| 520 | default: | ||
| 521 | var t nodeType | ||
| 522 | if lazy { | ||
| 523 | t = ntLazyloop | ||
| 524 | } else { | ||
| 525 | t = ntLoop | ||
| 526 | } | ||
| 527 | result := newRegexNodeMN(t, n.options, min, max) | ||
| 528 | result.addChild(n) | ||
| 529 | return result | ||
| 530 | } | ||
| 531 | } | ||
| 532 | |||
| 533 | // debug functions | ||
| 534 | |||
| 535 | var typeStr = []string{ | ||
| 536 | "Onerep", "Notonerep", "Setrep", | ||
| 537 | "Oneloop", "Notoneloop", "Setloop", | ||
| 538 | "Onelazy", "Notonelazy", "Setlazy", | ||
| 539 | "One", "Notone", "Set", | ||
| 540 | "Multi", "Ref", | ||
| 541 | "Bol", "Eol", "Boundary", "Nonboundary", | ||
| 542 | "Beginning", "Start", "EndZ", "End", | ||
| 543 | "Nothing", "Empty", | ||
| 544 | "Alternate", "Concatenate", | ||
| 545 | "Loop", "Lazyloop", | ||
| 546 | "Capture", "Group", "Require", "Prevent", "Greedy", | ||
| 547 | "Testref", "Testgroup", | ||
| 548 | "Unknown", "Unknown", "Unknown", | ||
| 549 | "Unknown", "Unknown", "Unknown", | ||
| 550 | "ECMABoundary", "NonECMABoundary", | ||
| 551 | } | ||
| 552 | |||
| 553 | func (n *regexNode) description() string { | ||
| 554 | buf := &bytes.Buffer{} | ||
| 555 | |||
| 556 | buf.WriteString(typeStr[n.t]) | ||
| 557 | |||
| 558 | if (n.options & ExplicitCapture) != 0 { | ||
| 559 | buf.WriteString("-C") | ||
| 560 | } | ||
| 561 | if (n.options & IgnoreCase) != 0 { | ||
| 562 | buf.WriteString("-I") | ||
| 563 | } | ||
| 564 | if (n.options & RightToLeft) != 0 { | ||
| 565 | buf.WriteString("-L") | ||
| 566 | } | ||
| 567 | if (n.options & Multiline) != 0 { | ||
| 568 | buf.WriteString("-M") | ||
| 569 | } | ||
| 570 | if (n.options & Singleline) != 0 { | ||
| 571 | buf.WriteString("-S") | ||
| 572 | } | ||
| 573 | if (n.options & IgnorePatternWhitespace) != 0 { | ||
| 574 | buf.WriteString("-X") | ||
| 575 | } | ||
| 576 | if (n.options & ECMAScript) != 0 { | ||
| 577 | buf.WriteString("-E") | ||
| 578 | } | ||
| 579 | |||
| 580 | switch n.t { | ||
| 581 | case ntOneloop, ntNotoneloop, ntOnelazy, ntNotonelazy, ntOne, ntNotone: | ||
| 582 | buf.WriteString("(Ch = " + CharDescription(n.ch) + ")") | ||
| 583 | break | ||
| 584 | case ntCapture: | ||
| 585 | buf.WriteString("(index = " + strconv.Itoa(n.m) + ", unindex = " + strconv.Itoa(n.n) + ")") | ||
| 586 | break | ||
| 587 | case ntRef, ntTestref: | ||
| 588 | buf.WriteString("(index = " + strconv.Itoa(n.m) + ")") | ||
| 589 | break | ||
| 590 | case ntMulti: | ||
| 591 | fmt.Fprintf(buf, "(String = %s)", string(n.str)) | ||
| 592 | break | ||
| 593 | case ntSet, ntSetloop, ntSetlazy: | ||
| 594 | buf.WriteString("(Set = " + n.set.String() + ")") | ||
| 595 | break | ||
| 596 | } | ||
| 597 | |||
| 598 | switch n.t { | ||
| 599 | case ntOneloop, ntNotoneloop, ntOnelazy, ntNotonelazy, ntSetloop, ntSetlazy, ntLoop, ntLazyloop: | ||
| 600 | buf.WriteString("(Min = ") | ||
| 601 | buf.WriteString(strconv.Itoa(n.m)) | ||
| 602 | buf.WriteString(", Max = ") | ||
| 603 | if n.n == math.MaxInt32 { | ||
| 604 | buf.WriteString("inf") | ||
| 605 | } else { | ||
| 606 | buf.WriteString(strconv.Itoa(n.n)) | ||
| 607 | } | ||
| 608 | buf.WriteString(")") | ||
| 609 | |||
| 610 | break | ||
| 611 | } | ||
| 612 | |||
| 613 | return buf.String() | ||
| 614 | } | ||
| 615 | |||
| 616 | var padSpace = []byte(" ") | ||
| 617 | |||
| 618 | func (t *RegexTree) Dump() string { | ||
| 619 | return t.root.dump() | ||
| 620 | } | ||
| 621 | |||
| 622 | func (n *regexNode) dump() string { | ||
| 623 | var stack []int | ||
| 624 | CurNode := n | ||
| 625 | CurChild := 0 | ||
| 626 | |||
| 627 | buf := bytes.NewBufferString(CurNode.description()) | ||
| 628 | buf.WriteRune('\n') | ||
| 629 | |||
| 630 | for { | ||
| 631 | if CurNode.children != nil && CurChild < len(CurNode.children) { | ||
| 632 | stack = append(stack, CurChild+1) | ||
| 633 | CurNode = CurNode.children[CurChild] | ||
| 634 | CurChild = 0 | ||
| 635 | |||
| 636 | Depth := len(stack) | ||
| 637 | if Depth > 32 { | ||
| 638 | Depth = 32 | ||
| 639 | } | ||
| 640 | buf.Write(padSpace[:Depth]) | ||
| 641 | buf.WriteString(CurNode.description()) | ||
| 642 | buf.WriteRune('\n') | ||
| 643 | } else { | ||
| 644 | if len(stack) == 0 { | ||
| 645 | break | ||
| 646 | } | ||
| 647 | |||
| 648 | CurChild = stack[len(stack)-1] | ||
| 649 | stack = stack[:len(stack)-1] | ||
| 650 | CurNode = CurNode.next | ||
| 651 | } | ||
| 652 | } | ||
| 653 | return buf.String() | ||
| 654 | } | ||
diff --git a/vendor/github.com/dlclark/regexp2/syntax/writer.go b/vendor/github.com/dlclark/regexp2/syntax/writer.go new file mode 100644 index 0000000..a5aa11c --- /dev/null +++ b/vendor/github.com/dlclark/regexp2/syntax/writer.go | |||
| @@ -0,0 +1,500 @@ | |||
| 1 | package syntax | ||
| 2 | |||
| 3 | import ( | ||
| 4 | "bytes" | ||
| 5 | "fmt" | ||
| 6 | "math" | ||
| 7 | "os" | ||
| 8 | ) | ||
| 9 | |||
| 10 | func Write(tree *RegexTree) (*Code, error) { | ||
| 11 | w := writer{ | ||
| 12 | intStack: make([]int, 0, 32), | ||
| 13 | emitted: make([]int, 2), | ||
| 14 | stringhash: make(map[string]int), | ||
| 15 | sethash: make(map[string]int), | ||
| 16 | } | ||
| 17 | |||
| 18 | code, err := w.codeFromTree(tree) | ||
| 19 | |||
| 20 | if tree.options&Debug > 0 && code != nil { | ||
| 21 | os.Stdout.WriteString(code.Dump()) | ||
| 22 | os.Stdout.WriteString("\n") | ||
| 23 | } | ||
| 24 | |||
| 25 | return code, err | ||
| 26 | } | ||
| 27 | |||
| 28 | type writer struct { | ||
| 29 | emitted []int | ||
| 30 | |||
| 31 | intStack []int | ||
| 32 | curpos int | ||
| 33 | stringhash map[string]int | ||
| 34 | stringtable [][]rune | ||
| 35 | sethash map[string]int | ||
| 36 | settable []*CharSet | ||
| 37 | counting bool | ||
| 38 | count int | ||
| 39 | trackcount int | ||
| 40 | caps map[int]int | ||
| 41 | } | ||
| 42 | |||
| 43 | const ( | ||
| 44 | beforeChild nodeType = 64 | ||
| 45 | afterChild = 128 | ||
| 46 | //MaxPrefixSize is the largest number of runes we'll use for a BoyerMoyer prefix | ||
| 47 | MaxPrefixSize = 50 | ||
| 48 | ) | ||
| 49 | |||
| 50 | // The top level RegexCode generator. It does a depth-first walk | ||
| 51 | // through the tree and calls EmitFragment to emits code before | ||
| 52 | // and after each child of an interior node, and at each leaf. | ||
| 53 | // | ||
| 54 | // It runs two passes, first to count the size of the generated | ||
| 55 | // code, and second to generate the code. | ||
| 56 | // | ||
| 57 | // We should time it against the alternative, which is | ||
| 58 | // to just generate the code and grow the array as we go. | ||
| 59 | func (w *writer) codeFromTree(tree *RegexTree) (*Code, error) { | ||
| 60 | var ( | ||
| 61 | curNode *regexNode | ||
| 62 | curChild int | ||
| 63 | capsize int | ||
| 64 | ) | ||
| 65 | // construct sparse capnum mapping if some numbers are unused | ||
| 66 | |||
| 67 | if tree.capnumlist == nil || tree.captop == len(tree.capnumlist) { | ||
| 68 | capsize = tree.captop | ||
| 69 | w.caps = nil | ||
| 70 | } else { | ||
| 71 | capsize = len(tree.capnumlist) | ||
| 72 | w.caps = tree.caps | ||
| 73 | for i := 0; i < len(tree.capnumlist); i++ { | ||
| 74 | w.caps[tree.capnumlist[i]] = i | ||
| 75 | } | ||
| 76 | } | ||
| 77 | |||
| 78 | w.counting = true | ||
| 79 | |||
| 80 | for { | ||
| 81 | if !w.counting { | ||
| 82 | w.emitted = make([]int, w.count) | ||
| 83 | } | ||
| 84 | |||
| 85 | curNode = tree.root | ||
| 86 | curChild = 0 | ||
| 87 | |||
| 88 | w.emit1(Lazybranch, 0) | ||
| 89 | |||
| 90 | for { | ||
| 91 | if len(curNode.children) == 0 { | ||
| 92 | w.emitFragment(curNode.t, curNode, 0) | ||
| 93 | } else if curChild < len(curNode.children) { | ||
| 94 | w.emitFragment(curNode.t|beforeChild, curNode, curChild) | ||
| 95 | |||
| 96 | curNode = curNode.children[curChild] | ||
| 97 | |||
| 98 | w.pushInt(curChild) | ||
| 99 | curChild = 0 | ||
| 100 | continue | ||
| 101 | } | ||
| 102 | |||
| 103 | if w.emptyStack() { | ||
| 104 | break | ||
| 105 | } | ||
| 106 | |||
| 107 | curChild = w.popInt() | ||
| 108 | curNode = curNode.next | ||
| 109 | |||
| 110 | w.emitFragment(curNode.t|afterChild, curNode, curChild) | ||
| 111 | curChild++ | ||
| 112 | } | ||
| 113 | |||
| 114 | w.patchJump(0, w.curPos()) | ||
| 115 | w.emit(Stop) | ||
| 116 | |||
| 117 | if !w.counting { | ||
| 118 | break | ||
| 119 | } | ||
| 120 | |||
| 121 | w.counting = false | ||
| 122 | } | ||
| 123 | |||
| 124 | fcPrefix := getFirstCharsPrefix(tree) | ||
| 125 | prefix := getPrefix(tree) | ||
| 126 | rtl := (tree.options & RightToLeft) != 0 | ||
| 127 | |||
| 128 | var bmPrefix *BmPrefix | ||
| 129 | //TODO: benchmark string prefixes | ||
| 130 | if prefix != nil && len(prefix.PrefixStr) > 0 && MaxPrefixSize > 0 { | ||
| 131 | if len(prefix.PrefixStr) > MaxPrefixSize { | ||
| 132 | // limit prefix changes to 10k | ||
| 133 | prefix.PrefixStr = prefix.PrefixStr[:MaxPrefixSize] | ||
| 134 | } | ||
| 135 | bmPrefix = newBmPrefix(prefix.PrefixStr, prefix.CaseInsensitive, rtl) | ||
| 136 | } else { | ||
| 137 | bmPrefix = nil | ||
| 138 | } | ||
| 139 | |||
| 140 | return &Code{ | ||
| 141 | Codes: w.emitted, | ||
| 142 | Strings: w.stringtable, | ||
| 143 | Sets: w.settable, | ||
| 144 | TrackCount: w.trackcount, | ||
| 145 | Caps: w.caps, | ||
| 146 | Capsize: capsize, | ||
| 147 | FcPrefix: fcPrefix, | ||
| 148 | BmPrefix: bmPrefix, | ||
| 149 | Anchors: getAnchors(tree), | ||
| 150 | RightToLeft: rtl, | ||
| 151 | }, nil | ||
| 152 | } | ||
| 153 | |||
| 154 | // The main RegexCode generator. It does a depth-first walk | ||
| 155 | // through the tree and calls EmitFragment to emits code before | ||
| 156 | // and after each child of an interior node, and at each leaf. | ||
| 157 | func (w *writer) emitFragment(nodetype nodeType, node *regexNode, curIndex int) error { | ||
| 158 | bits := InstOp(0) | ||
| 159 | |||
| 160 | if nodetype <= ntRef { | ||
| 161 | if (node.options & RightToLeft) != 0 { | ||
| 162 | bits |= Rtl | ||
| 163 | } | ||
| 164 | if (node.options & IgnoreCase) != 0 { | ||
| 165 | bits |= Ci | ||
| 166 | } | ||
| 167 | } | ||
| 168 | ntBits := nodeType(bits) | ||
| 169 | |||
| 170 | switch nodetype { | ||
| 171 | case ntConcatenate | beforeChild, ntConcatenate | afterChild, ntEmpty: | ||
| 172 | break | ||
| 173 | |||
| 174 | case ntAlternate | beforeChild: | ||
| 175 | if curIndex < len(node.children)-1 { | ||
| 176 | w.pushInt(w.curPos()) | ||
| 177 | w.emit1(Lazybranch, 0) | ||
| 178 | } | ||
| 179 | |||
| 180 | case ntAlternate | afterChild: | ||
| 181 | if curIndex < len(node.children)-1 { | ||
| 182 | lbPos := w.popInt() | ||
| 183 | w.pushInt(w.curPos()) | ||
| 184 | w.emit1(Goto, 0) | ||
| 185 | w.patchJump(lbPos, w.curPos()) | ||
| 186 | } else { | ||
| 187 | for i := 0; i < curIndex; i++ { | ||
| 188 | w.patchJump(w.popInt(), w.curPos()) | ||
| 189 | } | ||
| 190 | } | ||
| 191 | break | ||
| 192 | |||
| 193 | case ntTestref | beforeChild: | ||
| 194 | if curIndex == 0 { | ||
| 195 | w.emit(Setjump) | ||
| 196 | w.pushInt(w.curPos()) | ||
| 197 | w.emit1(Lazybranch, 0) | ||
| 198 | w.emit1(Testref, w.mapCapnum(node.m)) | ||
| 199 | w.emit(Forejump) | ||
| 200 | } | ||
| 201 | |||
| 202 | case ntTestref | afterChild: | ||
| 203 | if curIndex == 0 { | ||
| 204 | branchpos := w.popInt() | ||
| 205 | w.pushInt(w.curPos()) | ||
| 206 | w.emit1(Goto, 0) | ||
| 207 | w.patchJump(branchpos, w.curPos()) | ||
| 208 | w.emit(Forejump) | ||
| 209 | if len(node.children) <= 1 { | ||
| 210 | w.patchJump(w.popInt(), w.curPos()) | ||
| 211 | } | ||
| 212 | } else if curIndex == 1 { | ||
| 213 | w.patchJump(w.popInt(), w.curPos()) | ||
| 214 | } | ||
| 215 | |||
| 216 | case ntTestgroup | beforeChild: | ||
| 217 | if curIndex == 0 { | ||
| 218 | w.emit(Setjump) | ||
| 219 | w.emit(Setmark) | ||
| 220 | w.pushInt(w.curPos()) | ||
| 221 | w.emit1(Lazybranch, 0) | ||
| 222 | } | ||
| 223 | |||
| 224 | case ntTestgroup | afterChild: | ||
| 225 | if curIndex == 0 { | ||
| 226 | w.emit(Getmark) | ||
| 227 | w.emit(Forejump) | ||
| 228 | } else if curIndex == 1 { | ||
| 229 | Branchpos := w.popInt() | ||
| 230 | w.pushInt(w.curPos()) | ||
| 231 | w.emit1(Goto, 0) | ||
| 232 | w.patchJump(Branchpos, w.curPos()) | ||
| 233 | w.emit(Getmark) | ||
| 234 | w.emit(Forejump) | ||
| 235 | if len(node.children) <= 2 { | ||
| 236 | w.patchJump(w.popInt(), w.curPos()) | ||
| 237 | } | ||
| 238 | } else if curIndex == 2 { | ||
| 239 | w.patchJump(w.popInt(), w.curPos()) | ||
| 240 | } | ||
| 241 | |||
| 242 | case ntLoop | beforeChild, ntLazyloop | beforeChild: | ||
| 243 | |||
| 244 | if node.n < math.MaxInt32 || node.m > 1 { | ||
| 245 | if node.m == 0 { | ||
| 246 | w.emit1(Nullcount, 0) | ||
| 247 | } else { | ||
| 248 | w.emit1(Setcount, 1-node.m) | ||
| 249 | } | ||
| 250 | } else if node.m == 0 { | ||
| 251 | w.emit(Nullmark) | ||
| 252 | } else { | ||
| 253 | w.emit(Setmark) | ||
| 254 | } | ||
| 255 | |||
| 256 | if node.m == 0 { | ||
| 257 | w.pushInt(w.curPos()) | ||
| 258 | w.emit1(Goto, 0) | ||
| 259 | } | ||
| 260 | w.pushInt(w.curPos()) | ||
| 261 | |||
| 262 | case ntLoop | afterChild, ntLazyloop | afterChild: | ||
| 263 | |||
| 264 | startJumpPos := w.curPos() | ||
| 265 | lazy := (nodetype - (ntLoop | afterChild)) | ||
| 266 | |||
| 267 | if node.n < math.MaxInt32 || node.m > 1 { | ||
| 268 | if node.n == math.MaxInt32 { | ||
| 269 | w.emit2(InstOp(Branchcount+lazy), w.popInt(), math.MaxInt32) | ||
| 270 | } else { | ||
| 271 | w.emit2(InstOp(Branchcount+lazy), w.popInt(), node.n-node.m) | ||
| 272 | } | ||
| 273 | } else { | ||
| 274 | w.emit1(InstOp(Branchmark+lazy), w.popInt()) | ||
| 275 | } | ||
| 276 | |||
| 277 | if node.m == 0 { | ||
| 278 | w.patchJump(w.popInt(), startJumpPos) | ||
| 279 | } | ||
| 280 | |||
| 281 | case ntGroup | beforeChild, ntGroup | afterChild: | ||
| 282 | |||
| 283 | case ntCapture | beforeChild: | ||
| 284 | w.emit(Setmark) | ||
| 285 | |||
| 286 | case ntCapture | afterChild: | ||
| 287 | w.emit2(Capturemark, w.mapCapnum(node.m), w.mapCapnum(node.n)) | ||
| 288 | |||
| 289 | case ntRequire | beforeChild: | ||
| 290 | // NOTE: the following line causes lookahead/lookbehind to be | ||
| 291 | // NON-BACKTRACKING. It can be commented out with (*) | ||
| 292 | w.emit(Setjump) | ||
| 293 | |||
| 294 | w.emit(Setmark) | ||
| 295 | |||
| 296 | case ntRequire | afterChild: | ||
| 297 | w.emit(Getmark) | ||
| 298 | |||
| 299 | // NOTE: the following line causes lookahead/lookbehind to be | ||
| 300 | // NON-BACKTRACKING. It can be commented out with (*) | ||
| 301 | w.emit(Forejump) | ||
| 302 | |||
| 303 | case ntPrevent | beforeChild: | ||
| 304 | w.emit(Setjump) | ||
| 305 | w.pushInt(w.curPos()) | ||
| 306 | w.emit1(Lazybranch, 0) | ||
| 307 | |||
| 308 | case ntPrevent | afterChild: | ||
| 309 | w.emit(Backjump) | ||
| 310 | w.patchJump(w.popInt(), w.curPos()) | ||
| 311 | w.emit(Forejump) | ||
| 312 | |||
| 313 | case ntGreedy | beforeChild: | ||
| 314 | w.emit(Setjump) | ||
| 315 | |||
| 316 | case ntGreedy | afterChild: | ||
| 317 | w.emit(Forejump) | ||
| 318 | |||
| 319 | case ntOne, ntNotone: | ||
| 320 | w.emit1(InstOp(node.t|ntBits), int(node.ch)) | ||
| 321 | |||
| 322 | case ntNotoneloop, ntNotonelazy, ntOneloop, ntOnelazy: | ||
| 323 | if node.m > 0 { | ||
| 324 | if node.t == ntOneloop || node.t == ntOnelazy { | ||
| 325 | w.emit2(Onerep|bits, int(node.ch), node.m) | ||
| 326 | } else { | ||
| 327 | w.emit2(Notonerep|bits, int(node.ch), node.m) | ||
| 328 | } | ||
| 329 | } | ||
| 330 | if node.n > node.m { | ||
| 331 | if node.n == math.MaxInt32 { | ||
| 332 | w.emit2(InstOp(node.t|ntBits), int(node.ch), math.MaxInt32) | ||
| 333 | } else { | ||
| 334 | w.emit2(InstOp(node.t|ntBits), int(node.ch), node.n-node.m) | ||
| 335 | } | ||
| 336 | } | ||
| 337 | |||
| 338 | case ntSetloop, ntSetlazy: | ||
| 339 | if node.m > 0 { | ||
| 340 | w.emit2(Setrep|bits, w.setCode(node.set), node.m) | ||
| 341 | } | ||
| 342 | if node.n > node.m { | ||
| 343 | if node.n == math.MaxInt32 { | ||
| 344 | w.emit2(InstOp(node.t|ntBits), w.setCode(node.set), math.MaxInt32) | ||
| 345 | } else { | ||
| 346 | w.emit2(InstOp(node.t|ntBits), w.setCode(node.set), node.n-node.m) | ||
| 347 | } | ||
| 348 | } | ||
| 349 | |||
| 350 | case ntMulti: | ||
| 351 | w.emit1(InstOp(node.t|ntBits), w.stringCode(node.str)) | ||
| 352 | |||
| 353 | case ntSet: | ||
| 354 | w.emit1(InstOp(node.t|ntBits), w.setCode(node.set)) | ||
| 355 | |||
| 356 | case ntRef: | ||
| 357 | w.emit1(InstOp(node.t|ntBits), w.mapCapnum(node.m)) | ||
| 358 | |||
| 359 | case ntNothing, ntBol, ntEol, ntBoundary, ntNonboundary, ntECMABoundary, ntNonECMABoundary, ntBeginning, ntStart, ntEndZ, ntEnd: | ||
| 360 | w.emit(InstOp(node.t)) | ||
| 361 | |||
| 362 | default: | ||
| 363 | return fmt.Errorf("unexpected opcode in regular expression generation: %v", nodetype) | ||
| 364 | } | ||
| 365 | |||
| 366 | return nil | ||
| 367 | } | ||
| 368 | |||
| 369 | // To avoid recursion, we use a simple integer stack. | ||
| 370 | // This is the push. | ||
| 371 | func (w *writer) pushInt(i int) { | ||
| 372 | w.intStack = append(w.intStack, i) | ||
| 373 | } | ||
| 374 | |||
| 375 | // Returns true if the stack is empty. | ||
| 376 | func (w *writer) emptyStack() bool { | ||
| 377 | return len(w.intStack) == 0 | ||
| 378 | } | ||
| 379 | |||
| 380 | // This is the pop. | ||
| 381 | func (w *writer) popInt() int { | ||
| 382 | //get our item | ||
| 383 | idx := len(w.intStack) - 1 | ||
| 384 | i := w.intStack[idx] | ||
| 385 | //trim our slice | ||
| 386 | w.intStack = w.intStack[:idx] | ||
| 387 | return i | ||
| 388 | } | ||
| 389 | |||
| 390 | // Returns the current position in the emitted code. | ||
| 391 | func (w *writer) curPos() int { | ||
| 392 | return w.curpos | ||
| 393 | } | ||
| 394 | |||
| 395 | // Fixes up a jump instruction at the specified offset | ||
| 396 | // so that it jumps to the specified jumpDest. | ||
| 397 | func (w *writer) patchJump(offset, jumpDest int) { | ||
| 398 | w.emitted[offset+1] = jumpDest | ||
| 399 | } | ||
| 400 | |||
| 401 | // Returns an index in the set table for a charset | ||
| 402 | // uses a map to eliminate duplicates. | ||
| 403 | func (w *writer) setCode(set *CharSet) int { | ||
| 404 | if w.counting { | ||
| 405 | return 0 | ||
| 406 | } | ||
| 407 | |||
| 408 | buf := &bytes.Buffer{} | ||
| 409 | |||
| 410 | set.mapHashFill(buf) | ||
| 411 | hash := buf.String() | ||
| 412 | i, ok := w.sethash[hash] | ||
| 413 | if !ok { | ||
| 414 | i = len(w.sethash) | ||
| 415 | w.sethash[hash] = i | ||
| 416 | w.settable = append(w.settable, set) | ||
| 417 | } | ||
| 418 | return i | ||
| 419 | } | ||
| 420 | |||
| 421 | // Returns an index in the string table for a string. | ||
| 422 | // uses a map to eliminate duplicates. | ||
| 423 | func (w *writer) stringCode(str []rune) int { | ||
| 424 | if w.counting { | ||
| 425 | return 0 | ||
| 426 | } | ||
| 427 | |||
| 428 | hash := string(str) | ||
| 429 | i, ok := w.stringhash[hash] | ||
| 430 | if !ok { | ||
| 431 | i = len(w.stringhash) | ||
| 432 | w.stringhash[hash] = i | ||
| 433 | w.stringtable = append(w.stringtable, str) | ||
| 434 | } | ||
| 435 | |||
| 436 | return i | ||
| 437 | } | ||
| 438 | |||
| 439 | // When generating code on a regex that uses a sparse set | ||
| 440 | // of capture slots, we hash them to a dense set of indices | ||
| 441 | // for an array of capture slots. Instead of doing the hash | ||
| 442 | // at match time, it's done at compile time, here. | ||
| 443 | func (w *writer) mapCapnum(capnum int) int { | ||
| 444 | if capnum == -1 { | ||
| 445 | return -1 | ||
| 446 | } | ||
| 447 | |||
| 448 | if w.caps != nil { | ||
| 449 | return w.caps[capnum] | ||
| 450 | } | ||
| 451 | |||
| 452 | return capnum | ||
| 453 | } | ||
| 454 | |||
| 455 | // Emits a zero-argument operation. Note that the emit | ||
| 456 | // functions all run in two modes: they can emit code, or | ||
| 457 | // they can just count the size of the code. | ||
| 458 | func (w *writer) emit(op InstOp) { | ||
| 459 | if w.counting { | ||
| 460 | w.count++ | ||
| 461 | if opcodeBacktracks(op) { | ||
| 462 | w.trackcount++ | ||
| 463 | } | ||
| 464 | return | ||
| 465 | } | ||
| 466 | w.emitted[w.curpos] = int(op) | ||
| 467 | w.curpos++ | ||
| 468 | } | ||
| 469 | |||
| 470 | // Emits a one-argument operation. | ||
| 471 | func (w *writer) emit1(op InstOp, opd1 int) { | ||
| 472 | if w.counting { | ||
| 473 | w.count += 2 | ||
| 474 | if opcodeBacktracks(op) { | ||
| 475 | w.trackcount++ | ||
| 476 | } | ||
| 477 | return | ||
| 478 | } | ||
| 479 | w.emitted[w.curpos] = int(op) | ||
| 480 | w.curpos++ | ||
| 481 | w.emitted[w.curpos] = opd1 | ||
| 482 | w.curpos++ | ||
| 483 | } | ||
| 484 | |||
| 485 | // Emits a two-argument operation. | ||
| 486 | func (w *writer) emit2(op InstOp, opd1, opd2 int) { | ||
| 487 | if w.counting { | ||
| 488 | w.count += 3 | ||
| 489 | if opcodeBacktracks(op) { | ||
| 490 | w.trackcount++ | ||
| 491 | } | ||
| 492 | return | ||
| 493 | } | ||
| 494 | w.emitted[w.curpos] = int(op) | ||
| 495 | w.curpos++ | ||
| 496 | w.emitted[w.curpos] = opd1 | ||
| 497 | w.curpos++ | ||
| 498 | w.emitted[w.curpos] = opd2 | ||
| 499 | w.curpos++ | ||
| 500 | } | ||
