diff options
Diffstat (limited to 'vendor/github.com/dlclark/regexp2/syntax/tree.go')
| -rw-r--r-- | vendor/github.com/dlclark/regexp2/syntax/tree.go | 654 |
1 files changed, 654 insertions, 0 deletions
diff --git a/vendor/github.com/dlclark/regexp2/syntax/tree.go b/vendor/github.com/dlclark/regexp2/syntax/tree.go new file mode 100644 index 0000000..ea28829 --- /dev/null +++ b/vendor/github.com/dlclark/regexp2/syntax/tree.go @@ -0,0 +1,654 @@ +package syntax + +import ( + "bytes" + "fmt" + "math" + "strconv" +) + +type RegexTree struct { + root *regexNode + caps map[int]int + capnumlist []int + captop int + Capnames map[string]int + Caplist []string + options RegexOptions +} + +// It is built into a parsed tree for a regular expression. + +// Implementation notes: +// +// Since the node tree is a temporary data structure only used +// during compilation of the regexp to integer codes, it's +// designed for clarity and convenience rather than +// space efficiency. +// +// RegexNodes are built into a tree, linked by the n.children list. +// Each node also has a n.parent and n.ichild member indicating +// its parent and which child # it is in its parent's list. +// +// RegexNodes come in as many types as there are constructs in +// a regular expression, for example, "concatenate", "alternate", +// "one", "rept", "group". There are also node types for basic +// peephole optimizations, e.g., "onerep", "notsetrep", etc. +// +// Because perl 5 allows "lookback" groups that scan backwards, +// each node also gets a "direction". Normally the value of +// boolean n.backward = false. +// +// During parsing, top-level nodes are also stacked onto a parse +// stack (a stack of trees). For this purpose we have a n.next +// pointer. [Note that to save a few bytes, we could overload the +// n.parent pointer instead.] +// +// On the parse stack, each tree has a "role" - basically, the +// nonterminal in the grammar that the parser has currently +// assigned to the tree. That code is stored in n.role. +// +// Finally, some of the different kinds of nodes have data. +// Two integers (for the looping constructs) are stored in +// n.operands, an an object (either a string or a set) +// is stored in n.data +type regexNode struct { + t nodeType + children []*regexNode + str []rune + set *CharSet + ch rune + m int + n int + options RegexOptions + next *regexNode +} + +type nodeType int32 + +const ( + // The following are leaves, and correspond to primitive operations + + ntOnerep nodeType = 0 // lef,back char,min,max a {n} + ntNotonerep = 1 // lef,back char,min,max .{n} + ntSetrep = 2 // lef,back set,min,max [\d]{n} + ntOneloop = 3 // lef,back char,min,max a {,n} + ntNotoneloop = 4 // lef,back char,min,max .{,n} + ntSetloop = 5 // lef,back set,min,max [\d]{,n} + ntOnelazy = 6 // lef,back char,min,max a {,n}? + ntNotonelazy = 7 // lef,back char,min,max .{,n}? + ntSetlazy = 8 // lef,back set,min,max [\d]{,n}? + ntOne = 9 // lef char a + ntNotone = 10 // lef char [^a] + ntSet = 11 // lef set [a-z\s] \w \s \d + ntMulti = 12 // lef string abcd + ntRef = 13 // lef group \# + ntBol = 14 // ^ + ntEol = 15 // $ + ntBoundary = 16 // \b + ntNonboundary = 17 // \B + ntBeginning = 18 // \A + ntStart = 19 // \G + ntEndZ = 20 // \Z + ntEnd = 21 // \Z + + // Interior nodes do not correspond to primitive operations, but + // control structures compositing other operations + + // Concat and alternate take n children, and can run forward or backwards + + ntNothing = 22 // [] + ntEmpty = 23 // () + ntAlternate = 24 // a|b + ntConcatenate = 25 // ab + ntLoop = 26 // m,x * + ? {,} + ntLazyloop = 27 // m,x *? +? ?? {,}? + ntCapture = 28 // n () + ntGroup = 29 // (?:) + ntRequire = 30 // (?=) (?<=) + ntPrevent = 31 // (?!) (?<!) + ntGreedy = 32 // (?>) (?<) + ntTestref = 33 // (?(n) | ) + ntTestgroup = 34 // (?(...) | ) + + ntECMABoundary = 41 // \b + ntNonECMABoundary = 42 // \B +) + +func newRegexNode(t nodeType, opt RegexOptions) *regexNode { + return ®exNode{ + t: t, + options: opt, + } +} + +func newRegexNodeCh(t nodeType, opt RegexOptions, ch rune) *regexNode { + return ®exNode{ + t: t, + options: opt, + ch: ch, + } +} + +func newRegexNodeStr(t nodeType, opt RegexOptions, str []rune) *regexNode { + return ®exNode{ + t: t, + options: opt, + str: str, + } +} + +func newRegexNodeSet(t nodeType, opt RegexOptions, set *CharSet) *regexNode { + return ®exNode{ + t: t, + options: opt, + set: set, + } +} + +func newRegexNodeM(t nodeType, opt RegexOptions, m int) *regexNode { + return ®exNode{ + t: t, + options: opt, + m: m, + } +} +func newRegexNodeMN(t nodeType, opt RegexOptions, m, n int) *regexNode { + return ®exNode{ + t: t, + options: opt, + m: m, + n: n, + } +} + +func (n *regexNode) writeStrToBuf(buf *bytes.Buffer) { + for i := 0; i < len(n.str); i++ { + buf.WriteRune(n.str[i]) + } +} + +func (n *regexNode) addChild(child *regexNode) { + reduced := child.reduce() + n.children = append(n.children, reduced) + reduced.next = n +} + +func (n *regexNode) insertChildren(afterIndex int, nodes []*regexNode) { + newChildren := make([]*regexNode, 0, len(n.children)+len(nodes)) + n.children = append(append(append(newChildren, n.children[:afterIndex]...), nodes...), n.children[afterIndex:]...) +} + +// removes children including the start but not the end index +func (n *regexNode) removeChildren(startIndex, endIndex int) { + n.children = append(n.children[:startIndex], n.children[endIndex:]...) +} + +// Pass type as OneLazy or OneLoop +func (n *regexNode) makeRep(t nodeType, min, max int) { + n.t += (t - ntOne) + n.m = min + n.n = max +} + +func (n *regexNode) reduce() *regexNode { + switch n.t { + case ntAlternate: + return n.reduceAlternation() + + case ntConcatenate: + return n.reduceConcatenation() + + case ntLoop, ntLazyloop: + return n.reduceRep() + + case ntGroup: + return n.reduceGroup() + + case ntSet, ntSetloop: + return n.reduceSet() + + default: + return n + } +} + +// Basic optimization. Single-letter alternations can be replaced +// by faster set specifications, and nested alternations with no +// intervening operators can be flattened: +// +// a|b|c|def|g|h -> [a-c]|def|[gh] +// apple|(?:orange|pear)|grape -> apple|orange|pear|grape +func (n *regexNode) reduceAlternation() *regexNode { + if len(n.children) == 0 { + return newRegexNode(ntNothing, n.options) + } + + wasLastSet := false + lastNodeCannotMerge := false + var optionsLast RegexOptions + var i, j int + + for i, j = 0, 0; i < len(n.children); i, j = i+1, j+1 { + at := n.children[i] + + if j < i { + n.children[j] = at + } + + for { + if at.t == ntAlternate { + for k := 0; k < len(at.children); k++ { + at.children[k].next = n + } + n.insertChildren(i+1, at.children) + + j-- + } else if at.t == ntSet || at.t == ntOne { + // Cannot merge sets if L or I options differ, or if either are negated. + optionsAt := at.options & (RightToLeft | IgnoreCase) + + if at.t == ntSet { + if !wasLastSet || optionsLast != optionsAt || lastNodeCannotMerge || !at.set.IsMergeable() { + wasLastSet = true + lastNodeCannotMerge = !at.set.IsMergeable() + optionsLast = optionsAt + break + } + } else if !wasLastSet || optionsLast != optionsAt || lastNodeCannotMerge { + wasLastSet = true + lastNodeCannotMerge = false + optionsLast = optionsAt + break + } + + // The last node was a Set or a One, we're a Set or One and our options are the same. + // Merge the two nodes. + j-- + prev := n.children[j] + + var prevCharClass *CharSet + if prev.t == ntOne { + prevCharClass = &CharSet{} + prevCharClass.addChar(prev.ch) + } else { + prevCharClass = prev.set + } + + if at.t == ntOne { + prevCharClass.addChar(at.ch) + } else { + prevCharClass.addSet(*at.set) + } + + prev.t = ntSet + prev.set = prevCharClass + } else if at.t == ntNothing { + j-- + } else { + wasLastSet = false + lastNodeCannotMerge = false + } + break + } + } + + if j < i { + n.removeChildren(j, i) + } + + return n.stripEnation(ntNothing) +} + +// Basic optimization. Adjacent strings can be concatenated. +// +// (?:abc)(?:def) -> abcdef +func (n *regexNode) reduceConcatenation() *regexNode { + // Eliminate empties and concat adjacent strings/chars + + var optionsLast RegexOptions + var optionsAt RegexOptions + var i, j int + + if len(n.children) == 0 { + return newRegexNode(ntEmpty, n.options) + } + + wasLastString := false + + for i, j = 0, 0; i < len(n.children); i, j = i+1, j+1 { + var at, prev *regexNode + + at = n.children[i] + + if j < i { + n.children[j] = at + } + + if at.t == ntConcatenate && + ((at.options & RightToLeft) == (n.options & RightToLeft)) { + for k := 0; k < len(at.children); k++ { + at.children[k].next = n + } + + //insert at.children at i+1 index in n.children + n.insertChildren(i+1, at.children) + + j-- + } else if at.t == ntMulti || at.t == ntOne { + // Cannot merge strings if L or I options differ + optionsAt = at.options & (RightToLeft | IgnoreCase) + + if !wasLastString || optionsLast != optionsAt { + wasLastString = true + optionsLast = optionsAt + continue + } + + j-- + prev = n.children[j] + + if prev.t == ntOne { + prev.t = ntMulti + prev.str = []rune{prev.ch} + } + + if (optionsAt & RightToLeft) == 0 { + if at.t == ntOne { + prev.str = append(prev.str, at.ch) + } else { + prev.str = append(prev.str, at.str...) + } + } else { + if at.t == ntOne { + // insert at the front by expanding our slice, copying the data over, and then setting the value + prev.str = append(prev.str, 0) + copy(prev.str[1:], prev.str) + prev.str[0] = at.ch + } else { + //insert at the front...this one we'll make a new slice and copy both into it + merge := make([]rune, len(prev.str)+len(at.str)) + copy(merge, at.str) + copy(merge[len(at.str):], prev.str) + prev.str = merge + } + } + } else if at.t == ntEmpty { + j-- + } else { + wasLastString = false + } + } + + if j < i { + // remove indices j through i from the children + n.removeChildren(j, i) + } + + return n.stripEnation(ntEmpty) +} + +// Nested repeaters just get multiplied with each other if they're not +// too lumpy +func (n *regexNode) reduceRep() *regexNode { + + u := n + t := n.t + min := n.m + max := n.n + + for { + if len(u.children) == 0 { + break + } + + child := u.children[0] + + // multiply reps of the same type only + if child.t != t { + childType := child.t + + if !(childType >= ntOneloop && childType <= ntSetloop && t == ntLoop || + childType >= ntOnelazy && childType <= ntSetlazy && t == ntLazyloop) { + break + } + } + + // child can be too lumpy to blur, e.g., (a {100,105}) {3} or (a {2,})? + // [but things like (a {2,})+ are not too lumpy...] + if u.m == 0 && child.m > 1 || child.n < child.m*2 { + break + } + + u = child + if u.m > 0 { + if (math.MaxInt32-1)/u.m < min { + u.m = math.MaxInt32 + } else { + u.m = u.m * min + } + } + if u.n > 0 { + if (math.MaxInt32-1)/u.n < max { + u.n = math.MaxInt32 + } else { + u.n = u.n * max + } + } + } + + if math.MaxInt32 == min { + return newRegexNode(ntNothing, n.options) + } + return u + +} + +// Simple optimization. If a concatenation or alternation has only +// one child strip out the intermediate node. If it has zero children, +// turn it into an empty. +func (n *regexNode) stripEnation(emptyType nodeType) *regexNode { + switch len(n.children) { + case 0: + return newRegexNode(emptyType, n.options) + case 1: + return n.children[0] + default: + return n + } +} + +func (n *regexNode) reduceGroup() *regexNode { + u := n + + for u.t == ntGroup { + u = u.children[0] + } + + return u +} + +// Simple optimization. If a set is a singleton, an inverse singleton, +// or empty, it's transformed accordingly. +func (n *regexNode) reduceSet() *regexNode { + // Extract empty-set, one and not-one case as special + + if n.set == nil { + n.t = ntNothing + } else if n.set.IsSingleton() { + n.ch = n.set.SingletonChar() + n.set = nil + n.t += (ntOne - ntSet) + } else if n.set.IsSingletonInverse() { + n.ch = n.set.SingletonChar() + n.set = nil + n.t += (ntNotone - ntSet) + } + + return n +} + +func (n *regexNode) reverseLeft() *regexNode { + if n.options&RightToLeft != 0 && n.t == ntConcatenate && len(n.children) > 0 { + //reverse children order + for left, right := 0, len(n.children)-1; left < right; left, right = left+1, right-1 { + n.children[left], n.children[right] = n.children[right], n.children[left] + } + } + + return n +} + +func (n *regexNode) makeQuantifier(lazy bool, min, max int) *regexNode { + if min == 0 && max == 0 { + return newRegexNode(ntEmpty, n.options) + } + + if min == 1 && max == 1 { + return n + } + + switch n.t { + case ntOne, ntNotone, ntSet: + if lazy { + n.makeRep(Onelazy, min, max) + } else { + n.makeRep(Oneloop, min, max) + } + return n + + default: + var t nodeType + if lazy { + t = ntLazyloop + } else { + t = ntLoop + } + result := newRegexNodeMN(t, n.options, min, max) + result.addChild(n) + return result + } +} + +// debug functions + +var typeStr = []string{ + "Onerep", "Notonerep", "Setrep", + "Oneloop", "Notoneloop", "Setloop", + "Onelazy", "Notonelazy", "Setlazy", + "One", "Notone", "Set", + "Multi", "Ref", + "Bol", "Eol", "Boundary", "Nonboundary", + "Beginning", "Start", "EndZ", "End", + "Nothing", "Empty", + "Alternate", "Concatenate", + "Loop", "Lazyloop", + "Capture", "Group", "Require", "Prevent", "Greedy", + "Testref", "Testgroup", + "Unknown", "Unknown", "Unknown", + "Unknown", "Unknown", "Unknown", + "ECMABoundary", "NonECMABoundary", +} + +func (n *regexNode) description() string { + buf := &bytes.Buffer{} + + buf.WriteString(typeStr[n.t]) + + if (n.options & ExplicitCapture) != 0 { + buf.WriteString("-C") + } + if (n.options & IgnoreCase) != 0 { + buf.WriteString("-I") + } + if (n.options & RightToLeft) != 0 { + buf.WriteString("-L") + } + if (n.options & Multiline) != 0 { + buf.WriteString("-M") + } + if (n.options & Singleline) != 0 { + buf.WriteString("-S") + } + if (n.options & IgnorePatternWhitespace) != 0 { + buf.WriteString("-X") + } + if (n.options & ECMAScript) != 0 { + buf.WriteString("-E") + } + + switch n.t { + case ntOneloop, ntNotoneloop, ntOnelazy, ntNotonelazy, ntOne, ntNotone: + buf.WriteString("(Ch = " + CharDescription(n.ch) + ")") + break + case ntCapture: + buf.WriteString("(index = " + strconv.Itoa(n.m) + ", unindex = " + strconv.Itoa(n.n) + ")") + break + case ntRef, ntTestref: + buf.WriteString("(index = " + strconv.Itoa(n.m) + ")") + break + case ntMulti: + fmt.Fprintf(buf, "(String = %s)", string(n.str)) + break + case ntSet, ntSetloop, ntSetlazy: + buf.WriteString("(Set = " + n.set.String() + ")") + break + } + + switch n.t { + case ntOneloop, ntNotoneloop, ntOnelazy, ntNotonelazy, ntSetloop, ntSetlazy, ntLoop, ntLazyloop: + buf.WriteString("(Min = ") + buf.WriteString(strconv.Itoa(n.m)) + buf.WriteString(", Max = ") + if n.n == math.MaxInt32 { + buf.WriteString("inf") + } else { + buf.WriteString(strconv.Itoa(n.n)) + } + buf.WriteString(")") + + break + } + + return buf.String() +} + +var padSpace = []byte(" ") + +func (t *RegexTree) Dump() string { + return t.root.dump() +} + +func (n *regexNode) dump() string { + var stack []int + CurNode := n + CurChild := 0 + + buf := bytes.NewBufferString(CurNode.description()) + buf.WriteRune('\n') + + for { + if CurNode.children != nil && CurChild < len(CurNode.children) { + stack = append(stack, CurChild+1) + CurNode = CurNode.children[CurChild] + CurChild = 0 + + Depth := len(stack) + if Depth > 32 { + Depth = 32 + } + buf.Write(padSpace[:Depth]) + buf.WriteString(CurNode.description()) + buf.WriteRune('\n') + } else { + if len(stack) == 0 { + break + } + + CurChild = stack[len(stack)-1] + stack = stack[:len(stack)-1] + CurNode = CurNode.next + } + } + return buf.String() +} |
