summaryrefslogtreecommitdiff
path: root/vendor/github.com/dlclark/regexp2/syntax/tree.go
diff options
context:
space:
mode:
authorMitja Felicijan <mitja.felicijan@gmail.com>2024-10-25 00:47:47 +0200
committerMitja Felicijan <mitja.felicijan@gmail.com>2024-10-25 00:47:47 +0200
commitc6cc0108ca7738023b45e0eeac0fa2390532dd93 (patch)
tree36890e6cd3091bbab8efbe686cc56f467f645bfd /vendor/github.com/dlclark/regexp2/syntax/tree.go
parent0130404a1dc663d4aa68d780c9bcb23a4243e68d (diff)
downloadjbmafp-c6cc0108ca7738023b45e0eeac0fa2390532dd93.tar.gz
Added vendor lock on depsHEADmaster
Diffstat (limited to 'vendor/github.com/dlclark/regexp2/syntax/tree.go')
-rw-r--r--vendor/github.com/dlclark/regexp2/syntax/tree.go654
1 files changed, 654 insertions, 0 deletions
diff --git a/vendor/github.com/dlclark/regexp2/syntax/tree.go b/vendor/github.com/dlclark/regexp2/syntax/tree.go
new file mode 100644
index 0000000..ea28829
--- /dev/null
+++ b/vendor/github.com/dlclark/regexp2/syntax/tree.go
@@ -0,0 +1,654 @@
+package syntax
+
+import (
+ "bytes"
+ "fmt"
+ "math"
+ "strconv"
+)
+
+type RegexTree struct {
+ root *regexNode
+ caps map[int]int
+ capnumlist []int
+ captop int
+ Capnames map[string]int
+ Caplist []string
+ options RegexOptions
+}
+
+// It is built into a parsed tree for a regular expression.
+
+// Implementation notes:
+//
+// Since the node tree is a temporary data structure only used
+// during compilation of the regexp to integer codes, it's
+// designed for clarity and convenience rather than
+// space efficiency.
+//
+// RegexNodes are built into a tree, linked by the n.children list.
+// Each node also has a n.parent and n.ichild member indicating
+// its parent and which child # it is in its parent's list.
+//
+// RegexNodes come in as many types as there are constructs in
+// a regular expression, for example, "concatenate", "alternate",
+// "one", "rept", "group". There are also node types for basic
+// peephole optimizations, e.g., "onerep", "notsetrep", etc.
+//
+// Because perl 5 allows "lookback" groups that scan backwards,
+// each node also gets a "direction". Normally the value of
+// boolean n.backward = false.
+//
+// During parsing, top-level nodes are also stacked onto a parse
+// stack (a stack of trees). For this purpose we have a n.next
+// pointer. [Note that to save a few bytes, we could overload the
+// n.parent pointer instead.]
+//
+// On the parse stack, each tree has a "role" - basically, the
+// nonterminal in the grammar that the parser has currently
+// assigned to the tree. That code is stored in n.role.
+//
+// Finally, some of the different kinds of nodes have data.
+// Two integers (for the looping constructs) are stored in
+// n.operands, an an object (either a string or a set)
+// is stored in n.data
+type regexNode struct {
+ t nodeType
+ children []*regexNode
+ str []rune
+ set *CharSet
+ ch rune
+ m int
+ n int
+ options RegexOptions
+ next *regexNode
+}
+
+type nodeType int32
+
+const (
+ // The following are leaves, and correspond to primitive operations
+
+ ntOnerep nodeType = 0 // lef,back char,min,max a {n}
+ ntNotonerep = 1 // lef,back char,min,max .{n}
+ ntSetrep = 2 // lef,back set,min,max [\d]{n}
+ ntOneloop = 3 // lef,back char,min,max a {,n}
+ ntNotoneloop = 4 // lef,back char,min,max .{,n}
+ ntSetloop = 5 // lef,back set,min,max [\d]{,n}
+ ntOnelazy = 6 // lef,back char,min,max a {,n}?
+ ntNotonelazy = 7 // lef,back char,min,max .{,n}?
+ ntSetlazy = 8 // lef,back set,min,max [\d]{,n}?
+ ntOne = 9 // lef char a
+ ntNotone = 10 // lef char [^a]
+ ntSet = 11 // lef set [a-z\s] \w \s \d
+ ntMulti = 12 // lef string abcd
+ ntRef = 13 // lef group \#
+ ntBol = 14 // ^
+ ntEol = 15 // $
+ ntBoundary = 16 // \b
+ ntNonboundary = 17 // \B
+ ntBeginning = 18 // \A
+ ntStart = 19 // \G
+ ntEndZ = 20 // \Z
+ ntEnd = 21 // \Z
+
+ // Interior nodes do not correspond to primitive operations, but
+ // control structures compositing other operations
+
+ // Concat and alternate take n children, and can run forward or backwards
+
+ ntNothing = 22 // []
+ ntEmpty = 23 // ()
+ ntAlternate = 24 // a|b
+ ntConcatenate = 25 // ab
+ ntLoop = 26 // m,x * + ? {,}
+ ntLazyloop = 27 // m,x *? +? ?? {,}?
+ ntCapture = 28 // n ()
+ ntGroup = 29 // (?:)
+ ntRequire = 30 // (?=) (?<=)
+ ntPrevent = 31 // (?!) (?<!)
+ ntGreedy = 32 // (?>) (?<)
+ ntTestref = 33 // (?(n) | )
+ ntTestgroup = 34 // (?(...) | )
+
+ ntECMABoundary = 41 // \b
+ ntNonECMABoundary = 42 // \B
+)
+
+func newRegexNode(t nodeType, opt RegexOptions) *regexNode {
+ return &regexNode{
+ t: t,
+ options: opt,
+ }
+}
+
+func newRegexNodeCh(t nodeType, opt RegexOptions, ch rune) *regexNode {
+ return &regexNode{
+ t: t,
+ options: opt,
+ ch: ch,
+ }
+}
+
+func newRegexNodeStr(t nodeType, opt RegexOptions, str []rune) *regexNode {
+ return &regexNode{
+ t: t,
+ options: opt,
+ str: str,
+ }
+}
+
+func newRegexNodeSet(t nodeType, opt RegexOptions, set *CharSet) *regexNode {
+ return &regexNode{
+ t: t,
+ options: opt,
+ set: set,
+ }
+}
+
+func newRegexNodeM(t nodeType, opt RegexOptions, m int) *regexNode {
+ return &regexNode{
+ t: t,
+ options: opt,
+ m: m,
+ }
+}
+func newRegexNodeMN(t nodeType, opt RegexOptions, m, n int) *regexNode {
+ return &regexNode{
+ t: t,
+ options: opt,
+ m: m,
+ n: n,
+ }
+}
+
+func (n *regexNode) writeStrToBuf(buf *bytes.Buffer) {
+ for i := 0; i < len(n.str); i++ {
+ buf.WriteRune(n.str[i])
+ }
+}
+
+func (n *regexNode) addChild(child *regexNode) {
+ reduced := child.reduce()
+ n.children = append(n.children, reduced)
+ reduced.next = n
+}
+
+func (n *regexNode) insertChildren(afterIndex int, nodes []*regexNode) {
+ newChildren := make([]*regexNode, 0, len(n.children)+len(nodes))
+ n.children = append(append(append(newChildren, n.children[:afterIndex]...), nodes...), n.children[afterIndex:]...)
+}
+
+// removes children including the start but not the end index
+func (n *regexNode) removeChildren(startIndex, endIndex int) {
+ n.children = append(n.children[:startIndex], n.children[endIndex:]...)
+}
+
+// Pass type as OneLazy or OneLoop
+func (n *regexNode) makeRep(t nodeType, min, max int) {
+ n.t += (t - ntOne)
+ n.m = min
+ n.n = max
+}
+
+func (n *regexNode) reduce() *regexNode {
+ switch n.t {
+ case ntAlternate:
+ return n.reduceAlternation()
+
+ case ntConcatenate:
+ return n.reduceConcatenation()
+
+ case ntLoop, ntLazyloop:
+ return n.reduceRep()
+
+ case ntGroup:
+ return n.reduceGroup()
+
+ case ntSet, ntSetloop:
+ return n.reduceSet()
+
+ default:
+ return n
+ }
+}
+
+// Basic optimization. Single-letter alternations can be replaced
+// by faster set specifications, and nested alternations with no
+// intervening operators can be flattened:
+//
+// a|b|c|def|g|h -> [a-c]|def|[gh]
+// apple|(?:orange|pear)|grape -> apple|orange|pear|grape
+func (n *regexNode) reduceAlternation() *regexNode {
+ if len(n.children) == 0 {
+ return newRegexNode(ntNothing, n.options)
+ }
+
+ wasLastSet := false
+ lastNodeCannotMerge := false
+ var optionsLast RegexOptions
+ var i, j int
+
+ for i, j = 0, 0; i < len(n.children); i, j = i+1, j+1 {
+ at := n.children[i]
+
+ if j < i {
+ n.children[j] = at
+ }
+
+ for {
+ if at.t == ntAlternate {
+ for k := 0; k < len(at.children); k++ {
+ at.children[k].next = n
+ }
+ n.insertChildren(i+1, at.children)
+
+ j--
+ } else if at.t == ntSet || at.t == ntOne {
+ // Cannot merge sets if L or I options differ, or if either are negated.
+ optionsAt := at.options & (RightToLeft | IgnoreCase)
+
+ if at.t == ntSet {
+ if !wasLastSet || optionsLast != optionsAt || lastNodeCannotMerge || !at.set.IsMergeable() {
+ wasLastSet = true
+ lastNodeCannotMerge = !at.set.IsMergeable()
+ optionsLast = optionsAt
+ break
+ }
+ } else if !wasLastSet || optionsLast != optionsAt || lastNodeCannotMerge {
+ wasLastSet = true
+ lastNodeCannotMerge = false
+ optionsLast = optionsAt
+ break
+ }
+
+ // The last node was a Set or a One, we're a Set or One and our options are the same.
+ // Merge the two nodes.
+ j--
+ prev := n.children[j]
+
+ var prevCharClass *CharSet
+ if prev.t == ntOne {
+ prevCharClass = &CharSet{}
+ prevCharClass.addChar(prev.ch)
+ } else {
+ prevCharClass = prev.set
+ }
+
+ if at.t == ntOne {
+ prevCharClass.addChar(at.ch)
+ } else {
+ prevCharClass.addSet(*at.set)
+ }
+
+ prev.t = ntSet
+ prev.set = prevCharClass
+ } else if at.t == ntNothing {
+ j--
+ } else {
+ wasLastSet = false
+ lastNodeCannotMerge = false
+ }
+ break
+ }
+ }
+
+ if j < i {
+ n.removeChildren(j, i)
+ }
+
+ return n.stripEnation(ntNothing)
+}
+
+// Basic optimization. Adjacent strings can be concatenated.
+//
+// (?:abc)(?:def) -> abcdef
+func (n *regexNode) reduceConcatenation() *regexNode {
+ // Eliminate empties and concat adjacent strings/chars
+
+ var optionsLast RegexOptions
+ var optionsAt RegexOptions
+ var i, j int
+
+ if len(n.children) == 0 {
+ return newRegexNode(ntEmpty, n.options)
+ }
+
+ wasLastString := false
+
+ for i, j = 0, 0; i < len(n.children); i, j = i+1, j+1 {
+ var at, prev *regexNode
+
+ at = n.children[i]
+
+ if j < i {
+ n.children[j] = at
+ }
+
+ if at.t == ntConcatenate &&
+ ((at.options & RightToLeft) == (n.options & RightToLeft)) {
+ for k := 0; k < len(at.children); k++ {
+ at.children[k].next = n
+ }
+
+ //insert at.children at i+1 index in n.children
+ n.insertChildren(i+1, at.children)
+
+ j--
+ } else if at.t == ntMulti || at.t == ntOne {
+ // Cannot merge strings if L or I options differ
+ optionsAt = at.options & (RightToLeft | IgnoreCase)
+
+ if !wasLastString || optionsLast != optionsAt {
+ wasLastString = true
+ optionsLast = optionsAt
+ continue
+ }
+
+ j--
+ prev = n.children[j]
+
+ if prev.t == ntOne {
+ prev.t = ntMulti
+ prev.str = []rune{prev.ch}
+ }
+
+ if (optionsAt & RightToLeft) == 0 {
+ if at.t == ntOne {
+ prev.str = append(prev.str, at.ch)
+ } else {
+ prev.str = append(prev.str, at.str...)
+ }
+ } else {
+ if at.t == ntOne {
+ // insert at the front by expanding our slice, copying the data over, and then setting the value
+ prev.str = append(prev.str, 0)
+ copy(prev.str[1:], prev.str)
+ prev.str[0] = at.ch
+ } else {
+ //insert at the front...this one we'll make a new slice and copy both into it
+ merge := make([]rune, len(prev.str)+len(at.str))
+ copy(merge, at.str)
+ copy(merge[len(at.str):], prev.str)
+ prev.str = merge
+ }
+ }
+ } else if at.t == ntEmpty {
+ j--
+ } else {
+ wasLastString = false
+ }
+ }
+
+ if j < i {
+ // remove indices j through i from the children
+ n.removeChildren(j, i)
+ }
+
+ return n.stripEnation(ntEmpty)
+}
+
+// Nested repeaters just get multiplied with each other if they're not
+// too lumpy
+func (n *regexNode) reduceRep() *regexNode {
+
+ u := n
+ t := n.t
+ min := n.m
+ max := n.n
+
+ for {
+ if len(u.children) == 0 {
+ break
+ }
+
+ child := u.children[0]
+
+ // multiply reps of the same type only
+ if child.t != t {
+ childType := child.t
+
+ if !(childType >= ntOneloop && childType <= ntSetloop && t == ntLoop ||
+ childType >= ntOnelazy && childType <= ntSetlazy && t == ntLazyloop) {
+ break
+ }
+ }
+
+ // child can be too lumpy to blur, e.g., (a {100,105}) {3} or (a {2,})?
+ // [but things like (a {2,})+ are not too lumpy...]
+ if u.m == 0 && child.m > 1 || child.n < child.m*2 {
+ break
+ }
+
+ u = child
+ if u.m > 0 {
+ if (math.MaxInt32-1)/u.m < min {
+ u.m = math.MaxInt32
+ } else {
+ u.m = u.m * min
+ }
+ }
+ if u.n > 0 {
+ if (math.MaxInt32-1)/u.n < max {
+ u.n = math.MaxInt32
+ } else {
+ u.n = u.n * max
+ }
+ }
+ }
+
+ if math.MaxInt32 == min {
+ return newRegexNode(ntNothing, n.options)
+ }
+ return u
+
+}
+
+// Simple optimization. If a concatenation or alternation has only
+// one child strip out the intermediate node. If it has zero children,
+// turn it into an empty.
+func (n *regexNode) stripEnation(emptyType nodeType) *regexNode {
+ switch len(n.children) {
+ case 0:
+ return newRegexNode(emptyType, n.options)
+ case 1:
+ return n.children[0]
+ default:
+ return n
+ }
+}
+
+func (n *regexNode) reduceGroup() *regexNode {
+ u := n
+
+ for u.t == ntGroup {
+ u = u.children[0]
+ }
+
+ return u
+}
+
+// Simple optimization. If a set is a singleton, an inverse singleton,
+// or empty, it's transformed accordingly.
+func (n *regexNode) reduceSet() *regexNode {
+ // Extract empty-set, one and not-one case as special
+
+ if n.set == nil {
+ n.t = ntNothing
+ } else if n.set.IsSingleton() {
+ n.ch = n.set.SingletonChar()
+ n.set = nil
+ n.t += (ntOne - ntSet)
+ } else if n.set.IsSingletonInverse() {
+ n.ch = n.set.SingletonChar()
+ n.set = nil
+ n.t += (ntNotone - ntSet)
+ }
+
+ return n
+}
+
+func (n *regexNode) reverseLeft() *regexNode {
+ if n.options&RightToLeft != 0 && n.t == ntConcatenate && len(n.children) > 0 {
+ //reverse children order
+ for left, right := 0, len(n.children)-1; left < right; left, right = left+1, right-1 {
+ n.children[left], n.children[right] = n.children[right], n.children[left]
+ }
+ }
+
+ return n
+}
+
+func (n *regexNode) makeQuantifier(lazy bool, min, max int) *regexNode {
+ if min == 0 && max == 0 {
+ return newRegexNode(ntEmpty, n.options)
+ }
+
+ if min == 1 && max == 1 {
+ return n
+ }
+
+ switch n.t {
+ case ntOne, ntNotone, ntSet:
+ if lazy {
+ n.makeRep(Onelazy, min, max)
+ } else {
+ n.makeRep(Oneloop, min, max)
+ }
+ return n
+
+ default:
+ var t nodeType
+ if lazy {
+ t = ntLazyloop
+ } else {
+ t = ntLoop
+ }
+ result := newRegexNodeMN(t, n.options, min, max)
+ result.addChild(n)
+ return result
+ }
+}
+
+// debug functions
+
+var typeStr = []string{
+ "Onerep", "Notonerep", "Setrep",
+ "Oneloop", "Notoneloop", "Setloop",
+ "Onelazy", "Notonelazy", "Setlazy",
+ "One", "Notone", "Set",
+ "Multi", "Ref",
+ "Bol", "Eol", "Boundary", "Nonboundary",
+ "Beginning", "Start", "EndZ", "End",
+ "Nothing", "Empty",
+ "Alternate", "Concatenate",
+ "Loop", "Lazyloop",
+ "Capture", "Group", "Require", "Prevent", "Greedy",
+ "Testref", "Testgroup",
+ "Unknown", "Unknown", "Unknown",
+ "Unknown", "Unknown", "Unknown",
+ "ECMABoundary", "NonECMABoundary",
+}
+
+func (n *regexNode) description() string {
+ buf := &bytes.Buffer{}
+
+ buf.WriteString(typeStr[n.t])
+
+ if (n.options & ExplicitCapture) != 0 {
+ buf.WriteString("-C")
+ }
+ if (n.options & IgnoreCase) != 0 {
+ buf.WriteString("-I")
+ }
+ if (n.options & RightToLeft) != 0 {
+ buf.WriteString("-L")
+ }
+ if (n.options & Multiline) != 0 {
+ buf.WriteString("-M")
+ }
+ if (n.options & Singleline) != 0 {
+ buf.WriteString("-S")
+ }
+ if (n.options & IgnorePatternWhitespace) != 0 {
+ buf.WriteString("-X")
+ }
+ if (n.options & ECMAScript) != 0 {
+ buf.WriteString("-E")
+ }
+
+ switch n.t {
+ case ntOneloop, ntNotoneloop, ntOnelazy, ntNotonelazy, ntOne, ntNotone:
+ buf.WriteString("(Ch = " + CharDescription(n.ch) + ")")
+ break
+ case ntCapture:
+ buf.WriteString("(index = " + strconv.Itoa(n.m) + ", unindex = " + strconv.Itoa(n.n) + ")")
+ break
+ case ntRef, ntTestref:
+ buf.WriteString("(index = " + strconv.Itoa(n.m) + ")")
+ break
+ case ntMulti:
+ fmt.Fprintf(buf, "(String = %s)", string(n.str))
+ break
+ case ntSet, ntSetloop, ntSetlazy:
+ buf.WriteString("(Set = " + n.set.String() + ")")
+ break
+ }
+
+ switch n.t {
+ case ntOneloop, ntNotoneloop, ntOnelazy, ntNotonelazy, ntSetloop, ntSetlazy, ntLoop, ntLazyloop:
+ buf.WriteString("(Min = ")
+ buf.WriteString(strconv.Itoa(n.m))
+ buf.WriteString(", Max = ")
+ if n.n == math.MaxInt32 {
+ buf.WriteString("inf")
+ } else {
+ buf.WriteString(strconv.Itoa(n.n))
+ }
+ buf.WriteString(")")
+
+ break
+ }
+
+ return buf.String()
+}
+
+var padSpace = []byte(" ")
+
+func (t *RegexTree) Dump() string {
+ return t.root.dump()
+}
+
+func (n *regexNode) dump() string {
+ var stack []int
+ CurNode := n
+ CurChild := 0
+
+ buf := bytes.NewBufferString(CurNode.description())
+ buf.WriteRune('\n')
+
+ for {
+ if CurNode.children != nil && CurChild < len(CurNode.children) {
+ stack = append(stack, CurChild+1)
+ CurNode = CurNode.children[CurChild]
+ CurChild = 0
+
+ Depth := len(stack)
+ if Depth > 32 {
+ Depth = 32
+ }
+ buf.Write(padSpace[:Depth])
+ buf.WriteString(CurNode.description())
+ buf.WriteRune('\n')
+ } else {
+ if len(stack) == 0 {
+ break
+ }
+
+ CurChild = stack[len(stack)-1]
+ stack = stack[:len(stack)-1]
+ CurNode = CurNode.next
+ }
+ }
+ return buf.String()
+}