summaryrefslogtreecommitdiff
path: root/vendor/github.com/dlclark/regexp2/syntax/writer.go
diff options
context:
space:
mode:
authorMitja Felicijan <mitja.felicijan@gmail.com>2024-10-25 00:47:47 +0200
committerMitja Felicijan <mitja.felicijan@gmail.com>2024-10-25 00:47:47 +0200
commitc6cc0108ca7738023b45e0eeac0fa2390532dd93 (patch)
tree36890e6cd3091bbab8efbe686cc56f467f645bfd /vendor/github.com/dlclark/regexp2/syntax/writer.go
parent0130404a1dc663d4aa68d780c9bcb23a4243e68d (diff)
downloadjbmafp-master.tar.gz
Added vendor lock on depsHEADmaster
Diffstat (limited to 'vendor/github.com/dlclark/regexp2/syntax/writer.go')
-rw-r--r--vendor/github.com/dlclark/regexp2/syntax/writer.go500
1 files changed, 500 insertions, 0 deletions
diff --git a/vendor/github.com/dlclark/regexp2/syntax/writer.go b/vendor/github.com/dlclark/regexp2/syntax/writer.go
new file mode 100644
index 0000000..a5aa11c
--- /dev/null
+++ b/vendor/github.com/dlclark/regexp2/syntax/writer.go
@@ -0,0 +1,500 @@
+package syntax
+
+import (
+ "bytes"
+ "fmt"
+ "math"
+ "os"
+)
+
+func Write(tree *RegexTree) (*Code, error) {
+ w := writer{
+ intStack: make([]int, 0, 32),
+ emitted: make([]int, 2),
+ stringhash: make(map[string]int),
+ sethash: make(map[string]int),
+ }
+
+ code, err := w.codeFromTree(tree)
+
+ if tree.options&Debug > 0 && code != nil {
+ os.Stdout.WriteString(code.Dump())
+ os.Stdout.WriteString("\n")
+ }
+
+ return code, err
+}
+
+type writer struct {
+ emitted []int
+
+ intStack []int
+ curpos int
+ stringhash map[string]int
+ stringtable [][]rune
+ sethash map[string]int
+ settable []*CharSet
+ counting bool
+ count int
+ trackcount int
+ caps map[int]int
+}
+
+const (
+ beforeChild nodeType = 64
+ afterChild = 128
+ //MaxPrefixSize is the largest number of runes we'll use for a BoyerMoyer prefix
+ MaxPrefixSize = 50
+)
+
+// The top level RegexCode generator. It does a depth-first walk
+// through the tree and calls EmitFragment to emits code before
+// and after each child of an interior node, and at each leaf.
+//
+// It runs two passes, first to count the size of the generated
+// code, and second to generate the code.
+//
+// We should time it against the alternative, which is
+// to just generate the code and grow the array as we go.
+func (w *writer) codeFromTree(tree *RegexTree) (*Code, error) {
+ var (
+ curNode *regexNode
+ curChild int
+ capsize int
+ )
+ // construct sparse capnum mapping if some numbers are unused
+
+ if tree.capnumlist == nil || tree.captop == len(tree.capnumlist) {
+ capsize = tree.captop
+ w.caps = nil
+ } else {
+ capsize = len(tree.capnumlist)
+ w.caps = tree.caps
+ for i := 0; i < len(tree.capnumlist); i++ {
+ w.caps[tree.capnumlist[i]] = i
+ }
+ }
+
+ w.counting = true
+
+ for {
+ if !w.counting {
+ w.emitted = make([]int, w.count)
+ }
+
+ curNode = tree.root
+ curChild = 0
+
+ w.emit1(Lazybranch, 0)
+
+ for {
+ if len(curNode.children) == 0 {
+ w.emitFragment(curNode.t, curNode, 0)
+ } else if curChild < len(curNode.children) {
+ w.emitFragment(curNode.t|beforeChild, curNode, curChild)
+
+ curNode = curNode.children[curChild]
+
+ w.pushInt(curChild)
+ curChild = 0
+ continue
+ }
+
+ if w.emptyStack() {
+ break
+ }
+
+ curChild = w.popInt()
+ curNode = curNode.next
+
+ w.emitFragment(curNode.t|afterChild, curNode, curChild)
+ curChild++
+ }
+
+ w.patchJump(0, w.curPos())
+ w.emit(Stop)
+
+ if !w.counting {
+ break
+ }
+
+ w.counting = false
+ }
+
+ fcPrefix := getFirstCharsPrefix(tree)
+ prefix := getPrefix(tree)
+ rtl := (tree.options & RightToLeft) != 0
+
+ var bmPrefix *BmPrefix
+ //TODO: benchmark string prefixes
+ if prefix != nil && len(prefix.PrefixStr) > 0 && MaxPrefixSize > 0 {
+ if len(prefix.PrefixStr) > MaxPrefixSize {
+ // limit prefix changes to 10k
+ prefix.PrefixStr = prefix.PrefixStr[:MaxPrefixSize]
+ }
+ bmPrefix = newBmPrefix(prefix.PrefixStr, prefix.CaseInsensitive, rtl)
+ } else {
+ bmPrefix = nil
+ }
+
+ return &Code{
+ Codes: w.emitted,
+ Strings: w.stringtable,
+ Sets: w.settable,
+ TrackCount: w.trackcount,
+ Caps: w.caps,
+ Capsize: capsize,
+ FcPrefix: fcPrefix,
+ BmPrefix: bmPrefix,
+ Anchors: getAnchors(tree),
+ RightToLeft: rtl,
+ }, nil
+}
+
+// The main RegexCode generator. It does a depth-first walk
+// through the tree and calls EmitFragment to emits code before
+// and after each child of an interior node, and at each leaf.
+func (w *writer) emitFragment(nodetype nodeType, node *regexNode, curIndex int) error {
+ bits := InstOp(0)
+
+ if nodetype <= ntRef {
+ if (node.options & RightToLeft) != 0 {
+ bits |= Rtl
+ }
+ if (node.options & IgnoreCase) != 0 {
+ bits |= Ci
+ }
+ }
+ ntBits := nodeType(bits)
+
+ switch nodetype {
+ case ntConcatenate | beforeChild, ntConcatenate | afterChild, ntEmpty:
+ break
+
+ case ntAlternate | beforeChild:
+ if curIndex < len(node.children)-1 {
+ w.pushInt(w.curPos())
+ w.emit1(Lazybranch, 0)
+ }
+
+ case ntAlternate | afterChild:
+ if curIndex < len(node.children)-1 {
+ lbPos := w.popInt()
+ w.pushInt(w.curPos())
+ w.emit1(Goto, 0)
+ w.patchJump(lbPos, w.curPos())
+ } else {
+ for i := 0; i < curIndex; i++ {
+ w.patchJump(w.popInt(), w.curPos())
+ }
+ }
+ break
+
+ case ntTestref | beforeChild:
+ if curIndex == 0 {
+ w.emit(Setjump)
+ w.pushInt(w.curPos())
+ w.emit1(Lazybranch, 0)
+ w.emit1(Testref, w.mapCapnum(node.m))
+ w.emit(Forejump)
+ }
+
+ case ntTestref | afterChild:
+ if curIndex == 0 {
+ branchpos := w.popInt()
+ w.pushInt(w.curPos())
+ w.emit1(Goto, 0)
+ w.patchJump(branchpos, w.curPos())
+ w.emit(Forejump)
+ if len(node.children) <= 1 {
+ w.patchJump(w.popInt(), w.curPos())
+ }
+ } else if curIndex == 1 {
+ w.patchJump(w.popInt(), w.curPos())
+ }
+
+ case ntTestgroup | beforeChild:
+ if curIndex == 0 {
+ w.emit(Setjump)
+ w.emit(Setmark)
+ w.pushInt(w.curPos())
+ w.emit1(Lazybranch, 0)
+ }
+
+ case ntTestgroup | afterChild:
+ if curIndex == 0 {
+ w.emit(Getmark)
+ w.emit(Forejump)
+ } else if curIndex == 1 {
+ Branchpos := w.popInt()
+ w.pushInt(w.curPos())
+ w.emit1(Goto, 0)
+ w.patchJump(Branchpos, w.curPos())
+ w.emit(Getmark)
+ w.emit(Forejump)
+ if len(node.children) <= 2 {
+ w.patchJump(w.popInt(), w.curPos())
+ }
+ } else if curIndex == 2 {
+ w.patchJump(w.popInt(), w.curPos())
+ }
+
+ case ntLoop | beforeChild, ntLazyloop | beforeChild:
+
+ if node.n < math.MaxInt32 || node.m > 1 {
+ if node.m == 0 {
+ w.emit1(Nullcount, 0)
+ } else {
+ w.emit1(Setcount, 1-node.m)
+ }
+ } else if node.m == 0 {
+ w.emit(Nullmark)
+ } else {
+ w.emit(Setmark)
+ }
+
+ if node.m == 0 {
+ w.pushInt(w.curPos())
+ w.emit1(Goto, 0)
+ }
+ w.pushInt(w.curPos())
+
+ case ntLoop | afterChild, ntLazyloop | afterChild:
+
+ startJumpPos := w.curPos()
+ lazy := (nodetype - (ntLoop | afterChild))
+
+ if node.n < math.MaxInt32 || node.m > 1 {
+ if node.n == math.MaxInt32 {
+ w.emit2(InstOp(Branchcount+lazy), w.popInt(), math.MaxInt32)
+ } else {
+ w.emit2(InstOp(Branchcount+lazy), w.popInt(), node.n-node.m)
+ }
+ } else {
+ w.emit1(InstOp(Branchmark+lazy), w.popInt())
+ }
+
+ if node.m == 0 {
+ w.patchJump(w.popInt(), startJumpPos)
+ }
+
+ case ntGroup | beforeChild, ntGroup | afterChild:
+
+ case ntCapture | beforeChild:
+ w.emit(Setmark)
+
+ case ntCapture | afterChild:
+ w.emit2(Capturemark, w.mapCapnum(node.m), w.mapCapnum(node.n))
+
+ case ntRequire | beforeChild:
+ // NOTE: the following line causes lookahead/lookbehind to be
+ // NON-BACKTRACKING. It can be commented out with (*)
+ w.emit(Setjump)
+
+ w.emit(Setmark)
+
+ case ntRequire | afterChild:
+ w.emit(Getmark)
+
+ // NOTE: the following line causes lookahead/lookbehind to be
+ // NON-BACKTRACKING. It can be commented out with (*)
+ w.emit(Forejump)
+
+ case ntPrevent | beforeChild:
+ w.emit(Setjump)
+ w.pushInt(w.curPos())
+ w.emit1(Lazybranch, 0)
+
+ case ntPrevent | afterChild:
+ w.emit(Backjump)
+ w.patchJump(w.popInt(), w.curPos())
+ w.emit(Forejump)
+
+ case ntGreedy | beforeChild:
+ w.emit(Setjump)
+
+ case ntGreedy | afterChild:
+ w.emit(Forejump)
+
+ case ntOne, ntNotone:
+ w.emit1(InstOp(node.t|ntBits), int(node.ch))
+
+ case ntNotoneloop, ntNotonelazy, ntOneloop, ntOnelazy:
+ if node.m > 0 {
+ if node.t == ntOneloop || node.t == ntOnelazy {
+ w.emit2(Onerep|bits, int(node.ch), node.m)
+ } else {
+ w.emit2(Notonerep|bits, int(node.ch), node.m)
+ }
+ }
+ if node.n > node.m {
+ if node.n == math.MaxInt32 {
+ w.emit2(InstOp(node.t|ntBits), int(node.ch), math.MaxInt32)
+ } else {
+ w.emit2(InstOp(node.t|ntBits), int(node.ch), node.n-node.m)
+ }
+ }
+
+ case ntSetloop, ntSetlazy:
+ if node.m > 0 {
+ w.emit2(Setrep|bits, w.setCode(node.set), node.m)
+ }
+ if node.n > node.m {
+ if node.n == math.MaxInt32 {
+ w.emit2(InstOp(node.t|ntBits), w.setCode(node.set), math.MaxInt32)
+ } else {
+ w.emit2(InstOp(node.t|ntBits), w.setCode(node.set), node.n-node.m)
+ }
+ }
+
+ case ntMulti:
+ w.emit1(InstOp(node.t|ntBits), w.stringCode(node.str))
+
+ case ntSet:
+ w.emit1(InstOp(node.t|ntBits), w.setCode(node.set))
+
+ case ntRef:
+ w.emit1(InstOp(node.t|ntBits), w.mapCapnum(node.m))
+
+ case ntNothing, ntBol, ntEol, ntBoundary, ntNonboundary, ntECMABoundary, ntNonECMABoundary, ntBeginning, ntStart, ntEndZ, ntEnd:
+ w.emit(InstOp(node.t))
+
+ default:
+ return fmt.Errorf("unexpected opcode in regular expression generation: %v", nodetype)
+ }
+
+ return nil
+}
+
+// To avoid recursion, we use a simple integer stack.
+// This is the push.
+func (w *writer) pushInt(i int) {
+ w.intStack = append(w.intStack, i)
+}
+
+// Returns true if the stack is empty.
+func (w *writer) emptyStack() bool {
+ return len(w.intStack) == 0
+}
+
+// This is the pop.
+func (w *writer) popInt() int {
+ //get our item
+ idx := len(w.intStack) - 1
+ i := w.intStack[idx]
+ //trim our slice
+ w.intStack = w.intStack[:idx]
+ return i
+}
+
+// Returns the current position in the emitted code.
+func (w *writer) curPos() int {
+ return w.curpos
+}
+
+// Fixes up a jump instruction at the specified offset
+// so that it jumps to the specified jumpDest.
+func (w *writer) patchJump(offset, jumpDest int) {
+ w.emitted[offset+1] = jumpDest
+}
+
+// Returns an index in the set table for a charset
+// uses a map to eliminate duplicates.
+func (w *writer) setCode(set *CharSet) int {
+ if w.counting {
+ return 0
+ }
+
+ buf := &bytes.Buffer{}
+
+ set.mapHashFill(buf)
+ hash := buf.String()
+ i, ok := w.sethash[hash]
+ if !ok {
+ i = len(w.sethash)
+ w.sethash[hash] = i
+ w.settable = append(w.settable, set)
+ }
+ return i
+}
+
+// Returns an index in the string table for a string.
+// uses a map to eliminate duplicates.
+func (w *writer) stringCode(str []rune) int {
+ if w.counting {
+ return 0
+ }
+
+ hash := string(str)
+ i, ok := w.stringhash[hash]
+ if !ok {
+ i = len(w.stringhash)
+ w.stringhash[hash] = i
+ w.stringtable = append(w.stringtable, str)
+ }
+
+ return i
+}
+
+// When generating code on a regex that uses a sparse set
+// of capture slots, we hash them to a dense set of indices
+// for an array of capture slots. Instead of doing the hash
+// at match time, it's done at compile time, here.
+func (w *writer) mapCapnum(capnum int) int {
+ if capnum == -1 {
+ return -1
+ }
+
+ if w.caps != nil {
+ return w.caps[capnum]
+ }
+
+ return capnum
+}
+
+// Emits a zero-argument operation. Note that the emit
+// functions all run in two modes: they can emit code, or
+// they can just count the size of the code.
+func (w *writer) emit(op InstOp) {
+ if w.counting {
+ w.count++
+ if opcodeBacktracks(op) {
+ w.trackcount++
+ }
+ return
+ }
+ w.emitted[w.curpos] = int(op)
+ w.curpos++
+}
+
+// Emits a one-argument operation.
+func (w *writer) emit1(op InstOp, opd1 int) {
+ if w.counting {
+ w.count += 2
+ if opcodeBacktracks(op) {
+ w.trackcount++
+ }
+ return
+ }
+ w.emitted[w.curpos] = int(op)
+ w.curpos++
+ w.emitted[w.curpos] = opd1
+ w.curpos++
+}
+
+// Emits a two-argument operation.
+func (w *writer) emit2(op InstOp, opd1, opd2 int) {
+ if w.counting {
+ w.count += 3
+ if opcodeBacktracks(op) {
+ w.trackcount++
+ }
+ return
+ }
+ w.emitted[w.curpos] = int(op)
+ w.curpos++
+ w.emitted[w.curpos] = opd1
+ w.curpos++
+ w.emitted[w.curpos] = opd2
+ w.curpos++
+}