1/*
Package regexp2 is a regexp package that has an interface similar to Go's standard library regexp engine but uses a
more full-featured regex engine behind the scenes.
  4
  5It doesn't have constant time guarantees, but it allows backtracking and is compatible with Perl5 and .NET.
  6You'll likely be better off with the RE2 engine from the regexp package and should only use this if you
  7need to write very complex patterns or require compatibility with .NET.
  8*/
  9package regexp2
 10
 11import (
 12	"errors"
 13	"math"
 14	"strconv"
 15	"sync"
 16	"time"
 17
 18	"github.com/dlclark/regexp2/syntax"
 19)
 20
// DefaultMatchTimeout is the timeout Compile assigns to Regexp.MatchTimeout.
// Its initial value is the largest representable time.Duration -- "forever".
var DefaultMatchTimeout = time.Duration(math.MaxInt64)
 23
// Regexp is the representation of a compiled regular expression.
// A Regexp is safe for concurrent use by multiple goroutines.
type Regexp struct {
	// MatchTimeout is the timeout used when trying to find matches.
	// Compile initializes it to DefaultMatchTimeout.
	MatchTimeout time.Duration

	// read-only after Compile
	pattern string       // as passed to Compile
	options RegexOptions // options

	caps     map[int]int    // capture number -> index
	capnames map[string]int // capture group name -> index
	capslist []string       // sorted list of capture group names
	capsize  int            // size of the capture array

	code *syntax.Code // compiled program

	// cache of machines for running regexp
	muRun  sync.Mutex
	runner []*runner
}
 45
 46// Compile parses a regular expression and returns, if successful,
 47// a Regexp object that can be used to match against text.
 48func Compile(expr string, opt RegexOptions) (*Regexp, error) {
 49	// parse it
 50	tree, err := syntax.Parse(expr, syntax.RegexOptions(opt))
 51	if err != nil {
 52		return nil, err
 53	}
 54
 55	// translate it to code
 56	code, err := syntax.Write(tree)
 57	if err != nil {
 58		return nil, err
 59	}
 60
 61	// return it
 62	return &Regexp{
 63		pattern:      expr,
 64		options:      opt,
 65		caps:         code.Caps,
 66		capnames:     tree.Capnames,
 67		capslist:     tree.Caplist,
 68		capsize:      code.Capsize,
 69		code:         code,
 70		MatchTimeout: DefaultMatchTimeout,
 71	}, nil
 72}
 73
 74// MustCompile is like Compile but panics if the expression cannot be parsed.
 75// It simplifies safe initialization of global variables holding compiled regular
 76// expressions.
 77func MustCompile(str string, opt RegexOptions) *Regexp {
 78	regexp, error := Compile(str, opt)
 79	if error != nil {
 80		panic(`regexp2: Compile(` + quote(str) + `): ` + error.Error())
 81	}
 82	return regexp
 83}
 84
// Escape adds backslashes to any special characters in the input string.
// It is a thin wrapper around syntax.Escape.
func Escape(input string) string {
	return syntax.Escape(input)
}
 89
// Unescape removes any backslashes from previously-escaped special characters in the input string.
// It is a thin wrapper around syntax.Unescape; both the result and the error
// are returned unchanged from that call.
func Unescape(input string) (string, error) {
	return syntax.Unescape(input)
}
 94
// String returns the source text used to compile the regular expression,
// i.e. the pattern exactly as it was passed to Compile.
func (re *Regexp) String() string {
	return re.pattern
}
 99
// quote renders s as a Go string literal for use in messages. A raw
// (backquoted) literal is used whenever s can be represented that way
// unchanged; otherwise s is rendered as an interpreted, double-quoted literal.
func quote(s string) string {
	if !strconv.CanBackquote(s) {
		return strconv.Quote(s)
	}
	return "`" + s + "`"
}
106
// RegexOptions impact the runtime and parsing behavior
// for each specific regex.  They are settable in code as well
// as in the regex pattern itself.
//
// The defined option values are distinct bits, and methods such as
// RightToLeft and Debug test them with a bitwise AND.
type RegexOptions int32
111
// Option values for RegexOptions. Each non-zero value occupies its own bit.
//
// Only None is declared with the RegexOptions type; the remaining names are
// untyped integer constants, which convert implicitly wherever a RegexOptions
// is expected.
//
// NOTE(review): the quoted letter after most values is presumably the letter
// that selects the option inline in a pattern -- confirm against the syntax
// package.
const (
	None                    RegexOptions = 0x0
	IgnoreCase                           = 0x0001 // "i"
	Multiline                            = 0x0002 // "m"
	ExplicitCapture                      = 0x0004 // "n"
	Compiled                             = 0x0008 // "c"
	Singleline                           = 0x0010 // "s"
	IgnorePatternWhitespace              = 0x0020 // "x"
	RightToLeft                          = 0x0040 // "r"
	Debug                                = 0x0080 // "d"
	ECMAScript                           = 0x0100 // "e"
	RE2                                  = 0x0200 // RE2 (regexp package) compatibility mode
	Unicode                              = 0x0400 // "u"
)
126
// RightToLeft reports whether the RightToLeft option bit is set in the
// options this Regexp was compiled with.
func (re *Regexp) RightToLeft() bool {
	return re.options&RightToLeft != 0
}
130
// Debug reports whether the Debug option bit is set in the options this
// Regexp was compiled with.
func (re *Regexp) Debug() bool {
	return re.options&Debug != 0
}
134
135// Replace searches the input string and replaces each match found with the replacement text.
136// Count will limit the number of matches attempted and startAt will allow
137// us to skip past possible matches at the start of the input (left or right depending on RightToLeft option).
138// Set startAt and count to -1 to go through the whole string
139func (re *Regexp) Replace(input, replacement string, startAt, count int) (string, error) {
140	data, err := syntax.NewReplacerData(replacement, re.caps, re.capsize, re.capnames, syntax.RegexOptions(re.options))
141	if err != nil {
142		return "", err
143	}
144	//TODO: cache ReplacerData
145
146	return replace(re, data, nil, input, startAt, count)
147}
148
// ReplaceFunc searches the input string and replaces each match found using the string from the evaluator.
// Count will limit the number of matches attempted and startAt will allow
// us to skip past possible matches at the start of the input (left or right depending on RightToLeft option).
// Set startAt and count to -1 to go through the whole string.
func (re *Regexp) ReplaceFunc(input string, evaluator MatchEvaluator, startAt, count int) (string, error) {
	// No replacer data is supplied (nil); the evaluator is passed instead.
	return replace(re, nil, evaluator, input, startAt, count)
}
156
// FindStringMatch searches the input string for a Regexp match.
// The string is converted to a rune slice before the search is run.
func (re *Regexp) FindStringMatch(s string) (*Match, error) {
	// convert string to runes
	// NOTE(review): -1 is passed as the start index; presumably run picks the
	// default start for the match direction -- confirm in run.
	return re.run(false, -1, getRunes(s))
}
162
// FindRunesMatch searches the input rune slice for a Regexp match.
// It is the rune-slice counterpart of FindStringMatch and passes the same
// start index (-1) to the matcher.
func (re *Regexp) FindRunesMatch(r []rune) (*Match, error) {
	return re.run(false, -1, r)
}
167
168// FindStringMatchStartingAt searches the input string for a Regexp match starting at the startAt index
169func (re *Regexp) FindStringMatchStartingAt(s string, startAt int) (*Match, error) {
170	if startAt > len(s) {
171		return nil, errors.New("startAt must be less than the length of the input string")
172	}
173	r, startAt := re.getRunesAndStart(s, startAt)
174	if startAt == -1 {
175		// we didn't find our start index in the string -- that's a problem
176		return nil, errors.New("startAt must align to the start of a valid rune in the input string")
177	}
178
179	return re.run(false, startAt, r)
180}
181
// FindRunesMatchStartingAt searches the input rune slice for a Regexp match starting at the startAt index.
// startAt is an index into r; it is handed to the matcher as-is, with no
// validation performed in this method.
func (re *Regexp) FindRunesMatchStartingAt(r []rune, startAt int) (*Match, error) {
	return re.run(false, startAt, r)
}
186
187// FindNextMatch returns the next match in the same input string as the match parameter.
188// Will return nil if there is no next match or if given a nil match.
189func (re *Regexp) FindNextMatch(m *Match) (*Match, error) {
190	if m == nil {
191		return nil, nil
192	}
193
194	// If previous match was empty, advance by one before matching to prevent
195	// infinite loop
196	startAt := m.textpos
197	if m.Length == 0 {
198		if m.textpos == len(m.text) {
199			return nil, nil
200		}
201
202		if re.RightToLeft() {
203			startAt--
204		} else {
205			startAt++
206		}
207	}
208	return re.run(false, startAt, m.text)
209}
210
211// MatchString return true if the string matches the regex
212// error will be set if a timeout occurs
213func (re *Regexp) MatchString(s string) (bool, error) {
214	m, err := re.run(true, -1, getRunes(s))
215	if err != nil {
216		return false, err
217	}
218	return m != nil, nil
219}
220
221func (re *Regexp) getRunesAndStart(s string, startAt int) ([]rune, int) {
222	if startAt < 0 {
223		if re.RightToLeft() {
224			r := getRunes(s)
225			return r, len(r)
226		}
227		return getRunes(s), 0
228	}
229	ret := make([]rune, len(s))
230	i := 0
231	runeIdx := -1
232	for strIdx, r := range s {
233		if strIdx == startAt {
234			runeIdx = i
235		}
236		ret[i] = r
237		i++
238	}
239	if startAt == len(s) {
240		runeIdx = i
241	}
242	return ret[:i], runeIdx
243}
244
// getRunes converts s to a newly allocated slice of its runes.
func getRunes(s string) []rune {
	return []rune(s)
}
248
249// MatchRunes return true if the runes matches the regex
250// error will be set if a timeout occurs
251func (re *Regexp) MatchRunes(r []rune) (bool, error) {
252	m, err := re.run(true, -1, r)
253	if err != nil {
254		return false, err
255	}
256	return m != nil, nil
257}
258
259// GetGroupNames Returns the set of strings used to name capturing groups in the expression.
260func (re *Regexp) GetGroupNames() []string {
261	var result []string
262
263	if re.capslist == nil {
264		result = make([]string, re.capsize)
265
266		for i := 0; i < len(result); i++ {
267			result[i] = strconv.Itoa(i)
268		}
269	} else {
270		result = make([]string, len(re.capslist))
271		copy(result, re.capslist)
272	}
273
274	return result
275}
276
277// GetGroupNumbers returns the integer group numbers corresponding to a group name.
278func (re *Regexp) GetGroupNumbers() []int {
279	var result []int
280
281	if re.caps == nil {
282		result = make([]int, re.capsize)
283
284		for i := 0; i < len(result); i++ {
285			result[i] = i
286		}
287	} else {
288		result = make([]int, len(re.caps))
289
290		for k, v := range re.caps {
291			result[v] = k
292		}
293	}
294
295	return result
296}
297
298// GroupNameFromNumber retrieves a group name that corresponds to a group number.
299// It will return "" for and unknown group number.  Unnamed groups automatically
300// receive a name that is the decimal string equivalent of its number.
301func (re *Regexp) GroupNameFromNumber(i int) string {
302	if re.capslist == nil {
303		if i >= 0 && i < re.capsize {
304			return strconv.Itoa(i)
305		}
306
307		return ""
308	}
309
310	if re.caps != nil {
311		var ok bool
312		if i, ok = re.caps[i]; !ok {
313			return ""
314		}
315	}
316
317	if i >= 0 && i < len(re.capslist) {
318		return re.capslist[i]
319	}
320
321	return ""
322}
323
324// GroupNumberFromName returns a group number that corresponds to a group name.
325// Returns -1 if the name is not a recognized group name.  Numbered groups
326// automatically get a group name that is the decimal string equivalent of its number.
327func (re *Regexp) GroupNumberFromName(name string) int {
328	// look up name if we have a hashtable of names
329	if re.capnames != nil {
330		if k, ok := re.capnames[name]; ok {
331			return k
332		}
333
334		return -1
335	}
336
337	// convert to an int if it looks like a number
338	result := 0
339	for i := 0; i < len(name); i++ {
340		ch := name[i]
341
342		if ch > '9' || ch < '0' {
343			return -1
344		}
345
346		result *= 10
347		result += int(ch - '0')
348	}
349
350	// return int if it's in range
351	if result >= 0 && result < re.capsize {
352		return result
353	}
354
355	return -1
356}