1/*
Package regexp2 is a regexp package that has an interface similar to Go's standard library regexp package but uses a
more full-featured regex engine behind the scenes.
4
5It doesn't have constant time guarantees, but it allows backtracking and is compatible with Perl5 and .NET.
6You'll likely be better off with the RE2 engine from the regexp package and should only use this if you
7need to write very complex patterns or require compatibility with .NET.
8*/
9package regexp2
10
11import (
12 "errors"
13 "math"
14 "strconv"
15 "sync"
16 "time"
17
18 "github.com/dlclark/regexp2/syntax"
19)
20
// DefaultMatchTimeout is the timeout used when running regexp matches.
// Its value is the largest representable time.Duration, i.e. "forever".
var DefaultMatchTimeout time.Duration = math.MaxInt64
23
// Regexp is the representation of a compiled regular expression.
// A Regexp is safe for concurrent use by multiple goroutines.
type Regexp struct {
	// MatchTimeout is the timeout when trying to find matches.
	// Compile initializes it to DefaultMatchTimeout.
	MatchTimeout time.Duration

	// read-only after Compile
	pattern string       // as passed to Compile
	options RegexOptions // options as passed to Compile

	caps     map[int]int    // capnum->index
	capnames map[string]int // capture group name -> index
	capslist []string       // sorted list of capture group names
	capsize  int            // size of the capture array

	code *syntax.Code // compiled program

	// cache of machines for running regexp
	// NOTE(review): muRun presumably guards runner; the code that uses
	// these two fields is outside this view -- confirm.
	muRun  sync.Mutex
	runner []*runner
}
45
46// Compile parses a regular expression and returns, if successful,
47// a Regexp object that can be used to match against text.
48func Compile(expr string, opt RegexOptions) (*Regexp, error) {
49 // parse it
50 tree, err := syntax.Parse(expr, syntax.RegexOptions(opt))
51 if err != nil {
52 return nil, err
53 }
54
55 // translate it to code
56 code, err := syntax.Write(tree)
57 if err != nil {
58 return nil, err
59 }
60
61 // return it
62 return &Regexp{
63 pattern: expr,
64 options: opt,
65 caps: code.Caps,
66 capnames: tree.Capnames,
67 capslist: tree.Caplist,
68 capsize: code.Capsize,
69 code: code,
70 MatchTimeout: DefaultMatchTimeout,
71 }, nil
72}
73
74// MustCompile is like Compile but panics if the expression cannot be parsed.
75// It simplifies safe initialization of global variables holding compiled regular
76// expressions.
77func MustCompile(str string, opt RegexOptions) *Regexp {
78 regexp, error := Compile(str, opt)
79 if error != nil {
80 panic(`regexp2: Compile(` + quote(str) + `): ` + error.Error())
81 }
82 return regexp
83}
84
// Escape adds backslashes to any special characters in the input string.
// The work is delegated to syntax.Escape, whose result is returned unchanged.
func Escape(input string) string {
	return syntax.Escape(input)
}
89
// Unescape removes any backslashes from previously-escaped special characters
// in the input string. The work is delegated to syntax.Unescape; its result
// and error are returned unchanged.
func Unescape(input string) (string, error) {
	return syntax.Unescape(input)
}
94
// String returns the source text used to compile the regular expression,
// i.e. the stored pattern field.
func (re *Regexp) String() string {
	return re.pattern
}
99
// quote renders s as a Go string literal for use in messages. A raw
// (backquoted) literal is used when s can be represented that way;
// otherwise s is rendered as an escaped, double-quoted literal.
func quote(s string) string {
	if !strconv.CanBackquote(s) {
		return strconv.Quote(s)
	}
	return "`" + s + "`"
}
106
// RegexOptions impact the runtime and parsing behavior
// for each specific regex. They are settable in code as well
// as in the regex pattern itself.
type RegexOptions int32

// None is the empty option set. It is the only option constant declared
// with an explicit RegexOptions type.
const None RegexOptions = 0x0

// Option flags. Each flag occupies a single bit so that flags can be combined
// with |. The flags are untyped constants, so they convert implicitly to
// RegexOptions wherever one is expected. The quoted letter in each comment is
// the option's short code.
const (
	// IgnoreCase is option "i" (0x0001).
	IgnoreCase = 1 << iota
	// Multiline is option "m" (0x0002).
	Multiline
	// ExplicitCapture is option "n" (0x0004).
	ExplicitCapture
	// Compiled is option "c" (0x0008).
	Compiled
	// Singleline is option "s" (0x0010).
	Singleline
	// IgnorePatternWhitespace is option "x" (0x0020).
	IgnorePatternWhitespace
	// RightToLeft is option "r" (0x0040).
	RightToLeft
	// Debug is option "d" (0x0080).
	Debug
	// ECMAScript is option "e" (0x0100).
	ECMAScript
	// RE2 is the RE2 (regexp package) compatibility mode (0x0200).
	RE2
	// Unicode is option "u" (0x0400).
	Unicode
)
126
127func (re *Regexp) RightToLeft() bool {
128 return re.options&RightToLeft != 0
129}
130
131func (re *Regexp) Debug() bool {
132 return re.options&Debug != 0
133}
134
135// Replace searches the input string and replaces each match found with the replacement text.
136// Count will limit the number of matches attempted and startAt will allow
137// us to skip past possible matches at the start of the input (left or right depending on RightToLeft option).
138// Set startAt and count to -1 to go through the whole string
139func (re *Regexp) Replace(input, replacement string, startAt, count int) (string, error) {
140 data, err := syntax.NewReplacerData(replacement, re.caps, re.capsize, re.capnames, syntax.RegexOptions(re.options))
141 if err != nil {
142 return "", err
143 }
144 //TODO: cache ReplacerData
145
146 return replace(re, data, nil, input, startAt, count)
147}
148
// ReplaceFunc searches the input string and replaces each match found using
// the string from the evaluator.
// count limits the number of matches attempted, and startAt allows skipping
// past possible matches at the start of the input (left or right depending on
// the RightToLeft option).
// Set startAt and count to -1 to go through the whole string.
func (re *Regexp) ReplaceFunc(input string, evaluator MatchEvaluator, startAt, count int) (string, error) {
	// No replacer data is supplied (nil); only the evaluator is passed on.
	return replace(re, nil, evaluator, input, startAt, count)
}
156
157// FindStringMatch searches the input string for a Regexp match
158func (re *Regexp) FindStringMatch(s string) (*Match, error) {
159 // convert string to runes
160 return re.run(false, -1, getRunes(s))
161}
162
// FindRunesMatch searches the input rune slice for a Regexp match.
// The slice is handed to run as-is with a start of -1, i.e. without an
// explicit start position.
func (re *Regexp) FindRunesMatch(r []rune) (*Match, error) {
	return re.run(false, -1, r)
}
167
168// FindStringMatchStartingAt searches the input string for a Regexp match starting at the startAt index
169func (re *Regexp) FindStringMatchStartingAt(s string, startAt int) (*Match, error) {
170 if startAt > len(s) {
171 return nil, errors.New("startAt must be less than the length of the input string")
172 }
173 r, startAt := re.getRunesAndStart(s, startAt)
174 if startAt == -1 {
175 // we didn't find our start index in the string -- that's a problem
176 return nil, errors.New("startAt must align to the start of a valid rune in the input string")
177 }
178
179 return re.run(false, startAt, r)
180}
181
// FindRunesMatchStartingAt searches the input rune slice for a Regexp match
// starting at the startAt index. startAt is passed to run unchanged; no
// validation of startAt is performed here.
func (re *Regexp) FindRunesMatchStartingAt(r []rune, startAt int) (*Match, error) {
	return re.run(false, startAt, r)
}
186
187// FindNextMatch returns the next match in the same input string as the match parameter.
188// Will return nil if there is no next match or if given a nil match.
189func (re *Regexp) FindNextMatch(m *Match) (*Match, error) {
190 if m == nil {
191 return nil, nil
192 }
193
194 // If previous match was empty, advance by one before matching to prevent
195 // infinite loop
196 startAt := m.textpos
197 if m.Length == 0 {
198 if m.textpos == len(m.text) {
199 return nil, nil
200 }
201
202 if re.RightToLeft() {
203 startAt--
204 } else {
205 startAt++
206 }
207 }
208 return re.run(false, startAt, m.text)
209}
210
211// MatchString return true if the string matches the regex
212// error will be set if a timeout occurs
213func (re *Regexp) MatchString(s string) (bool, error) {
214 m, err := re.run(true, -1, getRunes(s))
215 if err != nil {
216 return false, err
217 }
218 return m != nil, nil
219}
220
221func (re *Regexp) getRunesAndStart(s string, startAt int) ([]rune, int) {
222 if startAt < 0 {
223 if re.RightToLeft() {
224 r := getRunes(s)
225 return r, len(r)
226 }
227 return getRunes(s), 0
228 }
229 ret := make([]rune, len(s))
230 i := 0
231 runeIdx := -1
232 for strIdx, r := range s {
233 if strIdx == startAt {
234 runeIdx = i
235 }
236 ret[i] = r
237 i++
238 }
239 if startAt == len(s) {
240 runeIdx = i
241 }
242 return ret[:i], runeIdx
243}
244
// getRunes converts s to a rune slice using Go's built-in string-to-[]rune
// conversion.
func getRunes(s string) []rune {
	return []rune(s)
}
248
249// MatchRunes return true if the runes matches the regex
250// error will be set if a timeout occurs
251func (re *Regexp) MatchRunes(r []rune) (bool, error) {
252 m, err := re.run(true, -1, r)
253 if err != nil {
254 return false, err
255 }
256 return m != nil, nil
257}
258
259// GetGroupNames Returns the set of strings used to name capturing groups in the expression.
260func (re *Regexp) GetGroupNames() []string {
261 var result []string
262
263 if re.capslist == nil {
264 result = make([]string, re.capsize)
265
266 for i := 0; i < len(result); i++ {
267 result[i] = strconv.Itoa(i)
268 }
269 } else {
270 result = make([]string, len(re.capslist))
271 copy(result, re.capslist)
272 }
273
274 return result
275}
276
277// GetGroupNumbers returns the integer group numbers corresponding to a group name.
278func (re *Regexp) GetGroupNumbers() []int {
279 var result []int
280
281 if re.caps == nil {
282 result = make([]int, re.capsize)
283
284 for i := 0; i < len(result); i++ {
285 result[i] = i
286 }
287 } else {
288 result = make([]int, len(re.caps))
289
290 for k, v := range re.caps {
291 result[v] = k
292 }
293 }
294
295 return result
296}
297
298// GroupNameFromNumber retrieves a group name that corresponds to a group number.
299// It will return "" for and unknown group number. Unnamed groups automatically
300// receive a name that is the decimal string equivalent of its number.
301func (re *Regexp) GroupNameFromNumber(i int) string {
302 if re.capslist == nil {
303 if i >= 0 && i < re.capsize {
304 return strconv.Itoa(i)
305 }
306
307 return ""
308 }
309
310 if re.caps != nil {
311 var ok bool
312 if i, ok = re.caps[i]; !ok {
313 return ""
314 }
315 }
316
317 if i >= 0 && i < len(re.capslist) {
318 return re.capslist[i]
319 }
320
321 return ""
322}
323
324// GroupNumberFromName returns a group number that corresponds to a group name.
325// Returns -1 if the name is not a recognized group name. Numbered groups
326// automatically get a group name that is the decimal string equivalent of its number.
327func (re *Regexp) GroupNumberFromName(name string) int {
328 // look up name if we have a hashtable of names
329 if re.capnames != nil {
330 if k, ok := re.capnames[name]; ok {
331 return k
332 }
333
334 return -1
335 }
336
337 // convert to an int if it looks like a number
338 result := 0
339 for i := 0; i < len(name); i++ {
340 ch := name[i]
341
342 if ch > '9' || ch < '0' {
343 return -1
344 }
345
346 result *= 10
347 result += int(ch - '0')
348 }
349
350 // return int if it's in range
351 if result >= 0 && result < re.capsize {
352 return result
353 }
354
355 return -1
356}