1package chroma
2
3import (
4 "encoding/json"
5 "fmt"
6 "os"
7 "path/filepath"
8 "regexp"
9 "sort"
10 "strings"
11 "sync"
12 "time"
13 "unicode/utf8"
14
15 "github.com/dlclark/regexp2"
16)
17
// A Rule is the fundamental matching unit of the Regex lexer state machine.
//
// When Pattern matches at the current position, Type (if non-nil) emits
// tokens for the match and Mutator (if non-nil) updates the lexer state.
type Rule struct {
	Pattern string  // Regular expression source; compiled lazily with regexp2 (see maybeCompile).
	Type    Emitter // Emits tokens for matched text via Emit(groups, state).
	Mutator Mutator // Optional state-machine mutation applied after a match.
}
24
// Words creates a regex that matches any of the given literal words.
//
// Words are sorted longest-first so that a longer word is preferred over a
// shorter prefix of it (e.g. "elseif" matches before "else"), and each word
// is regex-escaped with regexp.QuoteMeta. The caller's slice is not
// modified; a copy is sorted and escaped instead.
func Words(prefix, suffix string, words ...string) string {
	// Copy so we don't reorder or rewrite the caller's backing array.
	sorted := append([]string(nil), words...)
	sort.Slice(sorted, func(i, j int) bool {
		return len(sorted[j]) < len(sorted[i])
	})
	for i, word := range sorted {
		sorted[i] = regexp.QuoteMeta(word)
	}
	return prefix + `(` + strings.Join(sorted, `|`) + `)` + suffix
}
35
36// Tokenise text using lexer, returning tokens as a slice.
37func Tokenise(lexer Lexer, options *TokeniseOptions, text string) ([]Token, error) {
38 var out []Token
39 it, err := lexer.Tokenise(options, text)
40 if err != nil {
41 return nil, err
42 }
43 for t := it(); t != EOF; t = it() {
44 out = append(out, t)
45 }
46 return out, nil
47}
48
// Rules maps from state to a sequence of Rules.
//
// Every rule set must contain a "root" state (enforced by fetchRules).
type Rules map[string][]Rule
51
52// Rename clones rules then a rule.
53func (r Rules) Rename(oldRule, newRule string) Rules {
54 r = r.Clone()
55 r[newRule] = r[oldRule]
56 delete(r, oldRule)
57 return r
58}
59
60// Clone returns a clone of the Rules.
61func (r Rules) Clone() Rules {
62 out := map[string][]Rule{}
63 for key, rules := range r {
64 out[key] = make([]Rule, len(rules))
65 copy(out[key], rules)
66 }
67 return out
68}
69
70// Merge creates a clone of "r" then merges "rules" into the clone.
71func (r Rules) Merge(rules Rules) Rules {
72 out := r.Clone()
73 for k, v := range rules.Clone() {
74 out[k] = v
75 }
76 return out
77}
78
79// MustNewLexer creates a new Lexer with deferred rules generation or panics.
80func MustNewLexer(config *Config, rules func() Rules) *RegexLexer {
81 lexer, err := NewLexer(config, rules)
82 if err != nil {
83 panic(err)
84 }
85 return lexer
86}
87
88// NewLexer creates a new regex-based Lexer.
89//
90// "rules" is a state machine transition map. Each key is a state. Values are sets of rules
91// that match input, optionally modify lexer state, and output tokens.
92func NewLexer(config *Config, rulesFunc func() Rules) (*RegexLexer, error) {
93 if config == nil {
94 config = &Config{}
95 }
96 for _, glob := range append(config.Filenames, config.AliasFilenames...) {
97 _, err := filepath.Match(glob, "")
98 if err != nil {
99 return nil, fmt.Errorf("%s: %q is not a valid glob: %w", config.Name, glob, err)
100 }
101 }
102 r := &RegexLexer{
103 config: config,
104 fetchRulesFunc: func() (Rules, error) { return rulesFunc(), nil },
105 }
106 // One-off code to generate XML lexers in the Chroma source tree.
107 // var nameCleanRe = regexp.MustCompile(`[^-+A-Za-z0-9_]`)
108 // name := strings.ToLower(nameCleanRe.ReplaceAllString(config.Name, "_"))
109 // data, err := Marshal(r)
110 // if err != nil {
111 // if errors.Is(err, ErrNotSerialisable) {
112 // fmt.Fprintf(os.Stderr, "warning: %q: %s\n", name, err)
113 // return r, nil
114 // }
115 // return nil, err
116 // }
117 // _, file, _, ok := runtime.Caller(2)
118 // if !ok {
119 // panic("??")
120 // }
121 // fmt.Println(file)
122 // if strings.Contains(file, "/lexers/") {
123 // dir := filepath.Join(filepath.Dir(file), "embedded")
124 // err = os.MkdirAll(dir, 0700)
125 // if err != nil {
126 // return nil, err
127 // }
128 // filename := filepath.Join(dir, name) + ".xml"
129 // fmt.Println(filename)
130 // err = ioutil.WriteFile(filename, data, 0600)
131 // if err != nil {
132 // return nil, err
133 // }
134 // }
135 return r, nil
136}
137
// Trace enables debug tracing.
//
// It returns the receiver to allow chaining.
//
// Deprecated: Use SetTracing instead.
func (r *RegexLexer) Trace(trace bool) *RegexLexer {
	r.trace = trace
	return r
}
145
// SetTracing enables debug tracing.
//
// When enabled, Iterator encodes a JSON Trace record to stderr for each rule
// match attempt.
//
// This complies with the [TracingLexer] interface.
func (r *RegexLexer) SetTracing(trace bool) {
	r.trace = trace
}
152
// A CompiledRule is a Rule with a pre-compiled regex.
//
// Note that regular expressions are lazily compiled on first use of the lexer.
type CompiledRule struct {
	Rule
	Regexp *regexp2.Regexp // Compiled form of Rule.Pattern; nil until maybeCompile runs.
	flags string // Inline regexp2 flags ("m"/"i"/"s") derived from the lexer Config in fetchRules.
}
161
// CompiledRules is a map of state name to the sequence of compiled rules in that state.
type CompiledRules map[string][]*CompiledRule
164
// LexerState contains the state for a single lex.
type LexerState struct {
	Lexer *RegexLexer // Lexer this state belongs to.
	Registry *LexerRegistry // Registry used to look up other lexers, if any.
	Text []rune // Full input text being lexed.
	Pos int // Current rune offset into Text.
	Rules CompiledRules // Compiled rules, keyed by state name.
	Stack []string // State stack; the top element is the active state.
	State string // Name of the state currently being matched.
	Rule int // Index of the most recently matched rule within State.
	// Group matches.
	Groups []string
	// Named Group matches.
	NamedGroups map[string]string
	// Custom context for mutators.
	MutatorContext map[interface{}]interface{}
	iteratorStack []Iterator // Pending emitter iterators, drained LIFO by Iterator.
	options *TokeniseOptions
	newlineAdded bool // True if Tokenise appended a synthetic trailing newline (EnsureNL).
}
185
// Set stores a value in the mutator context under key.
func (l *LexerState) Set(key interface{}, value interface{}) {
	l.MutatorContext[key] = value
}
190
// Get returns the value stored in the mutator context under key, or nil if
// the key is absent.
func (l *LexerState) Get(key interface{}) interface{} {
	return l.MutatorContext[key]
}
195
196// Iterator returns the next Token from the lexer.
197func (l *LexerState) Iterator() Token { // nolint: gocognit
198 trace := json.NewEncoder(os.Stderr)
199 end := len(l.Text)
200 if l.newlineAdded {
201 end--
202 }
203 for l.Pos < end && len(l.Stack) > 0 {
204 // Exhaust the iterator stack, if any.
205 for len(l.iteratorStack) > 0 {
206 n := len(l.iteratorStack) - 1
207 t := l.iteratorStack[n]()
208 if t.Type == Ignore {
209 continue
210 }
211 if t == EOF {
212 l.iteratorStack = l.iteratorStack[:n]
213 continue
214 }
215 return t
216 }
217
218 l.State = l.Stack[len(l.Stack)-1]
219 selectedRule, ok := l.Rules[l.State]
220 if !ok {
221 panic("unknown state " + l.State)
222 }
223 var start time.Time
224 if l.Lexer.trace {
225 start = time.Now()
226 }
227 ruleIndex, rule, groups, namedGroups := matchRules(l.Text, l.Pos, selectedRule)
228 if l.Lexer.trace {
229 var length int
230 if groups != nil {
231 length = len(groups[0])
232 } else {
233 length = -1
234 }
235 _ = trace.Encode(Trace{ //nolint
236 Lexer: l.Lexer.config.Name,
237 State: l.State,
238 Rule: ruleIndex,
239 Pattern: rule.Pattern,
240 Pos: l.Pos,
241 Length: length,
242 Elapsed: float64(time.Since(start)) / float64(time.Millisecond),
243 })
244 // fmt.Fprintf(os.Stderr, "%s: pos=%d, text=%q, elapsed=%s\n", l.State, l.Pos, string(l.Text[l.Pos:]), time.Since(start))
245 }
246 // No match.
247 if groups == nil {
248 // From Pygments :\
249 //
250 // If the RegexLexer encounters a newline that is flagged as an error token, the stack is
251 // emptied and the lexer continues scanning in the 'root' state. This can help producing
252 // error-tolerant highlighting for erroneous input, e.g. when a single-line string is not
253 // closed.
254 if l.Text[l.Pos] == '\n' && l.State != l.options.State {
255 l.Stack = []string{l.options.State}
256 continue
257 }
258 l.Pos++
259 return Token{Error, string(l.Text[l.Pos-1 : l.Pos])}
260 }
261 l.Rule = ruleIndex
262 l.Groups = groups
263 l.NamedGroups = namedGroups
264 l.Pos += utf8.RuneCountInString(groups[0])
265 if rule.Mutator != nil {
266 if err := rule.Mutator.Mutate(l); err != nil {
267 panic(err)
268 }
269 }
270 if rule.Type != nil {
271 l.iteratorStack = append(l.iteratorStack, rule.Type.Emit(l.Groups, l))
272 }
273 }
274 // Exhaust the IteratorStack, if any.
275 // Duplicate code, but eh.
276 for len(l.iteratorStack) > 0 {
277 n := len(l.iteratorStack) - 1
278 t := l.iteratorStack[n]()
279 if t.Type == Ignore {
280 continue
281 }
282 if t == EOF {
283 l.iteratorStack = l.iteratorStack[:n]
284 continue
285 }
286 return t
287 }
288
289 // If we get to here and we still have text, return it as an error.
290 if l.Pos != len(l.Text) && len(l.Stack) == 0 {
291 value := string(l.Text[l.Pos:])
292 l.Pos = len(l.Text)
293 return Token{Type: Error, Value: value}
294 }
295 return EOF
296}
297
// RegexLexer is the default lexer implementation used in Chroma.
type RegexLexer struct {
	registry *LexerRegistry // The LexerRegistry this Lexer is associated with, if any.
	config *Config
	analyser func(text string) float32 // Optional scoring function used by AnalyseText.
	trace bool // When true, Iterator writes JSON trace records to stderr.

	mu sync.Mutex // Guards lazy compilation in maybeCompile.
	compiled bool // True once maybeCompile has completed successfully.
	rawRules Rules // Rules as fetched, before compilation.
	rules map[string][]*CompiledRule // Per-state compiled rules built by fetchRules/maybeCompile.
	fetchRulesFunc func() (Rules, error) // Deferred rules producer; invoked at most once via compileOnce.
	compileOnce sync.Once
	compileError error // Sticky error from fetchRules, returned on every subsequent use.
}
313
// String returns the lexer's configured name.
func (r *RegexLexer) String() string {
	return r.config.Name
}
317
// Rules in the Lexer.
//
// Triggers lazy rule fetching/compilation on first use; returns any error
// from that process.
func (r *RegexLexer) Rules() (Rules, error) {
	if err := r.needRules(); err != nil {
		return nil, err
	}
	return r.rawRules, nil
}
325
// SetRegistry the lexer will use to lookup other lexers if necessary.
//
// Returns the receiver (as a Lexer) to allow chaining.
func (r *RegexLexer) SetRegistry(registry *LexerRegistry) Lexer {
	r.registry = registry
	return r
}
331
// SetAnalyser sets the analyser function used to perform content inspection.
//
// Returns the receiver (as a Lexer) to allow chaining.
func (r *RegexLexer) SetAnalyser(analyser func(text string) float32) Lexer {
	r.analyser = analyser
	return r
}
337
338// AnalyseText scores how likely a fragment of text is to match this lexer, between 0.0 and 1.0.
339func (r *RegexLexer) AnalyseText(text string) float32 {
340 if r.analyser != nil {
341 return r.analyser(text)
342 }
343 return 0
344}
345
// SetConfig replaces the Config for this Lexer.
//
// Returns the receiver to allow chaining.
func (r *RegexLexer) SetConfig(config *Config) *RegexLexer {
	r.config = config
	return r
}
351
// Config returns the Config for this Lexer.
func (r *RegexLexer) Config() *Config {
	return r.config
}
356
// Regex compilation is deferred until the lexer is used. This is to avoid significant init() time costs.
//
// maybeCompile is idempotent and guarded by r.mu: once r.compiled is set,
// subsequent calls are no-ops.
func (r *RegexLexer) maybeCompile() (err error) {
	r.mu.Lock()
	defer r.mu.Unlock()
	if r.compiled {
		return nil
	}
	// Pass 1: compile each rule's pattern with regexp2, prefixed with the
	// per-lexer flags computed in fetchRules and anchored with \G so matches
	// start exactly at the current position.
	for state, rules := range r.rules {
		for i, rule := range rules {
			if rule.Regexp == nil {
				pattern := "(?:" + rule.Pattern + ")"
				if rule.flags != "" {
					pattern = "(?" + rule.flags + ")" + pattern
				}
				pattern = `\G` + pattern
				rule.Regexp, err = regexp2.Compile(pattern, 0)
				if err != nil {
					return fmt.Errorf("failed to compile rule %s.%d: %s", state, i, err)
				}
				// Bound pathological regex backtracking on hostile input.
				rule.Regexp.MatchTimeout = time.Millisecond * 250
			}
		}
	}
	// Pass 2: apply any LexerMutators (rule-rewriting mutators), restarting
	// the scan after each mutation since rules may have been added/removed.
	// NOTE(review): "seen" is re-initialised on every restart and each
	// mutation restarts immediately, so the duplicate-mutator guard below
	// only fires within a single scan pass — confirm that is the intent.
restart:
	seen := map[LexerMutator]bool{}
	for state := range r.rules {
		for i := range len(r.rules[state]) {
			rule := r.rules[state][i]
			if compile, ok := rule.Mutator.(LexerMutator); ok {
				if seen[compile] {
					return fmt.Errorf("saw mutator %T twice; this should not happen", compile)
				}
				seen[compile] = true
				if err := compile.MutateLexer(r.rules, state, i); err != nil {
					return err
				}
				// Process the rules again in case the mutator added/removed rules.
				//
				// This sounds bad, but shouldn't be significant in practice.
				goto restart
			}
		}
	}
	// Validate emitters
	for state := range r.rules {
		for i := range len(r.rules[state]) {
			rule := r.rules[state][i]
			if validate, ok := rule.Type.(ValidatingEmitter); ok {
				if err := validate.ValidateEmitter(rule); err != nil {
					return fmt.Errorf("%s: %s: %s: %w", r.config.Name, state, rule.Pattern, err)
				}
			}
		}
	}
	r.compiled = true
	return nil
}
414
415func (r *RegexLexer) fetchRules() error {
416 rules, err := r.fetchRulesFunc()
417 if err != nil {
418 return fmt.Errorf("%s: failed to compile rules: %w", r.config.Name, err)
419 }
420 if _, ok := rules["root"]; !ok {
421 return fmt.Errorf("no \"root\" state")
422 }
423 compiledRules := map[string][]*CompiledRule{}
424 for state, rules := range rules {
425 compiledRules[state] = nil
426 for _, rule := range rules {
427 flags := ""
428 if !r.config.NotMultiline {
429 flags += "m"
430 }
431 if r.config.CaseInsensitive {
432 flags += "i"
433 }
434 if r.config.DotAll {
435 flags += "s"
436 }
437 compiledRules[state] = append(compiledRules[state], &CompiledRule{Rule: rule, flags: flags})
438 }
439 }
440
441 r.rawRules = rules
442 r.rules = compiledRules
443 return nil
444}
445
446func (r *RegexLexer) needRules() error {
447 var err error
448 if r.fetchRulesFunc != nil {
449 r.compileOnce.Do(func() {
450 r.compileError = r.fetchRules()
451 })
452 if r.compileError != nil {
453 return r.compileError
454 }
455 }
456 if err := r.maybeCompile(); err != nil {
457 return err
458 }
459 return err
460}
461
462// Tokenise text using lexer, returning an iterator.
463func (r *RegexLexer) Tokenise(options *TokeniseOptions, text string) (Iterator, error) {
464 err := r.needRules()
465 if err != nil {
466 return nil, err
467 }
468 if options == nil {
469 options = defaultOptions
470 }
471 if options.EnsureLF {
472 text = ensureLF(text)
473 }
474 newlineAdded := false
475 if !options.Nested && r.config.EnsureNL && !strings.HasSuffix(text, "\n") {
476 text += "\n"
477 newlineAdded = true
478 }
479 state := &LexerState{
480 Registry: r.registry,
481 newlineAdded: newlineAdded,
482 options: options,
483 Lexer: r,
484 Text: []rune(text),
485 Stack: []string{options.State},
486 Rules: r.rules,
487 MutatorContext: map[interface{}]interface{}{},
488 }
489 return state.Iterator, nil
490}
491
492// MustRules is like Rules() but will panic on error.
493func (r *RegexLexer) MustRules() Rules {
494 rules, err := r.Rules()
495 if err != nil {
496 panic(err)
497 }
498 return rules
499}
500
501func matchRules(text []rune, pos int, rules []*CompiledRule) (int, *CompiledRule, []string, map[string]string) {
502 for i, rule := range rules {
503 match, err := rule.Regexp.FindRunesMatchStartingAt(text, pos)
504 if match != nil && err == nil && match.Index == pos {
505 groups := []string{}
506 namedGroups := make(map[string]string)
507 for _, g := range match.Groups() {
508 namedGroups[g.Name] = g.String()
509 groups = append(groups, g.String())
510 }
511 return i, rule, groups, namedGroups
512 }
513 }
514 return 0, &CompiledRule{}, nil, nil
515}
516
// ensureLF normalises line endings: \r\n and bare \r both become \n.
// Equivalent to chained strings.ReplaceAll calls but in a single pass.
func ensureLF(text string) string {
	normalised := make([]byte, 0, len(text))
	for i := 0; i < len(text); i++ {
		ch := text[i]
		if ch == '\r' {
			// Part of a \r\n pair: drop the \r; the \n follows naturally.
			if i+1 < len(text) && text[i+1] == '\n' {
				continue
			}
			// Bare \r: convert to \n.
			ch = '\n'
		}
		normalised = append(normalised, ch)
	}
	return string(normalised)
}