1package chroma
  2
  3import (
  4	"encoding/json"
  5	"fmt"
  6	"os"
  7	"path/filepath"
  8	"regexp"
  9	"sort"
 10	"strings"
 11	"sync"
 12	"time"
 13	"unicode/utf8"
 14
 15	"github.com/dlclark/regexp2"
 16)
 17
// A Rule is the fundamental matching unit of the Regex lexer state machine.
type Rule struct {
	// Pattern is the regular expression source for this rule (compiled lazily).
	Pattern string
	// Type emits tokens for the matched groups when the rule fires.
	Type    Emitter
	// Mutator, if non-nil, is applied to the lexer state after a match.
	Mutator Mutator
}
 24
 25// Words creates a regex that matches any of the given literal words.
 26func Words(prefix, suffix string, words ...string) string {
 27	sort.Slice(words, func(i, j int) bool {
 28		return len(words[j]) < len(words[i])
 29	})
 30	for i, word := range words {
 31		words[i] = regexp.QuoteMeta(word)
 32	}
 33	return prefix + `(` + strings.Join(words, `|`) + `)` + suffix
 34}
 35
 36// Tokenise text using lexer, returning tokens as a slice.
 37func Tokenise(lexer Lexer, options *TokeniseOptions, text string) ([]Token, error) {
 38	var out []Token
 39	it, err := lexer.Tokenise(options, text)
 40	if err != nil {
 41		return nil, err
 42	}
 43	for t := it(); t != EOF; t = it() {
 44		out = append(out, t)
 45	}
 46	return out, nil
 47}
 48
// Rules maps from state name to the ordered sequence of Rules tried in that state.
type Rules map[string][]Rule
 51
 52// Rename clones rules then a rule.
 53func (r Rules) Rename(oldRule, newRule string) Rules {
 54	r = r.Clone()
 55	r[newRule] = r[oldRule]
 56	delete(r, oldRule)
 57	return r
 58}
 59
 60// Clone returns a clone of the Rules.
 61func (r Rules) Clone() Rules {
 62	out := map[string][]Rule{}
 63	for key, rules := range r {
 64		out[key] = make([]Rule, len(rules))
 65		copy(out[key], rules)
 66	}
 67	return out
 68}
 69
 70// Merge creates a clone of "r" then merges "rules" into the clone.
 71func (r Rules) Merge(rules Rules) Rules {
 72	out := r.Clone()
 73	for k, v := range rules.Clone() {
 74		out[k] = v
 75	}
 76	return out
 77}
 78
 79// MustNewLexer creates a new Lexer with deferred rules generation or panics.
 80func MustNewLexer(config *Config, rules func() Rules) *RegexLexer {
 81	lexer, err := NewLexer(config, rules)
 82	if err != nil {
 83		panic(err)
 84	}
 85	return lexer
 86}
 87
// NewLexer creates a new regex-based Lexer.
//
// "rules" is a state machine transition map. Each key is a state. Values are sets of rules
// that match input, optionally modify lexer state, and output tokens.
func NewLexer(config *Config, rulesFunc func() Rules) (*RegexLexer, error) {
	if config == nil {
		config = &Config{}
	}
	// Validate filename globs up front so a malformed pattern fails at
	// construction time rather than at match time.
	for _, glob := range append(config.Filenames, config.AliasFilenames...) {
		_, err := filepath.Match(glob, "")
		if err != nil {
			return nil, fmt.Errorf("%s: %q is not a valid glob: %w", config.Name, glob, err)
		}
	}
	// Rule generation is deferred: rulesFunc is only invoked when the lexer
	// is first used (see needRules / fetchRules).
	r := &RegexLexer{
		config:         config,
		fetchRulesFunc: func() (Rules, error) { return rulesFunc(), nil },
	}
	// One-off code to generate XML lexers in the Chroma source tree.
	// var nameCleanRe = regexp.MustCompile(`[^-+A-Za-z0-9_]`)
	// name := strings.ToLower(nameCleanRe.ReplaceAllString(config.Name, "_"))
	// data, err := Marshal(r)
	// if err != nil {
	// 	if errors.Is(err, ErrNotSerialisable) {
	// 		fmt.Fprintf(os.Stderr, "warning: %q: %s\n", name, err)
	// 		return r, nil
	// 	}
	// 	return nil, err
	// }
	// _, file, _, ok := runtime.Caller(2)
	// if !ok {
	// 	panic("??")
	// }
	// fmt.Println(file)
	// if strings.Contains(file, "/lexers/") {
	// 	dir := filepath.Join(filepath.Dir(file), "embedded")
	// 	err = os.MkdirAll(dir, 0700)
	// 	if err != nil {
	// 		return nil, err
	// 	}
	// 	filename := filepath.Join(dir, name) + ".xml"
	// 	fmt.Println(filename)
	// 	err = ioutil.WriteFile(filename, data, 0600)
	// 	if err != nil {
	// 		return nil, err
	// 	}
	// }
	return r, nil
}
137
138// Trace enables debug tracing.
139//
140// Deprecated: Use SetTracing instead.
141func (r *RegexLexer) Trace(trace bool) *RegexLexer {
142	r.trace = trace
143	return r
144}
145
// SetTracing enables debug tracing.
//
// This complies with the [TracingLexer] interface.
func (r *RegexLexer) SetTracing(trace bool) {
	r.trace = trace
}
152
// A CompiledRule is a Rule with a pre-compiled regex.
//
// Note that regular expressions are lazily compiled on first use of the lexer.
type CompiledRule struct {
	Rule
	// Regexp is the compiled form of Rule.Pattern; nil until compilation.
	Regexp *regexp2.Regexp
	// flags holds regexp2 inline flags ("m", "i", "s") derived from the lexer Config.
	flags  string
}
161
// CompiledRules is a map of state name to the sequence of compiled rules in that state.
type CompiledRules map[string][]*CompiledRule
164
// LexerState contains the state for a single lex.
type LexerState struct {
	Lexer    *RegexLexer
	Registry *LexerRegistry
	// Text is the full input being lexed, as runes.
	Text     []rune
	// Pos is the current rune offset into Text.
	Pos      int
	Rules    CompiledRules
	// Stack of state names; the top entry is the active state.
	Stack    []string
	// State is the name of the state currently being matched.
	State    string
	// Rule is the index of the last rule that matched.
	Rule     int
	// Group matches.
	Groups []string
	// Named Group matches.
	NamedGroups map[string]string
	// Custom context for mutators.
	MutatorContext map[interface{}]interface{}
	iteratorStack  []Iterator
	options        *TokeniseOptions
	// newlineAdded is set when Tokenise appended a synthetic trailing "\n";
	// Iterator excludes it from the scan range.
	newlineAdded   bool
}
185
// Set stores a value in the mutator context under key.
func (l *LexerState) Set(key interface{}, value interface{}) {
	l.MutatorContext[key] = value
}
190
// Get retrieves a value from the mutator context by key (nil if absent).
func (l *LexerState) Get(key interface{}) interface{} {
	return l.MutatorContext[key]
}
195
196// Iterator returns the next Token from the lexer.
197func (l *LexerState) Iterator() Token { // nolint: gocognit
198	trace := json.NewEncoder(os.Stderr)
199	end := len(l.Text)
200	if l.newlineAdded {
201		end--
202	}
203	for l.Pos < end && len(l.Stack) > 0 {
204		// Exhaust the iterator stack, if any.
205		for len(l.iteratorStack) > 0 {
206			n := len(l.iteratorStack) - 1
207			t := l.iteratorStack[n]()
208			if t.Type == Ignore {
209				continue
210			}
211			if t == EOF {
212				l.iteratorStack = l.iteratorStack[:n]
213				continue
214			}
215			return t
216		}
217
218		l.State = l.Stack[len(l.Stack)-1]
219		selectedRule, ok := l.Rules[l.State]
220		if !ok {
221			panic("unknown state " + l.State)
222		}
223		var start time.Time
224		if l.Lexer.trace {
225			start = time.Now()
226		}
227		ruleIndex, rule, groups, namedGroups := matchRules(l.Text, l.Pos, selectedRule)
228		if l.Lexer.trace {
229			var length int
230			if groups != nil {
231				length = len(groups[0])
232			} else {
233				length = -1
234			}
235			_ = trace.Encode(Trace{ //nolint
236				Lexer:   l.Lexer.config.Name,
237				State:   l.State,
238				Rule:    ruleIndex,
239				Pattern: rule.Pattern,
240				Pos:     l.Pos,
241				Length:  length,
242				Elapsed: float64(time.Since(start)) / float64(time.Millisecond),
243			})
244			// fmt.Fprintf(os.Stderr, "%s: pos=%d, text=%q, elapsed=%s\n", l.State, l.Pos, string(l.Text[l.Pos:]), time.Since(start))
245		}
246		// No match.
247		if groups == nil {
248			// From Pygments :\
249			//
250			// If the RegexLexer encounters a newline that is flagged as an error token, the stack is
251			// emptied and the lexer continues scanning in the 'root' state. This can help producing
252			// error-tolerant highlighting for erroneous input, e.g. when a single-line string is not
253			// closed.
254			if l.Text[l.Pos] == '\n' && l.State != l.options.State {
255				l.Stack = []string{l.options.State}
256				continue
257			}
258			l.Pos++
259			return Token{Error, string(l.Text[l.Pos-1 : l.Pos])}
260		}
261		l.Rule = ruleIndex
262		l.Groups = groups
263		l.NamedGroups = namedGroups
264		l.Pos += utf8.RuneCountInString(groups[0])
265		if rule.Mutator != nil {
266			if err := rule.Mutator.Mutate(l); err != nil {
267				panic(err)
268			}
269		}
270		if rule.Type != nil {
271			l.iteratorStack = append(l.iteratorStack, rule.Type.Emit(l.Groups, l))
272		}
273	}
274	// Exhaust the IteratorStack, if any.
275	// Duplicate code, but eh.
276	for len(l.iteratorStack) > 0 {
277		n := len(l.iteratorStack) - 1
278		t := l.iteratorStack[n]()
279		if t.Type == Ignore {
280			continue
281		}
282		if t == EOF {
283			l.iteratorStack = l.iteratorStack[:n]
284			continue
285		}
286		return t
287	}
288
289	// If we get to here and we still have text, return it as an error.
290	if l.Pos != len(l.Text) && len(l.Stack) == 0 {
291		value := string(l.Text[l.Pos:])
292		l.Pos = len(l.Text)
293		return Token{Type: Error, Value: value}
294	}
295	return EOF
296}
297
// RegexLexer is the default lexer implementation used in Chroma.
type RegexLexer struct {
	registry *LexerRegistry // The LexerRegistry this Lexer is associated with, if any.
	config   *Config
	analyser func(text string) float32
	trace    bool

	mu             sync.Mutex
	compiled       bool  // set once maybeCompile has succeeded
	rawRules       Rules // rules as supplied, before regex compilation
	rules          map[string][]*CompiledRule
	fetchRulesFunc func() (Rules, error) // lazily produces rawRules on first use
	compileOnce    sync.Once
	compileError   error // sticky result of the one-time fetchRules call
}
313
// String returns the name of the lexer from its Config.
func (r *RegexLexer) String() string {
	return r.config.Name
}
317
318// Rules in the Lexer.
319func (r *RegexLexer) Rules() (Rules, error) {
320	if err := r.needRules(); err != nil {
321		return nil, err
322	}
323	return r.rawRules, nil
324}
325
// SetRegistry the lexer will use to lookup other lexers if necessary.
// It returns the receiver for chaining.
func (r *RegexLexer) SetRegistry(registry *LexerRegistry) Lexer {
	r.registry = registry
	return r
}
331
// SetAnalyser sets the analyser function used to perform content inspection.
// It returns the receiver for chaining.
func (r *RegexLexer) SetAnalyser(analyser func(text string) float32) Lexer {
	r.analyser = analyser
	return r
}
337
338// AnalyseText scores how likely a fragment of text is to match this lexer, between 0.0 and 1.0.
339func (r *RegexLexer) AnalyseText(text string) float32 {
340	if r.analyser != nil {
341		return r.analyser(text)
342	}
343	return 0
344}
345
// SetConfig replaces the Config for this Lexer.
// It returns the receiver for chaining.
func (r *RegexLexer) SetConfig(config *Config) *RegexLexer {
	r.config = config
	return r
}
351
// Config returns the Config for this Lexer.
func (r *RegexLexer) Config() *Config {
	return r.config
}
356
// Regex compilation is deferred until the lexer is used. This is to avoid significant init() time costs.
func (r *RegexLexer) maybeCompile() (err error) {
	r.mu.Lock()
	defer r.mu.Unlock()
	if r.compiled {
		return nil
	}
	// Compile every rule pattern, applying the inline flags derived from the
	// lexer Config (see fetchRules).
	for state, rules := range r.rules {
		for i, rule := range rules {
			if rule.Regexp == nil {
				pattern := "(?:" + rule.Pattern + ")"
				if rule.flags != "" {
					pattern = "(?" + rule.flags + ")" + pattern
				}
				// \G anchors the match at the search start position (regexp2),
				// so a rule can only match exactly at the current Pos.
				pattern = `\G` + pattern
				rule.Regexp, err = regexp2.Compile(pattern, 0)
				if err != nil {
					return fmt.Errorf("failed to compile rule %s.%d: %s", state, i, err)
				}
				// Bound pathological patterns so a bad regex can't hang the lexer.
				rule.Regexp.MatchTimeout = time.Millisecond * 250
			}
		}
	}
restart:
	// Apply LexerMutators, which may rewrite the rule table itself.
	seen := map[LexerMutator]bool{}
	for state := range r.rules {
		for i := range len(r.rules[state]) {
			rule := r.rules[state][i]
			if compile, ok := rule.Mutator.(LexerMutator); ok {
				// Seeing the same mutator twice means the rewrite did not
				// remove/replace it, which would loop forever.
				if seen[compile] {
					return fmt.Errorf("saw mutator %T twice; this should not happen", compile)
				}
				seen[compile] = true
				if err := compile.MutateLexer(r.rules, state, i); err != nil {
					return err
				}
				// Process the rules again in case the mutator added/removed rules.
				//
				// This sounds bad, but shouldn't be significant in practice.
				goto restart
			}
		}
	}
	// Validate emitters
	for state := range r.rules {
		for i := range len(r.rules[state]) {
			rule := r.rules[state][i]
			if validate, ok := rule.Type.(ValidatingEmitter); ok {
				if err := validate.ValidateEmitter(rule); err != nil {
					return fmt.Errorf("%s: %s: %s: %w", r.config.Name, state, rule.Pattern, err)
				}
			}
		}
	}
	r.compiled = true
	return nil
}
414
415func (r *RegexLexer) fetchRules() error {
416	rules, err := r.fetchRulesFunc()
417	if err != nil {
418		return fmt.Errorf("%s: failed to compile rules: %w", r.config.Name, err)
419	}
420	if _, ok := rules["root"]; !ok {
421		return fmt.Errorf("no \"root\" state")
422	}
423	compiledRules := map[string][]*CompiledRule{}
424	for state, rules := range rules {
425		compiledRules[state] = nil
426		for _, rule := range rules {
427			flags := ""
428			if !r.config.NotMultiline {
429				flags += "m"
430			}
431			if r.config.CaseInsensitive {
432				flags += "i"
433			}
434			if r.config.DotAll {
435				flags += "s"
436			}
437			compiledRules[state] = append(compiledRules[state], &CompiledRule{Rule: rule, flags: flags})
438		}
439	}
440
441	r.rawRules = rules
442	r.rules = compiledRules
443	return nil
444}
445
446func (r *RegexLexer) needRules() error {
447	var err error
448	if r.fetchRulesFunc != nil {
449		r.compileOnce.Do(func() {
450			r.compileError = r.fetchRules()
451		})
452		if r.compileError != nil {
453			return r.compileError
454		}
455	}
456	if err := r.maybeCompile(); err != nil {
457		return err
458	}
459	return err
460}
461
462// Tokenise text using lexer, returning an iterator.
463func (r *RegexLexer) Tokenise(options *TokeniseOptions, text string) (Iterator, error) {
464	err := r.needRules()
465	if err != nil {
466		return nil, err
467	}
468	if options == nil {
469		options = defaultOptions
470	}
471	if options.EnsureLF {
472		text = ensureLF(text)
473	}
474	newlineAdded := false
475	if !options.Nested && r.config.EnsureNL && !strings.HasSuffix(text, "\n") {
476		text += "\n"
477		newlineAdded = true
478	}
479	state := &LexerState{
480		Registry:       r.registry,
481		newlineAdded:   newlineAdded,
482		options:        options,
483		Lexer:          r,
484		Text:           []rune(text),
485		Stack:          []string{options.State},
486		Rules:          r.rules,
487		MutatorContext: map[interface{}]interface{}{},
488	}
489	return state.Iterator, nil
490}
491
492// MustRules is like Rules() but will panic on error.
493func (r *RegexLexer) MustRules() Rules {
494	rules, err := r.Rules()
495	if err != nil {
496		panic(err)
497	}
498	return rules
499}
500
501func matchRules(text []rune, pos int, rules []*CompiledRule) (int, *CompiledRule, []string, map[string]string) {
502	for i, rule := range rules {
503		match, err := rule.Regexp.FindRunesMatchStartingAt(text, pos)
504		if match != nil && err == nil && match.Index == pos {
505			groups := []string{}
506			namedGroups := make(map[string]string)
507			for _, g := range match.Groups() {
508				namedGroups[g.Name] = g.String()
509				groups = append(groups, g.String())
510			}
511			return i, rule, groups, namedGroups
512		}
513	}
514	return 0, &CompiledRule{}, nil, nil
515}
516
// ensureLF normalises line endings: "\r\n" and bare "\r" both become "\n".
// Equivalent to chained strings.ReplaceAll calls, but in a single pass.
func ensureLF(text string) string {
	out := make([]byte, 0, len(text))
	for i := 0; i < len(text); i++ {
		ch := text[i]
		if ch == '\r' {
			// Part of "\r\n": drop the '\r' and let the '\n' through.
			if i+1 < len(text) && text[i+1] == '\n' {
				continue
			}
			// Bare '\r': rewrite as '\n'.
			ch = '\n'
		}
		out = append(out, ch)
	}
	return string(out)
}