1package chroma
  2
  3import (
  4	"fmt"
  5	"os"
  6	"path/filepath"
  7	"regexp"
  8	"sort"
  9	"strings"
 10	"sync"
 11	"time"
 12	"unicode/utf8"
 13
 14	"github.com/dlclark/regexp2"
 15)
 16
// A Rule is the fundamental matching unit of the Regex lexer state machine.
//
// Pattern is the regular expression source matched against the input at the
// current position (it is compiled lazily, anchored with \G, by the lexer).
// Type emits the token(s) for a match, and Mutator, when non-nil, is applied
// after a match to alter the lexer state (e.g. push/pop the state stack).
type Rule struct {
	Pattern string
	Type    Emitter
	Mutator Mutator
}
 23
 24// Words creates a regex that matches any of the given literal words.
 25func Words(prefix, suffix string, words ...string) string {
 26	sort.Slice(words, func(i, j int) bool {
 27		return len(words[j]) < len(words[i])
 28	})
 29	for i, word := range words {
 30		words[i] = regexp.QuoteMeta(word)
 31	}
 32	return prefix + `(` + strings.Join(words, `|`) + `)` + suffix
 33}
 34
 35// Tokenise text using lexer, returning tokens as a slice.
 36func Tokenise(lexer Lexer, options *TokeniseOptions, text string) ([]Token, error) {
 37	var out []Token
 38	it, err := lexer.Tokenise(options, text)
 39	if err != nil {
 40		return nil, err
 41	}
 42	for t := it(); t != EOF; t = it() {
 43		out = append(out, t)
 44	}
 45	return out, nil
 46}
 47
// Rules maps from state to a sequence of Rules. Each key is a state name;
// the rules for the active state are tried in order at each position.
type Rules map[string][]Rule
 50
 51// Rename clones rules then a rule.
 52func (r Rules) Rename(oldRule, newRule string) Rules {
 53	r = r.Clone()
 54	r[newRule] = r[oldRule]
 55	delete(r, oldRule)
 56	return r
 57}
 58
 59// Clone returns a clone of the Rules.
 60func (r Rules) Clone() Rules {
 61	out := map[string][]Rule{}
 62	for key, rules := range r {
 63		out[key] = make([]Rule, len(rules))
 64		copy(out[key], rules)
 65	}
 66	return out
 67}
 68
 69// Merge creates a clone of "r" then merges "rules" into the clone.
 70func (r Rules) Merge(rules Rules) Rules {
 71	out := r.Clone()
 72	for k, v := range rules.Clone() {
 73		out[k] = v
 74	}
 75	return out
 76}
 77
 78// MustNewLexer creates a new Lexer with deferred rules generation or panics.
 79func MustNewLexer(config *Config, rulesFunc func() Rules) *RegexLexer {
 80	lexer, err := NewLexer(config, rulesFunc)
 81	if err != nil {
 82		panic(err)
 83	}
 84	return lexer
 85}
 86
 87// NewLexer creates a new regex-based Lexer.
 88//
 89// "rules" is a state machine transition map. Each key is a state. Values are sets of rules
 90// that match input, optionally modify lexer state, and output tokens.
 91func NewLexer(config *Config, rulesFunc func() Rules) (*RegexLexer, error) {
 92	if config == nil {
 93		config = &Config{}
 94	}
 95	for _, glob := range append(config.Filenames, config.AliasFilenames...) {
 96		_, err := filepath.Match(glob, "")
 97		if err != nil {
 98			return nil, fmt.Errorf("%s: %q is not a valid glob: %w", config.Name, glob, err)
 99		}
100	}
101	r := &RegexLexer{
102		config:         config,
103		fetchRulesFunc: func() (Rules, error) { return rulesFunc(), nil },
104	}
105	// One-off code to generate XML lexers in the Chroma source tree.
106	// var nameCleanRe = regexp.MustCompile(`[^-+A-Za-z0-9_]`)
107	// name := strings.ToLower(nameCleanRe.ReplaceAllString(config.Name, "_"))
108	// data, err := Marshal(r)
109	// if err != nil {
110	// 	if errors.Is(err, ErrNotSerialisable) {
111	// 		fmt.Fprintf(os.Stderr, "warning: %q: %s\n", name, err)
112	// 		return r, nil
113	// 	}
114	// 	return nil, err
115	// }
116	// _, file, _, ok := runtime.Caller(2)
117	// if !ok {
118	// 	panic("??")
119	// }
120	// fmt.Println(file)
121	// if strings.Contains(file, "/lexers/") {
122	// 	dir := filepath.Join(filepath.Dir(file), "embedded")
123	// 	err = os.MkdirAll(dir, 0700)
124	// 	if err != nil {
125	// 		return nil, err
126	// 	}
127	// 	filename := filepath.Join(dir, name) + ".xml"
128	// 	fmt.Println(filename)
129	// 	err = ioutil.WriteFile(filename, data, 0600)
130	// 	if err != nil {
131	// 		return nil, err
132	// 	}
133	// }
134	return r, nil
135}
136
137// Trace enables debug tracing.
138func (r *RegexLexer) Trace(trace bool) *RegexLexer {
139	r.trace = trace
140	return r
141}
142
// A CompiledRule is a Rule with a pre-compiled regex.
//
// Note that regular expressions are lazily compiled on first use of the lexer.
type CompiledRule struct {
	Rule
	Regexp *regexp2.Regexp // Compiled form of Rule.Pattern; nil until maybeCompile runs.
	flags  string          // Inline regex flags ("m"/"i"/"s") derived from the lexer Config.
}
151
// CompiledRules is a map of rule name to sequence of compiled rules in that rule.
type CompiledRules map[string][]*CompiledRule
154
// LexerState contains the state for a single lex.
//
// One LexerState is created per Tokenise call; its Iterator method is the
// Iterator handed back to the caller.
type LexerState struct {
	Lexer    *RegexLexer    // Owning lexer.
	Registry *LexerRegistry // The LexerRegistry this lex is associated with, if any.
	Text     []rune         // Input being tokenised.
	Pos      int            // Current rune offset into Text.
	Rules    CompiledRules  // Per-state compiled rules, shared with the owning lexer.
	Stack    []string       // State stack; the top entry is the active state.
	State    string         // Name of the active state (top of Stack).
	Rule     int            // Index of the rule that matched most recently.
	// Group matches.
	Groups []string
	// Named Group matches.
	NamedGroups map[string]string
	// Custom context for mutators.
	MutatorContext map[interface{}]interface{}
	iteratorStack  []Iterator       // Pending sub-iterators, drained LIFO by Iterator.
	options        *TokeniseOptions // Options for this tokenise run.
	newlineAdded   bool             // True if a synthetic trailing "\n" was appended to Text.
}
175
176// Set mutator context.
177func (l *LexerState) Set(key interface{}, value interface{}) {
178	l.MutatorContext[key] = value
179}
180
181// Get mutator context.
182func (l *LexerState) Get(key interface{}) interface{} {
183	return l.MutatorContext[key]
184}
185
// Iterator returns the next Token from the lexer.
//
// Each call either drains a pending sub-iterator (pushed by a rule's
// Emitter), matches the next rule in the active state at the current
// position, recovers from a non-matching position, or returns EOF once the
// input is exhausted.
func (l *LexerState) Iterator() Token { // nolint: gocognit
	// If Tokenise appended a synthetic trailing "\n" (EnsureNL), stop
	// matching one rune early so that newline is never emitted as a token.
	end := len(l.Text)
	if l.newlineAdded {
		end--
	}
	for l.Pos < end && len(l.Stack) > 0 {
		// Exhaust the iterator stack, if any.
		for len(l.iteratorStack) > 0 {
			n := len(l.iteratorStack) - 1
			t := l.iteratorStack[n]()
			if t == EOF {
				l.iteratorStack = l.iteratorStack[:n]
				continue
			}
			return t
		}

		// The active state is the top of the state stack.
		l.State = l.Stack[len(l.Stack)-1]
		if l.Lexer.trace {
			fmt.Fprintf(os.Stderr, "%s: pos=%d, text=%q\n", l.State, l.Pos, string(l.Text[l.Pos:]))
		}
		selectedRule, ok := l.Rules[l.State]
		if !ok {
			panic("unknown state " + l.State)
		}
		ruleIndex, rule, groups, namedGroups := matchRules(l.Text, l.Pos, selectedRule)
		// No match.
		if groups == nil {
			// From Pygments :\
			//
			// If the RegexLexer encounters a newline that is flagged as an error token, the stack is
			// emptied and the lexer continues scanning in the 'root' state. This can help producing
			// error-tolerant highlighting for erroneous input, e.g. when a single-line string is not
			// closed.
			if l.Text[l.Pos] == '\n' && l.State != l.options.State {
				l.Stack = []string{l.options.State}
				continue
			}
			// Otherwise emit the offending rune as an Error token and advance.
			l.Pos++
			return Token{Error, string(l.Text[l.Pos-1 : l.Pos])}
		}
		l.Rule = ruleIndex
		l.Groups = groups
		l.NamedGroups = namedGroups
		// groups[0] is the whole match; advance by its length in runes
		// (Pos indexes runes, not bytes).
		l.Pos += utf8.RuneCountInString(groups[0])
		// Apply any state mutation before emitting tokens for the match.
		if rule.Mutator != nil {
			if err := rule.Mutator.Mutate(l); err != nil {
				panic(err)
			}
		}
		// The Emitter's output is pushed and drained at the top of the loop.
		if rule.Type != nil {
			l.iteratorStack = append(l.iteratorStack, rule.Type.Emit(l.Groups, l))
		}
	}
	// Exhaust the IteratorStack, if any.
	// Duplicate code, but eh.
	for len(l.iteratorStack) > 0 {
		n := len(l.iteratorStack) - 1
		t := l.iteratorStack[n]()
		if t == EOF {
			l.iteratorStack = l.iteratorStack[:n]
			continue
		}
		return t
	}

	// If we get to here and we still have text, return it as an error.
	if l.Pos != len(l.Text) && len(l.Stack) == 0 {
		value := string(l.Text[l.Pos:])
		l.Pos = len(l.Text)
		return Token{Type: Error, Value: value}
	}
	return EOF
}
261
// RegexLexer is the default lexer implementation used in Chroma.
type RegexLexer struct {
	registry *LexerRegistry // The LexerRegistry this Lexer is associated with, if any.
	config   *Config
	analyser func(text string) float32 // Optional content-inspection override used by AnalyseText.
	trace    bool                      // When true, Iterator logs each step to stderr.

	mu             sync.Mutex // Guards the lazy compilation performed by maybeCompile.
	compiled       bool       // True once maybeCompile has completed successfully.
	rawRules       Rules      // Uncompiled rules, as produced by fetchRulesFunc.
	rules          map[string][]*CompiledRule
	fetchRulesFunc func() (Rules, error) // Deferred rules generator; run at most once (see needRules).
	compileOnce    sync.Once             // Ensures fetchRules is invoked only once.
}
276
277func (r *RegexLexer) String() string {
278	return r.config.Name
279}
280
281// Rules in the Lexer.
282func (r *RegexLexer) Rules() (Rules, error) {
283	if err := r.needRules(); err != nil {
284		return nil, err
285	}
286	return r.rawRules, nil
287}
288
289// SetRegistry the lexer will use to lookup other lexers if necessary.
290func (r *RegexLexer) SetRegistry(registry *LexerRegistry) Lexer {
291	r.registry = registry
292	return r
293}
294
295// SetAnalyser sets the analyser function used to perform content inspection.
296func (r *RegexLexer) SetAnalyser(analyser func(text string) float32) Lexer {
297	r.analyser = analyser
298	return r
299}
300
301func (r *RegexLexer) AnalyseText(text string) float32 { // nolint
302	if r.analyser != nil {
303		return r.analyser(text)
304	}
305	return 0.0
306}
307
308// SetConfig replaces the Config for this Lexer.
309func (r *RegexLexer) SetConfig(config *Config) *RegexLexer {
310	r.config = config
311	return r
312}
313
314func (r *RegexLexer) Config() *Config { // nolint
315	return r.config
316}
317
// Regex compilation is deferred until the lexer is used. This is to avoid significant init() time costs.
//
// maybeCompile is idempotent and guarded by r.mu; once r.compiled is set,
// subsequent calls return immediately.
func (r *RegexLexer) maybeCompile() (err error) {
	r.mu.Lock()
	defer r.mu.Unlock()
	if r.compiled {
		return nil
	}
	// Compile every not-yet-compiled rule. Each pattern is wrapped in a
	// non-capturing group, prefixed with the per-lexer inline flags, and
	// anchored with \G so the match must begin exactly at the search position.
	for state, rules := range r.rules {
		for i, rule := range rules {
			if rule.Regexp == nil {
				pattern := "(?:" + rule.Pattern + ")"
				if rule.flags != "" {
					pattern = "(?" + rule.flags + ")" + pattern
				}
				pattern = `\G` + pattern
				rule.Regexp, err = regexp2.Compile(pattern, regexp2.RE2)
				if err != nil {
					return fmt.Errorf("failed to compile rule %s.%d: %s", state, i, err)
				}
				// Bound matching time so a pathological pattern cannot hang the lexer.
				rule.Regexp.MatchTimeout = time.Millisecond * 250
			}
		}
	}
restart:
	// Apply LexerMutators, restarting the scan after each one since a mutator
	// may add or remove rules.
	//
	// NOTE(review): "seen" is re-initialised each time execution jumps back to
	// the restart label, so the duplicate-mutator error below appears
	// unreachable as written — confirm whether the declaration was intended to
	// sit above the label.
	seen := map[LexerMutator]bool{}
	for state := range r.rules {
		for i := 0; i < len(r.rules[state]); i++ {
			rule := r.rules[state][i]
			if compile, ok := rule.Mutator.(LexerMutator); ok {
				if seen[compile] {
					return fmt.Errorf("saw mutator %T twice; this should not happen", compile)
				}
				seen[compile] = true
				if err := compile.MutateLexer(r.rules, state, i); err != nil {
					return err
				}
				// Process the rules again in case the mutator added/removed rules.
				//
				// This sounds bad, but shouldn't be significant in practice.
				goto restart
			}
		}
	}
	r.compiled = true
	return nil
}
364
365func (r *RegexLexer) fetchRules() error {
366	rules, err := r.fetchRulesFunc()
367	if err != nil {
368		return fmt.Errorf("%s: failed to compile rules: %w", r.config.Name, err)
369	}
370	if _, ok := rules["root"]; !ok {
371		return fmt.Errorf("no \"root\" state")
372	}
373	compiledRules := map[string][]*CompiledRule{}
374	for state, rules := range rules {
375		compiledRules[state] = nil
376		for _, rule := range rules {
377			flags := ""
378			if !r.config.NotMultiline {
379				flags += "m"
380			}
381			if r.config.CaseInsensitive {
382				flags += "i"
383			}
384			if r.config.DotAll {
385				flags += "s"
386			}
387			compiledRules[state] = append(compiledRules[state], &CompiledRule{Rule: rule, flags: flags})
388		}
389	}
390
391	r.rawRules = rules
392	r.rules = compiledRules
393	return nil
394}
395
396func (r *RegexLexer) needRules() error {
397	var err error
398	if r.fetchRulesFunc != nil {
399		r.compileOnce.Do(func() {
400			err = r.fetchRules()
401		})
402	}
403	if err := r.maybeCompile(); err != nil {
404		return err
405	}
406	return err
407}
408
409func (r *RegexLexer) Tokenise(options *TokeniseOptions, text string) (Iterator, error) { // nolint
410	err := r.needRules()
411	if err != nil {
412		return nil, err
413	}
414	if options == nil {
415		options = defaultOptions
416	}
417	if options.EnsureLF {
418		text = ensureLF(text)
419	}
420	newlineAdded := false
421	if !options.Nested && r.config.EnsureNL && !strings.HasSuffix(text, "\n") {
422		text += "\n"
423		newlineAdded = true
424	}
425	state := &LexerState{
426		Registry:       r.registry,
427		newlineAdded:   newlineAdded,
428		options:        options,
429		Lexer:          r,
430		Text:           []rune(text),
431		Stack:          []string{options.State},
432		Rules:          r.rules,
433		MutatorContext: map[interface{}]interface{}{},
434	}
435	return state.Iterator, nil
436}
437
438// MustRules is like Rules() but will panic on error.
439func (r *RegexLexer) MustRules() Rules {
440	rules, err := r.Rules()
441	if err != nil {
442		panic(err)
443	}
444	return rules
445}
446
447func matchRules(text []rune, pos int, rules []*CompiledRule) (int, *CompiledRule, []string, map[string]string) {
448	for i, rule := range rules {
449		match, err := rule.Regexp.FindRunesMatchStartingAt(text, pos)
450		if match != nil && err == nil && match.Index == pos {
451			groups := []string{}
452			namedGroups := make(map[string]string)
453			for _, g := range match.Groups() {
454				namedGroups[g.Name] = g.String()
455				groups = append(groups, g.String())
456			}
457			return i, rule, groups, namedGroups
458		}
459	}
460	return 0, &CompiledRule{}, nil, nil
461}
462
// ensureLF normalises line endings: "\r\n" and lone "\r" both become "\n".
// Equivalent to chained strings.ReplaceAll calls, but in a single pass.
func ensureLF(text string) string {
	var b strings.Builder
	b.Grow(len(text))
	for i := 0; i < len(text); i++ {
		switch c := text[i]; {
		case c == '\r' && i+1 < len(text) && text[i+1] == '\n':
			// Drop the '\r'; the following '\n' is written on the next pass.
		case c == '\r':
			b.WriteByte('\n')
		default:
			b.WriteByte(c)
		}
	}
	return b.String()
}