aboutsummaryrefslogtreecommitdiff
path: root/vendor/github.com/mitjafelicijan/go-tree-sitter/bash/scanner.c
diff options
context:
space:
mode:
Diffstat (limited to 'vendor/github.com/mitjafelicijan/go-tree-sitter/bash/scanner.c')
-rw-r--r--vendor/github.com/mitjafelicijan/go-tree-sitter/bash/scanner.c1213
1 files changed, 1213 insertions, 0 deletions
diff --git a/vendor/github.com/mitjafelicijan/go-tree-sitter/bash/scanner.c b/vendor/github.com/mitjafelicijan/go-tree-sitter/bash/scanner.c
new file mode 100644
index 0000000..c123aec
--- /dev/null
+++ b/vendor/github.com/mitjafelicijan/go-tree-sitter/bash/scanner.c
@@ -0,0 +1,1213 @@
1#include "../array.h"
2#include "parser.h"
3
4#include <assert.h>
5#include <ctype.h>
6#include <string.h>
7#include <wctype.h>
8
/**
 * External token kinds handled by this scanner.
 *
 * Each enumerator is used both as an index into the `valid_symbols` array
 * passed to `scan` and as the value stored in `lexer->result_symbol`.
 * NOTE(review): the order presumably has to match the `externals` list of
 * the grammar -- confirm before reordering.
 */
enum TokenType {
    /* here-documents */
    HEREDOC_START = 0,
    SIMPLE_HEREDOC_BODY,
    HEREDOC_BODY_BEGINNING,
    HEREDOC_CONTENT,
    HEREDOC_END,

    /* words, names and operators */
    FILE_DESCRIPTOR,
    EMPTY_VALUE,
    CONCAT,
    VARIABLE_NAME,
    TEST_OPERATOR,

    /* patterns */
    REGEX,
    REGEX_NO_SLASH,
    REGEX_NO_SPACE,
    EXPANSION_WORD,
    EXTGLOB_PATTERN,

    /* expansions */
    BARE_DOLLAR,
    BRACE_START,
    IMMEDIATE_DOUBLE_HASH,
    EXTERNAL_EXPANSION_SYM_HASH,
    EXTERNAL_EXPANSION_SYM_BANG,
    EXTERNAL_EXPANSION_SYM_EQUAL,

    /* only consulted through valid_symbols in this file, never emitted here
       (HEREDOC_ARROW / HEREDOC_ARROW_DASH below are emitted) */
    CLOSING_BRACE,
    CLOSING_BRACKET,
    HEREDOC_ARROW,
    HEREDOC_ARROW_DASH,
    NEWLINE,
    OPENING_PAREN,
    ESAC,

    /* valid only while the parser is recovering from an error */
    ERROR_RECOVERY,
};
40
41typedef Array(char) String;
42
43typedef struct {
44 bool is_raw;
45 bool started;
46 bool allows_indent;
47 String delimiter;
48 String current_leading_word;
49} Heredoc;
50
/**
 * Brace initializer for an empty Heredoc: all flags cleared, both strings
 * empty.
 *
 * The expansion intentionally carries no trailing semicolon, so that
 * `Heredoc h = heredoc_new();` expands to exactly one declaration instead of
 * a declaration followed by a stray empty statement.
 */
#define heredoc_new()                        \
    {                                        \
        .is_raw = false,                     \
        .started = false,                    \
        .allows_indent = false,              \
        .delimiter = array_new(),            \
        .current_leading_word = array_new(), \
    }
59
60typedef struct {
61 uint8_t last_glob_paren_depth;
62 bool ext_was_in_double_quote;
63 bool ext_saw_outside_quote;
64 Array(Heredoc) heredocs;
65} Scanner;
66
67static inline void advance(TSLexer *lexer) { lexer->advance(lexer, false); }
68
69static inline void skip(TSLexer *lexer) { lexer->advance(lexer, true); }
70
71static inline bool in_error_recovery(const bool *valid_symbols) { return valid_symbols[ERROR_RECOVERY]; }
72
73static inline void reset_string(String *string) {
74 if (string->size > 0) {
75 memset(string->contents, 0, string->size);
76 array_clear(string);
77 }
78}
79
80static inline void reset_heredoc(Heredoc *heredoc) {
81 heredoc->is_raw = false;
82 heredoc->started = false;
83 heredoc->allows_indent = false;
84 reset_string(&heredoc->delimiter);
85}
86
87static inline void reset(Scanner *scanner) {
88 for (uint32_t i = 0; i < scanner->heredocs.size; i++) {
89 reset_heredoc(array_get(&scanner->heredocs, i));
90 }
91}
92
93static unsigned serialize(Scanner *scanner, char *buffer) {
94 uint32_t size = 0;
95
96 buffer[size++] = (char)scanner->last_glob_paren_depth;
97 buffer[size++] = (char)scanner->ext_was_in_double_quote;
98 buffer[size++] = (char)scanner->ext_saw_outside_quote;
99 buffer[size++] = (char)scanner->heredocs.size;
100
101 for (uint32_t i = 0; i < scanner->heredocs.size; i++) {
102 Heredoc *heredoc = array_get(&scanner->heredocs, i);
103 if (heredoc->delimiter.size + 3 + size >= TREE_SITTER_SERIALIZATION_BUFFER_SIZE) {
104 return 0;
105 }
106
107 buffer[size++] = (char)heredoc->is_raw;
108 buffer[size++] = (char)heredoc->started;
109 buffer[size++] = (char)heredoc->allows_indent;
110
111 memcpy(&buffer[size], &heredoc->delimiter.size, sizeof(uint32_t));
112 size += sizeof(uint32_t);
113 memcpy(&buffer[size], heredoc->delimiter.contents, heredoc->delimiter.size);
114 size += heredoc->delimiter.size;
115 }
116 return size;
117}
118
119static void deserialize(Scanner *scanner, const char *buffer, unsigned length) {
120 if (length == 0) {
121 reset(scanner);
122 } else {
123 uint32_t size = 0;
124 scanner->last_glob_paren_depth = buffer[size++];
125 scanner->ext_was_in_double_quote = buffer[size++];
126 scanner->ext_saw_outside_quote = buffer[size++];
127 uint32_t heredoc_count = (unsigned char)buffer[size++];
128 for (uint32_t i = 0; i < heredoc_count; i++) {
129 Heredoc *heredoc = NULL;
130 if (i < scanner->heredocs.size) {
131 heredoc = array_get(&scanner->heredocs, i);
132 } else {
133 Heredoc new_heredoc = heredoc_new();
134 array_push(&scanner->heredocs, new_heredoc);
135 heredoc = array_back(&scanner->heredocs);
136 }
137
138 heredoc->is_raw = buffer[size++];
139 heredoc->started = buffer[size++];
140 heredoc->allows_indent = buffer[size++];
141
142 memcpy(&heredoc->delimiter.size, &buffer[size], sizeof(uint32_t));
143 size += sizeof(uint32_t);
144 array_reserve(&heredoc->delimiter, heredoc->delimiter.size);
145
146 memcpy(heredoc->delimiter.contents, &buffer[size], heredoc->delimiter.size);
147 size += heredoc->delimiter.size;
148 }
149 assert(size == length);
150 }
151}
152
153/**
154 * Consume a "word" in POSIX parlance, and returns it unquoted.
155 *
156 * This is an approximate implementation that doesn't deal with any
157 * POSIX-mandated substitution, and assumes the default value for
158 * IFS.
159 */
160static bool advance_word(TSLexer *lexer, String *unquoted_word) {
161 bool empty = true;
162
163 int32_t quote = 0;
164 if (lexer->lookahead == '\'' || lexer->lookahead == '"') {
165 quote = lexer->lookahead;
166 advance(lexer);
167 }
168
169 while (lexer->lookahead &&
170 !(quote ? lexer->lookahead == quote || lexer->lookahead == '\r' || lexer->lookahead == '\n'
171 : iswspace(lexer->lookahead))) {
172 if (lexer->lookahead == '\\') {
173 advance(lexer);
174 if (!lexer->lookahead) {
175 return false;
176 }
177 }
178 empty = false;
179 array_push(unquoted_word, lexer->lookahead);
180 advance(lexer);
181 }
182 array_push(unquoted_word, '\0');
183
184 if (quote && lexer->lookahead == quote) {
185 advance(lexer);
186 }
187
188 return !empty;
189}
190
191static inline bool scan_bare_dollar(TSLexer *lexer) {
192 while (iswspace(lexer->lookahead) && lexer->lookahead != '\n' && !lexer->eof(lexer)) {
193 skip(lexer);
194 }
195
196 if (lexer->lookahead == '$') {
197 advance(lexer);
198 lexer->result_symbol = BARE_DOLLAR;
199 lexer->mark_end(lexer);
200 return iswspace(lexer->lookahead) || lexer->eof(lexer) || lexer->lookahead == '\"';
201 }
202
203 return false;
204}
205
206static bool scan_heredoc_start(Heredoc *heredoc, TSLexer *lexer) {
207 while (iswspace(lexer->lookahead)) {
208 skip(lexer);
209 }
210
211 lexer->result_symbol = HEREDOC_START;
212 heredoc->is_raw = lexer->lookahead == '\'' || lexer->lookahead == '"' || lexer->lookahead == '\\';
213
214 bool found_delimiter = advance_word(lexer, &heredoc->delimiter);
215 if (!found_delimiter) {
216 reset_string(&heredoc->delimiter);
217 return false;
218 }
219 return found_delimiter;
220}
221
222static bool scan_heredoc_end_identifier(Heredoc *heredoc, TSLexer *lexer) {
223 reset_string(&heredoc->current_leading_word);
224 // Scan the first 'n' characters on this line, to see if they match the
225 // heredoc delimiter
226 int32_t size = 0;
227 if (heredoc->delimiter.size > 0) {
228 while (lexer->lookahead != '\0' && lexer->lookahead != '\n' &&
229 (int32_t)*array_get(&heredoc->delimiter, size) == lexer->lookahead &&
230 heredoc->current_leading_word.size < heredoc->delimiter.size) {
231 array_push(&heredoc->current_leading_word, lexer->lookahead);
232 advance(lexer);
233 size++;
234 }
235 }
236 array_push(&heredoc->current_leading_word, '\0');
237 return heredoc->delimiter.size == 0
238 ? false
239 : strcmp(heredoc->current_leading_word.contents, heredoc->delimiter.contents) == 0;
240}
241
242static bool scan_heredoc_content(Scanner *scanner, TSLexer *lexer, enum TokenType middle_type,
243 enum TokenType end_type) {
244 bool did_advance = false;
245 Heredoc *heredoc = array_back(&scanner->heredocs);
246
247 for (;;) {
248 switch (lexer->lookahead) {
249 case '\0': {
250 if (lexer->eof(lexer) && did_advance) {
251 reset_heredoc(heredoc);
252 lexer->result_symbol = end_type;
253 return true;
254 }
255 return false;
256 }
257
258 case '\\': {
259 did_advance = true;
260 advance(lexer);
261 advance(lexer);
262 break;
263 }
264
265 case '$': {
266 if (heredoc->is_raw) {
267 did_advance = true;
268 advance(lexer);
269 break;
270 }
271 if (did_advance) {
272 lexer->mark_end(lexer);
273 lexer->result_symbol = middle_type;
274 heredoc->started = true;
275 advance(lexer);
276 if (iswalpha(lexer->lookahead) || lexer->lookahead == '{' || lexer->lookahead == '(') {
277 return true;
278 }
279 break;
280 }
281 if (middle_type == HEREDOC_BODY_BEGINNING && lexer->get_column(lexer) == 0) {
282 lexer->result_symbol = middle_type;
283 heredoc->started = true;
284 return true;
285 }
286 return false;
287 }
288
289 case '\n': {
290 if (!did_advance) {
291 skip(lexer);
292 } else {
293 advance(lexer);
294 }
295 did_advance = true;
296 if (heredoc->allows_indent) {
297 while (iswspace(lexer->lookahead)) {
298 advance(lexer);
299 }
300 }
301 lexer->result_symbol = heredoc->started ? middle_type : end_type;
302 lexer->mark_end(lexer);
303 if (scan_heredoc_end_identifier(heredoc, lexer)) {
304 if (lexer->result_symbol == HEREDOC_END) {
305 array_pop(&scanner->heredocs);
306 }
307 return true;
308 }
309 break;
310 }
311
312 default: {
313 if (lexer->get_column(lexer) == 0) {
314 // an alternative is to check the starting column of the
315 // heredoc body and track that statefully
316 while (iswspace(lexer->lookahead)) {
317 if (did_advance) {
318 advance(lexer);
319 } else {
320 skip(lexer);
321 }
322 }
323 if (end_type != SIMPLE_HEREDOC_BODY) {
324 lexer->result_symbol = middle_type;
325 if (scan_heredoc_end_identifier(heredoc, lexer)) {
326 return true;
327 }
328 }
329 if (end_type == SIMPLE_HEREDOC_BODY) {
330 lexer->result_symbol = end_type;
331 lexer->mark_end(lexer);
332 if (scan_heredoc_end_identifier(heredoc, lexer)) {
333 return true;
334 }
335 }
336 }
337 did_advance = true;
338 advance(lexer);
339 break;
340 }
341 }
342 }
343}
344
345static bool scan(Scanner *scanner, TSLexer *lexer, const bool *valid_symbols) {
346 if (valid_symbols[CONCAT] && !in_error_recovery(valid_symbols)) {
347 if (!(lexer->lookahead == 0 || iswspace(lexer->lookahead) || lexer->lookahead == '>' ||
348 lexer->lookahead == '<' || lexer->lookahead == ')' || lexer->lookahead == '(' ||
349 lexer->lookahead == ';' || lexer->lookahead == '&' || lexer->lookahead == '|' ||
350 (lexer->lookahead == '}' && valid_symbols[CLOSING_BRACE]) ||
351 (lexer->lookahead == ']' && valid_symbols[CLOSING_BRACKET]))) {
352 lexer->result_symbol = CONCAT;
353 // So for a`b`, we want to return a concat. We check if the
354 // 2nd backtick has whitespace after it, and if it does we
355 // return concat.
356 if (lexer->lookahead == '`') {
357 lexer->mark_end(lexer);
358 advance(lexer);
359 while (lexer->lookahead != '`' && !lexer->eof(lexer)) {
360 advance(lexer);
361 }
362 if (lexer->eof(lexer)) {
363 return false;
364 }
365 if (lexer->lookahead == '`') {
366 advance(lexer);
367 }
368 return iswspace(lexer->lookahead) || lexer->eof(lexer);
369 }
370 // strings w/ expansions that contains escaped quotes or
371 // backslashes need this to return a concat
372 if (lexer->lookahead == '\\') {
373 lexer->mark_end(lexer);
374 advance(lexer);
375 if (lexer->lookahead == '"' || lexer->lookahead == '\'' || lexer->lookahead == '\\') {
376 return true;
377 }
378 if (lexer->eof(lexer)) {
379 return false;
380 }
381 } else {
382 return true;
383 }
384 }
385 if (iswspace(lexer->lookahead) && valid_symbols[CLOSING_BRACE] && !valid_symbols[EXPANSION_WORD]) {
386 lexer->result_symbol = CONCAT;
387 return true;
388 }
389 }
390
391 if (valid_symbols[IMMEDIATE_DOUBLE_HASH] && !in_error_recovery(valid_symbols)) {
392 // advance two # and ensure not } after
393 if (lexer->lookahead == '#') {
394 lexer->mark_end(lexer);
395 advance(lexer);
396 if (lexer->lookahead == '#') {
397 advance(lexer);
398 if (lexer->lookahead != '}') {
399 lexer->result_symbol = IMMEDIATE_DOUBLE_HASH;
400 lexer->mark_end(lexer);
401 return true;
402 }
403 }
404 }
405 }
406
407 if (valid_symbols[EXTERNAL_EXPANSION_SYM_HASH] && !in_error_recovery(valid_symbols)) {
408 if (lexer->lookahead == '#' || lexer->lookahead == '=' || lexer->lookahead == '!') {
409 lexer->result_symbol = lexer->lookahead == '#' ? EXTERNAL_EXPANSION_SYM_HASH
410 : lexer->lookahead == '!' ? EXTERNAL_EXPANSION_SYM_BANG
411 : EXTERNAL_EXPANSION_SYM_EQUAL;
412 advance(lexer);
413 lexer->mark_end(lexer);
414 while (lexer->lookahead == '#' || lexer->lookahead == '=' || lexer->lookahead == '!') {
415 advance(lexer);
416 }
417 while (iswspace(lexer->lookahead)) {
418 skip(lexer);
419 }
420 if (lexer->lookahead == '}') {
421 return true;
422 }
423 return false;
424 }
425 }
426
427 if (valid_symbols[EMPTY_VALUE]) {
428 if (iswspace(lexer->lookahead) || lexer->eof(lexer) || lexer->lookahead == ';' || lexer->lookahead == '&') {
429 lexer->result_symbol = EMPTY_VALUE;
430 return true;
431 }
432 }
433
434 if ((valid_symbols[HEREDOC_BODY_BEGINNING] || valid_symbols[SIMPLE_HEREDOC_BODY]) && scanner->heredocs.size > 0 &&
435 !array_back(&scanner->heredocs)->started && !in_error_recovery(valid_symbols)) {
436 return scan_heredoc_content(scanner, lexer, HEREDOC_BODY_BEGINNING, SIMPLE_HEREDOC_BODY);
437 }
438
439 if (valid_symbols[HEREDOC_END] && scanner->heredocs.size > 0) {
440 Heredoc *heredoc = array_back(&scanner->heredocs);
441 if (scan_heredoc_end_identifier(heredoc, lexer)) {
442 array_delete(&heredoc->current_leading_word);
443 array_delete(&heredoc->delimiter);
444 array_pop(&scanner->heredocs);
445 lexer->result_symbol = HEREDOC_END;
446 return true;
447 }
448 }
449
450 if (valid_symbols[HEREDOC_CONTENT] && scanner->heredocs.size > 0 && array_back(&scanner->heredocs)->started &&
451 !in_error_recovery(valid_symbols)) {
452 return scan_heredoc_content(scanner, lexer, HEREDOC_CONTENT, HEREDOC_END);
453 }
454
455 if (valid_symbols[HEREDOC_START] && !in_error_recovery(valid_symbols) && scanner->heredocs.size > 0) {
456 return scan_heredoc_start(array_back(&scanner->heredocs), lexer);
457 }
458
459 if (valid_symbols[TEST_OPERATOR] && !valid_symbols[EXPANSION_WORD]) {
460 while (iswspace(lexer->lookahead) && lexer->lookahead != '\n') {
461 skip(lexer);
462 }
463
464 if (lexer->lookahead == '\\') {
465 if (valid_symbols[EXTGLOB_PATTERN]) {
466 goto extglob_pattern;
467 }
468 if (valid_symbols[REGEX_NO_SPACE]) {
469 goto regex;
470 }
471 skip(lexer);
472
473 if (lexer->eof(lexer)) {
474 return false;
475 }
476
477 if (lexer->lookahead == '\r') {
478 skip(lexer);
479 if (lexer->lookahead == '\n') {
480 skip(lexer);
481 }
482 } else if (lexer->lookahead == '\n') {
483 skip(lexer);
484 } else {
485 return false;
486 }
487
488 while (iswspace(lexer->lookahead)) {
489 skip(lexer);
490 }
491 }
492
493 if (lexer->lookahead == '\n' && !valid_symbols[NEWLINE]) {
494 skip(lexer);
495
496 while (iswspace(lexer->lookahead)) {
497 skip(lexer);
498 }
499 }
500
501 if (lexer->lookahead == '-') {
502 advance(lexer);
503
504 bool advanced_once = false;
505 while (iswalpha(lexer->lookahead)) {
506 advanced_once = true;
507 advance(lexer);
508 }
509
510 if (iswspace(lexer->lookahead) && advanced_once) {
511 lexer->mark_end(lexer);
512 advance(lexer);
513 if (lexer->lookahead == '}' && valid_symbols[CLOSING_BRACE]) {
514 if (valid_symbols[EXPANSION_WORD]) {
515 lexer->mark_end(lexer);
516 lexer->result_symbol = EXPANSION_WORD;
517 return true;
518 }
519 return false;
520 }
521 lexer->result_symbol = TEST_OPERATOR;
522 return true;
523 }
524 if (iswspace(lexer->lookahead) && valid_symbols[EXTGLOB_PATTERN]) {
525 lexer->result_symbol = EXTGLOB_PATTERN;
526 return true;
527 }
528 }
529
530 if (valid_symbols[BARE_DOLLAR] && !in_error_recovery(valid_symbols) && scan_bare_dollar(lexer)) {
531 return true;
532 }
533 }
534
535 if ((valid_symbols[VARIABLE_NAME] || valid_symbols[FILE_DESCRIPTOR] || valid_symbols[HEREDOC_ARROW]) &&
536 !valid_symbols[REGEX_NO_SLASH] && !in_error_recovery(valid_symbols)) {
537 for (;;) {
538 if ((lexer->lookahead == ' ' || lexer->lookahead == '\t' || lexer->lookahead == '\r' ||
539 (lexer->lookahead == '\n' && !valid_symbols[NEWLINE])) &&
540 !valid_symbols[EXPANSION_WORD]) {
541 skip(lexer);
542 } else if (lexer->lookahead == '\\') {
543 skip(lexer);
544
545 if (lexer->eof(lexer)) {
546 lexer->mark_end(lexer);
547 lexer->result_symbol = VARIABLE_NAME;
548 return true;
549 }
550
551 if (lexer->lookahead == '\r') {
552 skip(lexer);
553 }
554 if (lexer->lookahead == '\n') {
555 skip(lexer);
556 } else {
557 if (lexer->lookahead == '\\' && valid_symbols[EXPANSION_WORD]) {
558 goto expansion_word;
559 }
560 return false;
561 }
562 } else {
563 break;
564 }
565 }
566
567 // no '*', '@', '?', '-', '$', '0', '_'
568 if (!valid_symbols[EXPANSION_WORD] &&
569 (lexer->lookahead == '*' || lexer->lookahead == '@' || lexer->lookahead == '?' || lexer->lookahead == '-' ||
570 lexer->lookahead == '0' || lexer->lookahead == '_')) {
571 lexer->mark_end(lexer);
572 advance(lexer);
573 if (lexer->lookahead == '=' || lexer->lookahead == '[' || lexer->lookahead == ':' ||
574 lexer->lookahead == '-' || lexer->lookahead == '%' || lexer->lookahead == '#' ||
575 lexer->lookahead == '/') {
576 return false;
577 }
578 if (valid_symbols[EXTGLOB_PATTERN] && iswspace(lexer->lookahead)) {
579 lexer->mark_end(lexer);
580 lexer->result_symbol = EXTGLOB_PATTERN;
581 return true;
582 }
583 }
584
585 if (valid_symbols[HEREDOC_ARROW] && lexer->lookahead == '<') {
586 advance(lexer);
587 if (lexer->lookahead == '<') {
588 advance(lexer);
589 if (lexer->lookahead == '-') {
590 advance(lexer);
591 Heredoc heredoc = heredoc_new();
592 heredoc.allows_indent = true;
593 array_push(&scanner->heredocs, heredoc);
594 lexer->result_symbol = HEREDOC_ARROW_DASH;
595 } else if (lexer->lookahead == '<' || lexer->lookahead == '=') {
596 return false;
597 } else {
598 Heredoc heredoc = heredoc_new();
599 array_push(&scanner->heredocs, heredoc);
600 lexer->result_symbol = HEREDOC_ARROW;
601 }
602 return true;
603 }
604 return false;
605 }
606
607 bool is_number = true;
608 if (iswdigit(lexer->lookahead)) {
609 advance(lexer);
610 } else if (iswalpha(lexer->lookahead) || lexer->lookahead == '_') {
611 is_number = false;
612 advance(lexer);
613 } else {
614 if (lexer->lookahead == '{') {
615 goto brace_start;
616 }
617 if (valid_symbols[EXPANSION_WORD]) {
618 goto expansion_word;
619 }
620 if (valid_symbols[EXTGLOB_PATTERN]) {
621 goto extglob_pattern;
622 }
623 return false;
624 }
625
626 for (;;) {
627 if (iswdigit(lexer->lookahead)) {
628 advance(lexer);
629 } else if (iswalpha(lexer->lookahead) || lexer->lookahead == '_') {
630 is_number = false;
631 advance(lexer);
632 } else {
633 break;
634 }
635 }
636
637 if (is_number && valid_symbols[FILE_DESCRIPTOR] && (lexer->lookahead == '>' || lexer->lookahead == '<')) {
638 lexer->result_symbol = FILE_DESCRIPTOR;
639 return true;
640 }
641
642 if (valid_symbols[VARIABLE_NAME]) {
643 if (lexer->lookahead == '+') {
644 lexer->mark_end(lexer);
645 advance(lexer);
646 if (lexer->lookahead == '=' || lexer->lookahead == ':' || valid_symbols[CLOSING_BRACE]) {
647 lexer->result_symbol = VARIABLE_NAME;
648 return true;
649 }
650 return false;
651 }
652 if (lexer->lookahead == '/') {
653 return false;
654 }
655 if (lexer->lookahead == '=' || lexer->lookahead == '[' ||
656 (lexer->lookahead == ':' && !valid_symbols[CLOSING_BRACE] &&
657 !valid_symbols[OPENING_PAREN]) || // TODO(amaanq): more cases for regular word chars but not variable
658 // names for function words, only handling : for now? #235
659 lexer->lookahead == '%' ||
660 (lexer->lookahead == '#' && !is_number) || lexer->lookahead == '@' ||
661 (lexer->lookahead == '-' && valid_symbols[CLOSING_BRACE])) {
662 lexer->mark_end(lexer);
663 lexer->result_symbol = VARIABLE_NAME;
664 return true;
665 }
666
667 if (lexer->lookahead == '?') {
668 lexer->mark_end(lexer);
669 advance(lexer);
670 lexer->result_symbol = VARIABLE_NAME;
671 return iswalpha(lexer->lookahead);
672 }
673 }
674
675 return false;
676 }
677
678 if (valid_symbols[BARE_DOLLAR] && !in_error_recovery(valid_symbols) && scan_bare_dollar(lexer)) {
679 return true;
680 }
681
682regex:
683 if ((valid_symbols[REGEX] || valid_symbols[REGEX_NO_SLASH] || valid_symbols[REGEX_NO_SPACE]) &&
684 !in_error_recovery(valid_symbols)) {
685 if (valid_symbols[REGEX] || valid_symbols[REGEX_NO_SPACE]) {
686 while (iswspace(lexer->lookahead)) {
687 skip(lexer);
688 }
689 }
690
691 if ((lexer->lookahead != '"' && lexer->lookahead != '\'') ||
692 ((lexer->lookahead == '$' || lexer->lookahead == '\'') && valid_symbols[REGEX_NO_SLASH]) ||
693 (lexer->lookahead == '\'' && valid_symbols[REGEX_NO_SPACE])) {
694 typedef struct {
695 bool done;
696 bool advanced_once;
697 bool found_non_alnumdollarunderdash;
698 bool last_was_escape;
699 bool in_single_quote;
700 uint32_t paren_depth;
701 uint32_t bracket_depth;
702 uint32_t brace_depth;
703 } State;
704
705 if (lexer->lookahead == '$' && valid_symbols[REGEX_NO_SLASH]) {
706 lexer->mark_end(lexer);
707 advance(lexer);
708 if (lexer->lookahead == '(') {
709 return false;
710 }
711 }
712
713 lexer->mark_end(lexer);
714
715 State state = {false, false, false, false, false, 0, 0, 0};
716 while (!state.done) {
717 if (state.in_single_quote) {
718 if (lexer->lookahead == '\'') {
719 state.in_single_quote = false;
720 advance(lexer);
721 lexer->mark_end(lexer);
722 }
723 }
724 switch (lexer->lookahead) {
725 case '\\':
726 state.last_was_escape = true;
727 break;
728 case '\0':
729 return false;
730 case '(':
731 state.paren_depth++;
732 state.last_was_escape = false;
733 break;
734 case '[':
735 state.bracket_depth++;
736 state.last_was_escape = false;
737 break;
738 case '{':
739 if (!state.last_was_escape) {
740 state.brace_depth++;
741 }
742 state.last_was_escape = false;
743 break;
744 case ')':
745 if (state.paren_depth == 0) {
746 state.done = true;
747 }
748 state.paren_depth--;
749 state.last_was_escape = false;
750 break;
751 case ']':
752 if (state.bracket_depth == 0) {
753 state.done = true;
754 }
755 state.bracket_depth--;
756 state.last_was_escape = false;
757 break;
758 case '}':
759 if (state.brace_depth == 0) {
760 state.done = true;
761 }
762 state.brace_depth--;
763 state.last_was_escape = false;
764 break;
765 case '\'':
766 // Enter or exit a single-quoted string.
767 state.in_single_quote = !state.in_single_quote;
768 advance(lexer);
769 state.advanced_once = true;
770 state.last_was_escape = false;
771 continue;
772 default:
773 state.last_was_escape = false;
774 break;
775 }
776
777 if (!state.done) {
778 if (valid_symbols[REGEX]) {
779 bool was_space = !state.in_single_quote && iswspace(lexer->lookahead);
780 advance(lexer);
781 state.advanced_once = true;
782 if (!was_space || state.paren_depth > 0) {
783 lexer->mark_end(lexer);
784 }
785 } else if (valid_symbols[REGEX_NO_SLASH]) {
786 if (lexer->lookahead == '/') {
787 lexer->mark_end(lexer);
788 lexer->result_symbol = REGEX_NO_SLASH;
789 return state.advanced_once;
790 }
791 if (lexer->lookahead == '\\') {
792 advance(lexer);
793 state.advanced_once = true;
794 if (!lexer->eof(lexer) && lexer->lookahead != '[' && lexer->lookahead != '/') {
795 advance(lexer);
796 lexer->mark_end(lexer);
797 }
798 } else {
799 bool was_space = !state.in_single_quote && iswspace(lexer->lookahead);
800 advance(lexer);
801 state.advanced_once = true;
802 if (!was_space) {
803 lexer->mark_end(lexer);
804 }
805 }
806 } else if (valid_symbols[REGEX_NO_SPACE]) {
807 if (lexer->lookahead == '\\') {
808 state.found_non_alnumdollarunderdash = true;
809 advance(lexer);
810 if (!lexer->eof(lexer)) {
811 advance(lexer);
812 }
813 } else if (lexer->lookahead == '$') {
814 lexer->mark_end(lexer);
815 advance(lexer);
816 // do not parse a command
817 // substitution
818 if (lexer->lookahead == '(') {
819 return false;
820 }
821 // end $ always means regex, e.g.
822 // 99999999$
823 if (iswspace(lexer->lookahead)) {
824 lexer->result_symbol = REGEX_NO_SPACE;
825 lexer->mark_end(lexer);
826 return true;
827 }
828 } else {
829 bool was_space = !state.in_single_quote && iswspace(lexer->lookahead);
830 if (was_space && state.paren_depth == 0) {
831 lexer->mark_end(lexer);
832 lexer->result_symbol = REGEX_NO_SPACE;
833 return state.found_non_alnumdollarunderdash;
834 }
835 if (!iswalnum(lexer->lookahead) && lexer->lookahead != '$' && lexer->lookahead != '-' &&
836 lexer->lookahead != '_') {
837 state.found_non_alnumdollarunderdash = true;
838 }
839 advance(lexer);
840 }
841 }
842 }
843 }
844
845 lexer->result_symbol = valid_symbols[REGEX_NO_SLASH] ? REGEX_NO_SLASH
846 : valid_symbols[REGEX_NO_SPACE] ? REGEX_NO_SPACE
847 : REGEX;
848 if (valid_symbols[REGEX] && !state.advanced_once) {
849 return false;
850 }
851 return true;
852 }
853 }
854
855extglob_pattern:
856 if (valid_symbols[EXTGLOB_PATTERN] && !in_error_recovery(valid_symbols)) {
857 // first skip ws, then check for ? * + @ !
858 while (iswspace(lexer->lookahead)) {
859 skip(lexer);
860 }
861
862 if (lexer->lookahead == '?' || lexer->lookahead == '*' || lexer->lookahead == '+' || lexer->lookahead == '@' ||
863 lexer->lookahead == '!' || lexer->lookahead == '-' || lexer->lookahead == ')' || lexer->lookahead == '\\' ||
864 lexer->lookahead == '.' || lexer->lookahead == '[' || (iswalpha(lexer->lookahead))) {
865 if (lexer->lookahead == '\\') {
866 advance(lexer);
867 if ((iswspace(lexer->lookahead) || lexer->lookahead == '"') && lexer->lookahead != '\r' &&
868 lexer->lookahead != '\n') {
869 advance(lexer);
870 } else {
871 return false;
872 }
873 }
874
875 if (lexer->lookahead == ')' && scanner->last_glob_paren_depth == 0) {
876 lexer->mark_end(lexer);
877 advance(lexer);
878
879 if (iswspace(lexer->lookahead)) {
880 return false;
881 }
882 }
883
884 lexer->mark_end(lexer);
885 bool was_non_alpha = !iswalpha(lexer->lookahead);
886 if (lexer->lookahead != '[') {
887 // no esac
888 if (lexer->lookahead == 'e') {
889 lexer->mark_end(lexer);
890 advance(lexer);
891 if (lexer->lookahead == 's') {
892 advance(lexer);
893 if (lexer->lookahead == 'a') {
894 advance(lexer);
895 if (lexer->lookahead == 'c') {
896 advance(lexer);
897 if (iswspace(lexer->lookahead)) {
898 return false;
899 }
900 }
901 }
902 }
903 } else {
904 advance(lexer);
905 }
906 }
907
908 // -\w is just a word, find something else special
909 if (lexer->lookahead == '-') {
910 lexer->mark_end(lexer);
911 advance(lexer);
912 while (iswalnum(lexer->lookahead)) {
913 advance(lexer);
914 }
915
916 if (lexer->lookahead == ')' || lexer->lookahead == '\\' || lexer->lookahead == '.') {
917 return false;
918 }
919 lexer->mark_end(lexer);
920 }
921
922 // case item -) or *)
923 if (lexer->lookahead == ')' && scanner->last_glob_paren_depth == 0) {
924 lexer->mark_end(lexer);
925 advance(lexer);
926 if (iswspace(lexer->lookahead)) {
927 lexer->result_symbol = EXTGLOB_PATTERN;
928 return was_non_alpha;
929 }
930 }
931
932 if (iswspace(lexer->lookahead)) {
933 lexer->mark_end(lexer);
934 lexer->result_symbol = EXTGLOB_PATTERN;
935 scanner->last_glob_paren_depth = 0;
936 return true;
937 }
938
939 if (lexer->lookahead == '$') {
940 lexer->mark_end(lexer);
941 advance(lexer);
942 if (lexer->lookahead == '{' || lexer->lookahead == '(') {
943 lexer->result_symbol = EXTGLOB_PATTERN;
944 return true;
945 }
946 }
947
948 if (lexer->lookahead == '|') {
949 lexer->mark_end(lexer);
950 advance(lexer);
951 lexer->result_symbol = EXTGLOB_PATTERN;
952 return true;
953 }
954
955 if (!iswalnum(lexer->lookahead) && lexer->lookahead != '(' && lexer->lookahead != '"' &&
956 lexer->lookahead != '[' && lexer->lookahead != '?' && lexer->lookahead != '/' &&
957 lexer->lookahead != '\\' && lexer->lookahead != '_' && lexer->lookahead != '*') {
958 return false;
959 }
960
961 typedef struct {
962 bool done;
963 bool saw_non_alphadot;
964 uint32_t paren_depth;
965 uint32_t bracket_depth;
966 uint32_t brace_depth;
967 } State;
968
969 State state = {false, was_non_alpha, scanner->last_glob_paren_depth, 0, 0};
970 while (!state.done) {
971 switch (lexer->lookahead) {
972 case '\0':
973 return false;
974 case '(':
975 state.paren_depth++;
976 break;
977 case '[':
978 state.bracket_depth++;
979 break;
980 case '{':
981 state.brace_depth++;
982 break;
983 case ')':
984 if (state.paren_depth == 0) {
985 state.done = true;
986 }
987 state.paren_depth--;
988 break;
989 case ']':
990 if (state.bracket_depth == 0) {
991 state.done = true;
992 }
993 state.bracket_depth--;
994 break;
995 case '}':
996 if (state.brace_depth == 0) {
997 state.done = true;
998 }
999 state.brace_depth--;
1000 break;
1001 }
1002
1003 if (lexer->lookahead == '|') {
1004 lexer->mark_end(lexer);
1005 advance(lexer);
1006 if (state.paren_depth == 0 && state.bracket_depth == 0 && state.brace_depth == 0) {
1007 lexer->result_symbol = EXTGLOB_PATTERN;
1008 return true;
1009 }
1010 }
1011
1012 if (!state.done) {
1013 bool was_space = iswspace(lexer->lookahead);
1014 if (lexer->lookahead == '$') {
1015 lexer->mark_end(lexer);
1016 if (!iswalpha(lexer->lookahead) && lexer->lookahead != '.' && lexer->lookahead != '\\') {
1017 state.saw_non_alphadot = true;
1018 }
1019 advance(lexer);
1020 if (lexer->lookahead == '(' || lexer->lookahead == '{') {
1021 lexer->result_symbol = EXTGLOB_PATTERN;
1022 scanner->last_glob_paren_depth = state.paren_depth;
1023 return state.saw_non_alphadot;
1024 }
1025 }
1026 if (was_space) {
1027 lexer->mark_end(lexer);
1028 lexer->result_symbol = EXTGLOB_PATTERN;
1029 scanner->last_glob_paren_depth = 0;
1030 return state.saw_non_alphadot;
1031 }
1032 if (lexer->lookahead == '"') {
1033 lexer->mark_end(lexer);
1034 lexer->result_symbol = EXTGLOB_PATTERN;
1035 scanner->last_glob_paren_depth = 0;
1036 return state.saw_non_alphadot;
1037 }
1038 if (lexer->lookahead == '\\') {
1039 if (!iswalpha(lexer->lookahead) && lexer->lookahead != '.' && lexer->lookahead != '\\') {
1040 state.saw_non_alphadot = true;
1041 }
1042 advance(lexer);
1043 if (iswspace(lexer->lookahead) || lexer->lookahead == '"') {
1044 advance(lexer);
1045 }
1046 } else {
1047 if (!iswalpha(lexer->lookahead) && lexer->lookahead != '.' && lexer->lookahead != '\\') {
1048 state.saw_non_alphadot = true;
1049 }
1050 advance(lexer);
1051 }
1052 if (!was_space) {
1053 lexer->mark_end(lexer);
1054 }
1055 }
1056 }
1057
1058 lexer->result_symbol = EXTGLOB_PATTERN;
1059 scanner->last_glob_paren_depth = 0;
1060 return state.saw_non_alphadot;
1061 }
1062 scanner->last_glob_paren_depth = 0;
1063
1064 return false;
1065 }
1066
1067expansion_word:
1068 if (valid_symbols[EXPANSION_WORD]) {
1069 bool advanced_once = false;
1070 bool advance_once_space = false;
1071 for (;;) {
1072 if (lexer->lookahead == '\"') {
1073 return false;
1074 }
1075 if (lexer->lookahead == '$') {
1076 lexer->mark_end(lexer);
1077 advance(lexer);
1078 if (lexer->lookahead == '{' || lexer->lookahead == '(' || lexer->lookahead == '\'' ||
1079 iswalnum(lexer->lookahead)) {
1080 lexer->result_symbol = EXPANSION_WORD;
1081 return advanced_once;
1082 }
1083 advanced_once = true;
1084 }
1085
1086 if (lexer->lookahead == '}') {
1087 lexer->mark_end(lexer);
1088 lexer->result_symbol = EXPANSION_WORD;
1089 return advanced_once || advance_once_space;
1090 }
1091
1092 if (lexer->lookahead == '(' && !(advanced_once || advance_once_space)) {
1093 lexer->mark_end(lexer);
1094 advance(lexer);
1095 while (lexer->lookahead != ')' && !lexer->eof(lexer)) {
1096 // if we find a $( or ${ assume this is valid and is
1097 // a garbage concatenation of some weird word + an
1098 // expansion
1099 // I wonder where this can fail
1100 if (lexer->lookahead == '$') {
1101 lexer->mark_end(lexer);
1102 advance(lexer);
1103 if (lexer->lookahead == '{' || lexer->lookahead == '(' || lexer->lookahead == '\'' ||
1104 iswalnum(lexer->lookahead)) {
1105 lexer->result_symbol = EXPANSION_WORD;
1106 return advanced_once;
1107 }
1108 advanced_once = true;
1109 } else {
1110 advanced_once = advanced_once || !iswspace(lexer->lookahead);
1111 advance_once_space = advance_once_space || iswspace(lexer->lookahead);
1112 advance(lexer);
1113 }
1114 }
1115 lexer->mark_end(lexer);
1116 if (lexer->lookahead == ')') {
1117 advanced_once = true;
1118 advance(lexer);
1119 lexer->mark_end(lexer);
1120 if (lexer->lookahead == '}') {
1121 return false;
1122 }
1123 } else {
1124 return false;
1125 }
1126 }
1127
1128 if (lexer->lookahead == '\'') {
1129 return false;
1130 }
1131
1132 if (lexer->eof(lexer)) {
1133 return false;
1134 }
1135 advanced_once = advanced_once || !iswspace(lexer->lookahead);
1136 advance_once_space = advance_once_space || iswspace(lexer->lookahead);
1137 advance(lexer);
1138 }
1139 }
1140
1141brace_start:
1142 if (valid_symbols[BRACE_START] && !in_error_recovery(valid_symbols)) {
1143 while (iswspace(lexer->lookahead)) {
1144 skip(lexer);
1145 }
1146
1147 if (lexer->lookahead != '{') {
1148 return false;
1149 }
1150
1151 advance(lexer);
1152 lexer->mark_end(lexer);
1153
1154 while (isdigit(lexer->lookahead)) {
1155 advance(lexer);
1156 }
1157
1158 if (lexer->lookahead != '.') {
1159 return false;
1160 }
1161 advance(lexer);
1162
1163 if (lexer->lookahead != '.') {
1164 return false;
1165 }
1166 advance(lexer);
1167
1168 while (isdigit(lexer->lookahead)) {
1169 advance(lexer);
1170 }
1171
1172 if (lexer->lookahead != '}') {
1173 return false;
1174 }
1175
1176 lexer->result_symbol = BRACE_START;
1177 return true;
1178 }
1179
1180 return false;
1181}
1182
1183void *tree_sitter_bash_external_scanner_create() {
1184 Scanner *scanner = calloc(1, sizeof(Scanner));
1185 array_init(&scanner->heredocs);
1186 return scanner;
1187}
1188
1189bool tree_sitter_bash_external_scanner_scan(void *payload, TSLexer *lexer, const bool *valid_symbols) {
1190 Scanner *scanner = (Scanner *)payload;
1191 return scan(scanner, lexer, valid_symbols);
1192}
1193
1194unsigned tree_sitter_bash_external_scanner_serialize(void *payload, char *state) {
1195 Scanner *scanner = (Scanner *)payload;
1196 return serialize(scanner, state);
1197}
1198
1199void tree_sitter_bash_external_scanner_deserialize(void *payload, const char *state, unsigned length) {
1200 Scanner *scanner = (Scanner *)payload;
1201 deserialize(scanner, state, length);
1202}
1203
1204void tree_sitter_bash_external_scanner_destroy(void *payload) {
1205 Scanner *scanner = (Scanner *)payload;
1206 for (size_t i = 0; i < scanner->heredocs.size; i++) {
1207 Heredoc *heredoc = array_get(&scanner->heredocs, i);
1208 array_delete(&heredoc->current_leading_word);
1209 array_delete(&heredoc->delimiter);
1210 }
1211 array_delete(&scanner->heredocs);
1212 free(scanner);
1213}