1#include "../array.h"
   2#include "parser.h"
   3
   4#include <assert.h>
   5#include <ctype.h>
   6#include <string.h>
   7#include <wctype.h>
   8
/**
 * Symbols handled by this external scanner.
 *
 * Each enumerator is used both as an index into the `valid_symbols` array
 * handed to `scan` and as the value stored in `lexer->result_symbol`.
 *
 * NOTE(review): the enumerator order presumably has to mirror the `externals`
 * list of the grammar this scanner belongs to -- confirm before reordering.
 */
enum TokenType {
    HEREDOC_START,          // delimiter word of a heredoc (scan_heredoc_start)
    SIMPLE_HEREDOC_BODY,    // end type used while the heredoc is not yet "started"
    HEREDOC_BODY_BEGINNING, // middle type used while the heredoc is not yet "started"
    HEREDOC_CONTENT,        // middle type used once the heredoc is "started"
    HEREDOC_END,            // line matching the heredoc delimiter
    FILE_DESCRIPTOR,        // all-digit word directly followed by '<' or '>'
    EMPTY_VALUE,            // nothing before whitespace, EOF, ';' or '&'
    CONCAT,
    VARIABLE_NAME,
    TEST_OPERATOR,          // '-' plus letters, followed by whitespace
    REGEX,
    REGEX_NO_SLASH,
    REGEX_NO_SPACE,
    EXPANSION_WORD,
    EXTGLOB_PATTERN,
    BARE_DOLLAR,            // '$' followed by whitespace, EOF or '"'
    BRACE_START,
    IMMEDIATE_DOUBLE_HASH,  // "##" that is not directly followed by '}'
    EXTERNAL_EXPANSION_SYM_HASH,  // '#'
    EXTERNAL_EXPANSION_SYM_BANG,  // '!'
    EXTERNAL_EXPANSION_SYM_EQUAL, // '='
    CLOSING_BRACE,
    CLOSING_BRACKET,
    HEREDOC_ARROW,          // "<<"
    HEREDOC_ARROW_DASH,     // "<<-"
    NEWLINE,
    OPENING_PAREN,
    ESAC,
    ERROR_RECOVERY,         // queried by in_error_recovery()
};
  40
// Growable char buffer built with the `Array(...)` macro (presumably the one
// provided by "../array.h"); exposes `contents` and `size` as used below.
typedef Array(char) String;
  42
/** Book-keeping for a single heredoc that has been opened but not yet closed. */
typedef struct {
    // Set by scan_heredoc_start when the delimiter begins with ', " or \.
    // While set, '$' inside the body is consumed as ordinary content.
    bool is_raw;
    // Set by scan_heredoc_content once it has produced a "middle" token at a '$'.
    bool started;
    // Set when the heredoc was introduced with "<<-"; whitespace after a
    // newline is then consumed before looking for the delimiter.
    bool allows_indent;
    // Unquoted delimiter word, NUL-terminated by advance_word.
    String delimiter;
    // Scratch buffer filled by scan_heredoc_end_identifier with the start of
    // the current line, for comparison against `delimiter`.
    String current_leading_word;
} Heredoc;
  50
  51#define heredoc_new()                                                                                                  \
  52    {                                                                                                                  \
  53        .is_raw = false,                                                                                               \
  54        .started = false,                                                                                              \
  55        .allows_indent = false,                                                                                        \
  56        .delimiter = array_new(),                                                                                      \
  57        .current_leading_word = array_new(),                                                                           \
  58    };
  59
/**
 * Persistent state of the external scanner. Every field is written out by
 * `serialize` and restored by `deserialize`.
 */
typedef struct {
    // NOTE(review): the next three fields are only serialized/deserialized in
    // the code visible here; presumably they carry extglob scanning state.
    uint8_t last_glob_paren_depth;
    bool ext_was_in_double_quote;
    bool ext_saw_outside_quote;
    // Open heredocs; the scanning code always works on the last element.
    Array(Heredoc) heredocs;
} Scanner;
  66
  67static inline void advance(TSLexer *lexer) { lexer->advance(lexer, false); }
  68
  69static inline void skip(TSLexer *lexer) { lexer->advance(lexer, true); }
  70
  71static inline bool in_error_recovery(const bool *valid_symbols) { return valid_symbols[ERROR_RECOVERY]; }
  72
  73static inline void reset_string(String *string) {
  74    if (string->size > 0) {
  75        memset(string->contents, 0, string->size);
  76        array_clear(string);
  77    }
  78}
  79
  80static inline void reset_heredoc(Heredoc *heredoc) {
  81    heredoc->is_raw = false;
  82    heredoc->started = false;
  83    heredoc->allows_indent = false;
  84    reset_string(&heredoc->delimiter);
  85}
  86
  87static inline void reset(Scanner *scanner) {
  88    for (uint32_t i = 0; i < scanner->heredocs.size; i++) {
  89        reset_heredoc(array_get(&scanner->heredocs, i));
  90    }
  91}
  92
  93static unsigned serialize(Scanner *scanner, char *buffer) {
  94    uint32_t size = 0;
  95
  96    buffer[size++] = (char)scanner->last_glob_paren_depth;
  97    buffer[size++] = (char)scanner->ext_was_in_double_quote;
  98    buffer[size++] = (char)scanner->ext_saw_outside_quote;
  99    buffer[size++] = (char)scanner->heredocs.size;
 100
 101    for (uint32_t i = 0; i < scanner->heredocs.size; i++) {
 102        Heredoc *heredoc = array_get(&scanner->heredocs, i);
 103        if (heredoc->delimiter.size + 3 + size >= TREE_SITTER_SERIALIZATION_BUFFER_SIZE) {
 104            return 0;
 105        }
 106
 107        buffer[size++] = (char)heredoc->is_raw;
 108        buffer[size++] = (char)heredoc->started;
 109        buffer[size++] = (char)heredoc->allows_indent;
 110
 111        memcpy(&buffer[size], &heredoc->delimiter.size, sizeof(uint32_t));
 112        size += sizeof(uint32_t);
 113        memcpy(&buffer[size], heredoc->delimiter.contents, heredoc->delimiter.size);
 114        size += heredoc->delimiter.size;
 115    }
 116    return size;
 117}
 118
 119static void deserialize(Scanner *scanner, const char *buffer, unsigned length) {
 120    if (length == 0) {
 121        reset(scanner);
 122    } else {
 123        uint32_t size = 0;
 124        scanner->last_glob_paren_depth = buffer[size++];
 125        scanner->ext_was_in_double_quote = buffer[size++];
 126        scanner->ext_saw_outside_quote = buffer[size++];
 127        uint32_t heredoc_count = (unsigned char)buffer[size++];
 128        for (uint32_t i = 0; i < heredoc_count; i++) {
 129            Heredoc *heredoc = NULL;
 130            if (i < scanner->heredocs.size) {
 131                heredoc = array_get(&scanner->heredocs, i);
 132            } else {
 133                Heredoc new_heredoc = heredoc_new();
 134                array_push(&scanner->heredocs, new_heredoc);
 135                heredoc = array_back(&scanner->heredocs);
 136            }
 137
 138            heredoc->is_raw = buffer[size++];
 139            heredoc->started = buffer[size++];
 140            heredoc->allows_indent = buffer[size++];
 141
 142            memcpy(&heredoc->delimiter.size, &buffer[size], sizeof(uint32_t));
 143            size += sizeof(uint32_t);
 144            array_reserve(&heredoc->delimiter, heredoc->delimiter.size);
 145
 146            memcpy(heredoc->delimiter.contents, &buffer[size], heredoc->delimiter.size);
 147            size += heredoc->delimiter.size;
 148        }
 149        assert(size == length);
 150    }
 151}
 152
 153/**
 154 * Consume a "word" in POSIX parlance, and returns it unquoted.
 155 *
 156 * This is an approximate implementation that doesn't deal with any
 157 * POSIX-mandated substitution, and assumes the default value for
 158 * IFS.
 159 */
 160static bool advance_word(TSLexer *lexer, String *unquoted_word) {
 161    bool empty = true;
 162
 163    int32_t quote = 0;
 164    if (lexer->lookahead == '\'' || lexer->lookahead == '"') {
 165        quote = lexer->lookahead;
 166        advance(lexer);
 167    }
 168
 169    while (lexer->lookahead &&
 170           !(quote ? lexer->lookahead == quote || lexer->lookahead == '\r' || lexer->lookahead == '\n'
 171                   : iswspace(lexer->lookahead))) {
 172        if (lexer->lookahead == '\\') {
 173            advance(lexer);
 174            if (!lexer->lookahead) {
 175                return false;
 176            }
 177        }
 178        empty = false;
 179        array_push(unquoted_word, lexer->lookahead);
 180        advance(lexer);
 181    }
 182    array_push(unquoted_word, '\0');
 183
 184    if (quote && lexer->lookahead == quote) {
 185        advance(lexer);
 186    }
 187
 188    return !empty;
 189}
 190
 191static inline bool scan_bare_dollar(TSLexer *lexer) {
 192    while (iswspace(lexer->lookahead) && lexer->lookahead != '\n' && !lexer->eof(lexer)) {
 193        skip(lexer);
 194    }
 195
 196    if (lexer->lookahead == '$') {
 197        advance(lexer);
 198        lexer->result_symbol = BARE_DOLLAR;
 199        lexer->mark_end(lexer);
 200        return iswspace(lexer->lookahead) || lexer->eof(lexer) || lexer->lookahead == '\"';
 201    }
 202
 203    return false;
 204}
 205
 206static bool scan_heredoc_start(Heredoc *heredoc, TSLexer *lexer) {
 207    while (iswspace(lexer->lookahead)) {
 208        skip(lexer);
 209    }
 210
 211    lexer->result_symbol = HEREDOC_START;
 212    heredoc->is_raw = lexer->lookahead == '\'' || lexer->lookahead == '"' || lexer->lookahead == '\\';
 213
 214    bool found_delimiter = advance_word(lexer, &heredoc->delimiter);
 215    if (!found_delimiter) {
 216        reset_string(&heredoc->delimiter);
 217        return false;
 218    }
 219    return found_delimiter;
 220}
 221
 222static bool scan_heredoc_end_identifier(Heredoc *heredoc, TSLexer *lexer) {
 223    reset_string(&heredoc->current_leading_word);
 224    // Scan the first 'n' characters on this line, to see if they match the
 225    // heredoc delimiter
 226    int32_t size = 0;
 227    if (heredoc->delimiter.size > 0) {
 228        while (lexer->lookahead != '\0' && lexer->lookahead != '\n' &&
 229               (int32_t)*array_get(&heredoc->delimiter, size) == lexer->lookahead &&
 230               heredoc->current_leading_word.size < heredoc->delimiter.size) {
 231            array_push(&heredoc->current_leading_word, lexer->lookahead);
 232            advance(lexer);
 233            size++;
 234        }
 235    }
 236    array_push(&heredoc->current_leading_word, '\0');
 237    return heredoc->delimiter.size == 0
 238               ? false
 239               : strcmp(heredoc->current_leading_word.contents, heredoc->delimiter.contents) == 0;
 240}
 241
/**
 * Scan heredoc body text for the last heredoc held by the scanner.
 *
 * The function walks the input character by character until it can report a
 * token or has to give up.
 *
 * @param scanner     scanner state; its last heredoc is the one being scanned
 * @param lexer       tree-sitter lexer to read from
 * @param middle_type symbol reported when the token stops at a '$' expansion,
 *                    or at the delimiter once the heredoc is "started"
 * @param end_type    symbol reported when the delimiter (or end of input) is
 *                    reached without the heredoc having been "started"
 * @return true when `lexer->result_symbol` has been set to a recognized token
 */
static bool scan_heredoc_content(Scanner *scanner, TSLexer *lexer, enum TokenType middle_type,
                                 enum TokenType end_type) {
    // True once at least one character has been consumed or skipped.
    bool did_advance = false;
    Heredoc *heredoc = array_back(&scanner->heredocs);

    for (;;) {
        switch (lexer->lookahead) {
            case '\0': {
                // A NUL lookahead only ends the body at real end of input, and
                // only if something was consumed; the heredoc is then reset.
                if (lexer->eof(lexer) && did_advance) {
                    reset_heredoc(heredoc);
                    lexer->result_symbol = end_type;
                    return true;
                }
                return false;
            }

            case '\\': {
                // Consume the backslash together with the character after it.
                did_advance = true;
                advance(lexer);
                advance(lexer);
                break;
            }

            case '$': {
                // In a raw heredoc '$' is ordinary content.
                if (heredoc->is_raw) {
                    did_advance = true;
                    advance(lexer);
                    break;
                }
                if (did_advance) {
                    // Mark the token end in front of the '$', then peek at what
                    // follows: a letter, '{' or '(' ends the token here.
                    lexer->mark_end(lexer);
                    lexer->result_symbol = middle_type;
                    heredoc->started = true;
                    advance(lexer);
                    if (iswalpha(lexer->lookahead) || lexer->lookahead == '{' || lexer->lookahead == '(') {
                        return true;
                    }
                    break;
                }
                // Nothing consumed yet: only a body beginning at column 0 is
                // reported (as a token without any consumed characters).
                if (middle_type == HEREDOC_BODY_BEGINNING && lexer->get_column(lexer) == 0) {
                    lexer->result_symbol = middle_type;
                    heredoc->started = true;
                    return true;
                }
                return false;
            }

            case '\n': {
                // A newline before any content is skipped; otherwise it is
                // consumed like any other character.
                if (!did_advance) {
                    skip(lexer);
                } else {
                    advance(lexer);
                }
                did_advance = true;
                if (heredoc->allows_indent) {
                    while (iswspace(lexer->lookahead)) {
                        advance(lexer);
                    }
                }
                // Mark the token end before testing for the delimiter, so the
                // characters consumed by the test lie beyond the marked end.
                lexer->result_symbol = heredoc->started ? middle_type : end_type;
                lexer->mark_end(lexer);
                if (scan_heredoc_end_identifier(heredoc, lexer)) {
                    if (lexer->result_symbol == HEREDOC_END) {
                        array_pop(&scanner->heredocs);
                    }
                    return true;
                }
                break;
            }

            default: {
                if (lexer->get_column(lexer) == 0) {
                    // an alternative is to check the starting column of the
                    // heredoc body and track that statefully
                    while (iswspace(lexer->lookahead)) {
                        if (did_advance) {
                            advance(lexer);
                        } else {
                            skip(lexer);
                        }
                    }
                    // At the start of a line the delimiter may follow. Which
                    // symbol is reported depends on whether this call scans a
                    // simple body; note that mark_end is only called in the
                    // simple-body case.
                    if (end_type != SIMPLE_HEREDOC_BODY) {
                        lexer->result_symbol = middle_type;
                        if (scan_heredoc_end_identifier(heredoc, lexer)) {
                            return true;
                        }
                    }
                    if (end_type == SIMPLE_HEREDOC_BODY) {
                        lexer->result_symbol = end_type;
                        lexer->mark_end(lexer);
                        if (scan_heredoc_end_identifier(heredoc, lexer)) {
                            return true;
                        }
                    }
                }
                did_advance = true;
                advance(lexer);
                break;
            }
        }
    }
}
 344
 345static bool scan(Scanner *scanner, TSLexer *lexer, const bool *valid_symbols) {
 346    if (valid_symbols[CONCAT] && !in_error_recovery(valid_symbols)) {
 347        if (!(lexer->lookahead == 0 || iswspace(lexer->lookahead) || lexer->lookahead == '>' ||
 348              lexer->lookahead == '<' || lexer->lookahead == ')' || lexer->lookahead == '(' ||
 349              lexer->lookahead == ';' || lexer->lookahead == '&' || lexer->lookahead == '|' ||
 350              (lexer->lookahead == '}' && valid_symbols[CLOSING_BRACE]) ||
 351              (lexer->lookahead == ']' && valid_symbols[CLOSING_BRACKET]))) {
 352            lexer->result_symbol = CONCAT;
 353            // So for a`b`, we want to return a concat. We check if the
 354            // 2nd backtick has whitespace after it, and if it does we
 355            // return concat.
 356            if (lexer->lookahead == '`') {
 357                lexer->mark_end(lexer);
 358                advance(lexer);
 359                while (lexer->lookahead != '`' && !lexer->eof(lexer)) {
 360                    advance(lexer);
 361                }
 362                if (lexer->eof(lexer)) {
 363                    return false;
 364                }
 365                if (lexer->lookahead == '`') {
 366                    advance(lexer);
 367                }
 368                return iswspace(lexer->lookahead) || lexer->eof(lexer);
 369            }
 370            // strings w/ expansions that contains escaped quotes or
 371            // backslashes need this to return a concat
 372            if (lexer->lookahead == '\\') {
 373                lexer->mark_end(lexer);
 374                advance(lexer);
 375                if (lexer->lookahead == '"' || lexer->lookahead == '\'' || lexer->lookahead == '\\') {
 376                    return true;
 377                }
 378                if (lexer->eof(lexer)) {
 379                    return false;
 380                }
 381            } else {
 382                return true;
 383            }
 384        }
 385        if (iswspace(lexer->lookahead) && valid_symbols[CLOSING_BRACE] && !valid_symbols[EXPANSION_WORD]) {
 386            lexer->result_symbol = CONCAT;
 387            return true;
 388        }
 389    }
 390
 391    if (valid_symbols[IMMEDIATE_DOUBLE_HASH] && !in_error_recovery(valid_symbols)) {
 392        // advance two # and ensure not } after
 393        if (lexer->lookahead == '#') {
 394            lexer->mark_end(lexer);
 395            advance(lexer);
 396            if (lexer->lookahead == '#') {
 397                advance(lexer);
 398                if (lexer->lookahead != '}') {
 399                    lexer->result_symbol = IMMEDIATE_DOUBLE_HASH;
 400                    lexer->mark_end(lexer);
 401                    return true;
 402                }
 403            }
 404        }
 405    }
 406
 407    if (valid_symbols[EXTERNAL_EXPANSION_SYM_HASH] && !in_error_recovery(valid_symbols)) {
 408        if (lexer->lookahead == '#' || lexer->lookahead == '=' || lexer->lookahead == '!') {
 409            lexer->result_symbol = lexer->lookahead == '#'   ? EXTERNAL_EXPANSION_SYM_HASH
 410                                   : lexer->lookahead == '!' ? EXTERNAL_EXPANSION_SYM_BANG
 411                                                             : EXTERNAL_EXPANSION_SYM_EQUAL;
 412            advance(lexer);
 413            lexer->mark_end(lexer);
 414            while (lexer->lookahead == '#' || lexer->lookahead == '=' || lexer->lookahead == '!') {
 415                advance(lexer);
 416            }
 417            while (iswspace(lexer->lookahead)) {
 418                skip(lexer);
 419            }
 420            if (lexer->lookahead == '}') {
 421                return true;
 422            }
 423            return false;
 424        }
 425    }
 426
 427    if (valid_symbols[EMPTY_VALUE]) {
 428        if (iswspace(lexer->lookahead) || lexer->eof(lexer) || lexer->lookahead == ';' || lexer->lookahead == '&') {
 429            lexer->result_symbol = EMPTY_VALUE;
 430            return true;
 431        }
 432    }
 433
 434    if ((valid_symbols[HEREDOC_BODY_BEGINNING] || valid_symbols[SIMPLE_HEREDOC_BODY]) && scanner->heredocs.size > 0 &&
 435        !array_back(&scanner->heredocs)->started && !in_error_recovery(valid_symbols)) {
 436        return scan_heredoc_content(scanner, lexer, HEREDOC_BODY_BEGINNING, SIMPLE_HEREDOC_BODY);
 437    }
 438
 439    if (valid_symbols[HEREDOC_END] && scanner->heredocs.size > 0) {
 440        Heredoc *heredoc = array_back(&scanner->heredocs);
 441        if (scan_heredoc_end_identifier(heredoc, lexer)) {
 442            array_delete(&heredoc->current_leading_word);
 443            array_delete(&heredoc->delimiter);
 444            array_pop(&scanner->heredocs);
 445            lexer->result_symbol = HEREDOC_END;
 446            return true;
 447        }
 448    }
 449
 450    if (valid_symbols[HEREDOC_CONTENT] && scanner->heredocs.size > 0 && array_back(&scanner->heredocs)->started &&
 451        !in_error_recovery(valid_symbols)) {
 452        return scan_heredoc_content(scanner, lexer, HEREDOC_CONTENT, HEREDOC_END);
 453    }
 454
 455    if (valid_symbols[HEREDOC_START] && !in_error_recovery(valid_symbols) && scanner->heredocs.size > 0) {
 456        return scan_heredoc_start(array_back(&scanner->heredocs), lexer);
 457    }
 458
 459    if (valid_symbols[TEST_OPERATOR] && !valid_symbols[EXPANSION_WORD]) {
 460        while (iswspace(lexer->lookahead) && lexer->lookahead != '\n') {
 461            skip(lexer);
 462        }
 463
 464        if (lexer->lookahead == '\\') {
 465            if (valid_symbols[EXTGLOB_PATTERN]) {
 466                goto extglob_pattern;
 467            }
 468            if (valid_symbols[REGEX_NO_SPACE]) {
 469                goto regex;
 470            }
 471            skip(lexer);
 472
 473            if (lexer->eof(lexer)) {
 474                return false;
 475            }
 476
 477            if (lexer->lookahead == '\r') {
 478                skip(lexer);
 479                if (lexer->lookahead == '\n') {
 480                    skip(lexer);
 481                }
 482            } else if (lexer->lookahead == '\n') {
 483                skip(lexer);
 484            } else {
 485                return false;
 486            }
 487
 488            while (iswspace(lexer->lookahead)) {
 489                skip(lexer);
 490            }
 491        }
 492
 493        if (lexer->lookahead == '\n' && !valid_symbols[NEWLINE]) {
 494            skip(lexer);
 495
 496            while (iswspace(lexer->lookahead)) {
 497                skip(lexer);
 498            }
 499        }
 500
 501        if (lexer->lookahead == '-') {
 502            advance(lexer);
 503
 504            bool advanced_once = false;
 505            while (iswalpha(lexer->lookahead)) {
 506                advanced_once = true;
 507                advance(lexer);
 508            }
 509
 510            if (iswspace(lexer->lookahead) && advanced_once) {
 511                lexer->mark_end(lexer);
 512                advance(lexer);
 513                if (lexer->lookahead == '}' && valid_symbols[CLOSING_BRACE]) {
 514                    if (valid_symbols[EXPANSION_WORD]) {
 515                        lexer->mark_end(lexer);
 516                        lexer->result_symbol = EXPANSION_WORD;
 517                        return true;
 518                    }
 519                    return false;
 520                }
 521                lexer->result_symbol = TEST_OPERATOR;
 522                return true;
 523            }
 524            if (iswspace(lexer->lookahead) && valid_symbols[EXTGLOB_PATTERN]) {
 525                lexer->result_symbol = EXTGLOB_PATTERN;
 526                return true;
 527            }
 528        }
 529
 530        if (valid_symbols[BARE_DOLLAR] && !in_error_recovery(valid_symbols) && scan_bare_dollar(lexer)) {
 531            return true;
 532        }
 533    }
 534
 535    if ((valid_symbols[VARIABLE_NAME] || valid_symbols[FILE_DESCRIPTOR] || valid_symbols[HEREDOC_ARROW]) &&
 536        !valid_symbols[REGEX_NO_SLASH] && !in_error_recovery(valid_symbols)) {
 537        for (;;) {
 538            if ((lexer->lookahead == ' ' || lexer->lookahead == '\t' || lexer->lookahead == '\r' ||
 539                 (lexer->lookahead == '\n' && !valid_symbols[NEWLINE])) &&
 540                !valid_symbols[EXPANSION_WORD]) {
 541                skip(lexer);
 542            } else if (lexer->lookahead == '\\') {
 543                skip(lexer);
 544
 545                if (lexer->eof(lexer)) {
 546                    lexer->mark_end(lexer);
 547                    lexer->result_symbol = VARIABLE_NAME;
 548                    return true;
 549                }
 550
 551                if (lexer->lookahead == '\r') {
 552                    skip(lexer);
 553                }
 554                if (lexer->lookahead == '\n') {
 555                    skip(lexer);
 556                } else {
 557                    if (lexer->lookahead == '\\' && valid_symbols[EXPANSION_WORD]) {
 558                        goto expansion_word;
 559                    }
 560                    return false;
 561                }
 562            } else {
 563                break;
 564            }
 565        }
 566
 567        // no '*', '@', '?', '-', '$', '0', '_'
 568        if (!valid_symbols[EXPANSION_WORD] &&
 569            (lexer->lookahead == '*' || lexer->lookahead == '@' || lexer->lookahead == '?' || lexer->lookahead == '-' ||
 570             lexer->lookahead == '0' || lexer->lookahead == '_')) {
 571            lexer->mark_end(lexer);
 572            advance(lexer);
 573            if (lexer->lookahead == '=' || lexer->lookahead == '[' || lexer->lookahead == ':' ||
 574                lexer->lookahead == '-' || lexer->lookahead == '%' || lexer->lookahead == '#' ||
 575                lexer->lookahead == '/') {
 576                return false;
 577            }
 578            if (valid_symbols[EXTGLOB_PATTERN] && iswspace(lexer->lookahead)) {
 579                lexer->mark_end(lexer);
 580                lexer->result_symbol = EXTGLOB_PATTERN;
 581                return true;
 582            }
 583        }
 584
 585        if (valid_symbols[HEREDOC_ARROW] && lexer->lookahead == '<') {
 586            advance(lexer);
 587            if (lexer->lookahead == '<') {
 588                advance(lexer);
 589                if (lexer->lookahead == '-') {
 590                    advance(lexer);
 591                    Heredoc heredoc = heredoc_new();
 592                    heredoc.allows_indent = true;
 593                    array_push(&scanner->heredocs, heredoc);
 594                    lexer->result_symbol = HEREDOC_ARROW_DASH;
 595                } else if (lexer->lookahead == '<' || lexer->lookahead == '=') {
 596                    return false;
 597                } else {
 598                    Heredoc heredoc = heredoc_new();
 599                    array_push(&scanner->heredocs, heredoc);
 600                    lexer->result_symbol = HEREDOC_ARROW;
 601                }
 602                return true;
 603            }
 604            return false;
 605        }
 606
 607        bool is_number = true;
 608        if (iswdigit(lexer->lookahead)) {
 609            advance(lexer);
 610        } else if (iswalpha(lexer->lookahead) || lexer->lookahead == '_') {
 611            is_number = false;
 612            advance(lexer);
 613        } else {
 614            if (lexer->lookahead == '{') {
 615                goto brace_start;
 616            }
 617            if (valid_symbols[EXPANSION_WORD]) {
 618                goto expansion_word;
 619            }
 620            if (valid_symbols[EXTGLOB_PATTERN]) {
 621                goto extglob_pattern;
 622            }
 623            return false;
 624        }
 625
 626        for (;;) {
 627            if (iswdigit(lexer->lookahead)) {
 628                advance(lexer);
 629            } else if (iswalpha(lexer->lookahead) || lexer->lookahead == '_') {
 630                is_number = false;
 631                advance(lexer);
 632            } else {
 633                break;
 634            }
 635        }
 636
 637        if (is_number && valid_symbols[FILE_DESCRIPTOR] && (lexer->lookahead == '>' || lexer->lookahead == '<')) {
 638            lexer->result_symbol = FILE_DESCRIPTOR;
 639            return true;
 640        }
 641
 642        if (valid_symbols[VARIABLE_NAME]) {
 643            if (lexer->lookahead == '+') {
 644                lexer->mark_end(lexer);
 645                advance(lexer);
 646                if (lexer->lookahead == '=' || lexer->lookahead == ':' || valid_symbols[CLOSING_BRACE]) {
 647                    lexer->result_symbol = VARIABLE_NAME;
 648                    return true;
 649                }
 650                return false;
 651            }
 652            if (lexer->lookahead == '/') {
 653                return false;
 654            }
 655            if (lexer->lookahead == '=' || lexer->lookahead == '[' ||
 656                (lexer->lookahead == ':' && !valid_symbols[CLOSING_BRACE] &&
 657                 !valid_symbols[OPENING_PAREN]) || // TODO(amaanq): more cases for regular word chars but not variable
 658                                                   // names for function words, only handling : for now? #235
 659                lexer->lookahead == '%' ||
 660                (lexer->lookahead == '#' && !is_number) || lexer->lookahead == '@' ||
 661                (lexer->lookahead == '-' && valid_symbols[CLOSING_BRACE])) {
 662                lexer->mark_end(lexer);
 663                lexer->result_symbol = VARIABLE_NAME;
 664                return true;
 665            }
 666
 667            if (lexer->lookahead == '?') {
 668                lexer->mark_end(lexer);
 669                advance(lexer);
 670                lexer->result_symbol = VARIABLE_NAME;
 671                return iswalpha(lexer->lookahead);
 672            }
 673        }
 674
 675        return false;
 676    }
 677
 678    if (valid_symbols[BARE_DOLLAR] && !in_error_recovery(valid_symbols) && scan_bare_dollar(lexer)) {
 679        return true;
 680    }
 681
 682regex:
 683    if ((valid_symbols[REGEX] || valid_symbols[REGEX_NO_SLASH] || valid_symbols[REGEX_NO_SPACE]) &&
 684        !in_error_recovery(valid_symbols)) {
 685        if (valid_symbols[REGEX] || valid_symbols[REGEX_NO_SPACE]) {
 686            while (iswspace(lexer->lookahead)) {
 687                skip(lexer);
 688            }
 689        }
 690
 691        if ((lexer->lookahead != '"' && lexer->lookahead != '\'') ||
 692            ((lexer->lookahead == '$' || lexer->lookahead == '\'') && valid_symbols[REGEX_NO_SLASH]) ||
 693            (lexer->lookahead == '\'' && valid_symbols[REGEX_NO_SPACE])) {
 694            typedef struct {
 695                bool done;
 696                bool advanced_once;
 697                bool found_non_alnumdollarunderdash;
 698                bool last_was_escape;
 699                bool in_single_quote;
 700                uint32_t paren_depth;
 701                uint32_t bracket_depth;
 702                uint32_t brace_depth;
 703            } State;
 704
 705            if (lexer->lookahead == '$' && valid_symbols[REGEX_NO_SLASH]) {
 706                lexer->mark_end(lexer);
 707                advance(lexer);
 708                if (lexer->lookahead == '(') {
 709                    return false;
 710                }
 711            }
 712
 713            lexer->mark_end(lexer);
 714
 715            State state = {false, false, false, false, false, 0, 0, 0};
 716            while (!state.done) {
 717                if (state.in_single_quote) {
 718                    if (lexer->lookahead == '\'') {
 719                        state.in_single_quote = false;
 720                        advance(lexer);
 721                        lexer->mark_end(lexer);
 722                    }
 723                }
 724                switch (lexer->lookahead) {
 725                    case '\\':
 726                        state.last_was_escape = true;
 727                        break;
 728                    case '\0':
 729                        return false;
 730                    case '(':
 731                        state.paren_depth++;
 732                        state.last_was_escape = false;
 733                        break;
 734                    case '[':
 735                        state.bracket_depth++;
 736                        state.last_was_escape = false;
 737                        break;
 738                    case '{':
 739                        if (!state.last_was_escape) {
 740                            state.brace_depth++;
 741                        }
 742                        state.last_was_escape = false;
 743                        break;
 744                    case ')':
 745                        if (state.paren_depth == 0) {
 746                            state.done = true;
 747                        }
 748                        state.paren_depth--;
 749                        state.last_was_escape = false;
 750                        break;
 751                    case ']':
 752                        if (state.bracket_depth == 0) {
 753                            state.done = true;
 754                        }
 755                        state.bracket_depth--;
 756                        state.last_was_escape = false;
 757                        break;
 758                    case '}':
 759                        if (state.brace_depth == 0) {
 760                            state.done = true;
 761                        }
 762                        state.brace_depth--;
 763                        state.last_was_escape = false;
 764                        break;
 765                    case '\'':
 766                        // Enter or exit a single-quoted string.
 767                        state.in_single_quote = !state.in_single_quote;
 768                        advance(lexer);
 769                        state.advanced_once = true;
 770                        state.last_was_escape = false;
 771                        continue;
 772                    default:
 773                        state.last_was_escape = false;
 774                        break;
 775                }
 776
 777                if (!state.done) {
 778                    if (valid_symbols[REGEX]) {
 779                        bool was_space = !state.in_single_quote && iswspace(lexer->lookahead);
 780                        advance(lexer);
 781                        state.advanced_once = true;
 782                        if (!was_space || state.paren_depth > 0) {
 783                            lexer->mark_end(lexer);
 784                        }
 785                    } else if (valid_symbols[REGEX_NO_SLASH]) {
 786                        if (lexer->lookahead == '/') {
 787                            lexer->mark_end(lexer);
 788                            lexer->result_symbol = REGEX_NO_SLASH;
 789                            return state.advanced_once;
 790                        }
 791                        if (lexer->lookahead == '\\') {
 792                            advance(lexer);
 793                            state.advanced_once = true;
 794                            if (!lexer->eof(lexer) && lexer->lookahead != '[' && lexer->lookahead != '/') {
 795                                advance(lexer);
 796                                lexer->mark_end(lexer);
 797                            }
 798                        } else {
 799                            bool was_space = !state.in_single_quote && iswspace(lexer->lookahead);
 800                            advance(lexer);
 801                            state.advanced_once = true;
 802                            if (!was_space) {
 803                                lexer->mark_end(lexer);
 804                            }
 805                        }
 806                    } else if (valid_symbols[REGEX_NO_SPACE]) {
 807                        if (lexer->lookahead == '\\') {
 808                            state.found_non_alnumdollarunderdash = true;
 809                            advance(lexer);
 810                            if (!lexer->eof(lexer)) {
 811                                advance(lexer);
 812                            }
 813                        } else if (lexer->lookahead == '$') {
 814                            lexer->mark_end(lexer);
 815                            advance(lexer);
 816                            // do not parse a command
 817                            // substitution
 818                            if (lexer->lookahead == '(') {
 819                                return false;
 820                            }
 821                            // end $ always means regex, e.g.
 822                            // 99999999$
 823                            if (iswspace(lexer->lookahead)) {
 824                                lexer->result_symbol = REGEX_NO_SPACE;
 825                                lexer->mark_end(lexer);
 826                                return true;
 827                            }
 828                        } else {
 829                            bool was_space = !state.in_single_quote && iswspace(lexer->lookahead);
 830                            if (was_space && state.paren_depth == 0) {
 831                                lexer->mark_end(lexer);
 832                                lexer->result_symbol = REGEX_NO_SPACE;
 833                                return state.found_non_alnumdollarunderdash;
 834                            }
 835                            if (!iswalnum(lexer->lookahead) && lexer->lookahead != '$' && lexer->lookahead != '-' &&
 836                                lexer->lookahead != '_') {
 837                                state.found_non_alnumdollarunderdash = true;
 838                            }
 839                            advance(lexer);
 840                        }
 841                    }
 842                }
 843            }
 844
 845            lexer->result_symbol = valid_symbols[REGEX_NO_SLASH]   ? REGEX_NO_SLASH
 846                                   : valid_symbols[REGEX_NO_SPACE] ? REGEX_NO_SPACE
 847                                                                   : REGEX;
 848            if (valid_symbols[REGEX] && !state.advanced_once) {
 849                return false;
 850            }
 851            return true;
 852        }
 853    }
 854
 855extglob_pattern:
 856    if (valid_symbols[EXTGLOB_PATTERN] && !in_error_recovery(valid_symbols)) {
 857        // first skip ws, then check for ? * + @ !
 858        while (iswspace(lexer->lookahead)) {
 859            skip(lexer);
 860        }
 861
 862        if (lexer->lookahead == '?' || lexer->lookahead == '*' || lexer->lookahead == '+' || lexer->lookahead == '@' ||
 863            lexer->lookahead == '!' || lexer->lookahead == '-' || lexer->lookahead == ')' || lexer->lookahead == '\\' ||
 864            lexer->lookahead == '.' || lexer->lookahead == '[' || (iswalpha(lexer->lookahead))) {
 865            if (lexer->lookahead == '\\') {
 866                advance(lexer);
 867                if ((iswspace(lexer->lookahead) || lexer->lookahead == '"') && lexer->lookahead != '\r' &&
 868                    lexer->lookahead != '\n') {
 869                    advance(lexer);
 870                } else {
 871                    return false;
 872                }
 873            }
 874
 875            if (lexer->lookahead == ')' && scanner->last_glob_paren_depth == 0) {
 876                lexer->mark_end(lexer);
 877                advance(lexer);
 878
 879                if (iswspace(lexer->lookahead)) {
 880                    return false;
 881                }
 882            }
 883
 884            lexer->mark_end(lexer);
 885            bool was_non_alpha = !iswalpha(lexer->lookahead);
 886            if (lexer->lookahead != '[') {
 887                // no esac
 888                if (lexer->lookahead == 'e') {
 889                    lexer->mark_end(lexer);
 890                    advance(lexer);
 891                    if (lexer->lookahead == 's') {
 892                        advance(lexer);
 893                        if (lexer->lookahead == 'a') {
 894                            advance(lexer);
 895                            if (lexer->lookahead == 'c') {
 896                                advance(lexer);
 897                                if (iswspace(lexer->lookahead)) {
 898                                    return false;
 899                                }
 900                            }
 901                        }
 902                    }
 903                } else {
 904                    advance(lexer);
 905                }
 906            }
 907
 908            // -\w is just a word, find something else special
 909            if (lexer->lookahead == '-') {
 910                lexer->mark_end(lexer);
 911                advance(lexer);
 912                while (iswalnum(lexer->lookahead)) {
 913                    advance(lexer);
 914                }
 915
 916                if (lexer->lookahead == ')' || lexer->lookahead == '\\' || lexer->lookahead == '.') {
 917                    return false;
 918                }
 919                lexer->mark_end(lexer);
 920            }
 921
 922            // case item -) or *)
 923            if (lexer->lookahead == ')' && scanner->last_glob_paren_depth == 0) {
 924                lexer->mark_end(lexer);
 925                advance(lexer);
 926                if (iswspace(lexer->lookahead)) {
 927                    lexer->result_symbol = EXTGLOB_PATTERN;
 928                    return was_non_alpha;
 929                }
 930            }
 931
 932            if (iswspace(lexer->lookahead)) {
 933                lexer->mark_end(lexer);
 934                lexer->result_symbol = EXTGLOB_PATTERN;
 935                scanner->last_glob_paren_depth = 0;
 936                return true;
 937            }
 938
 939            if (lexer->lookahead == '$') {
 940                lexer->mark_end(lexer);
 941                advance(lexer);
 942                if (lexer->lookahead == '{' || lexer->lookahead == '(') {
 943                    lexer->result_symbol = EXTGLOB_PATTERN;
 944                    return true;
 945                }
 946            }
 947
 948            if (lexer->lookahead == '|') {
 949                lexer->mark_end(lexer);
 950                advance(lexer);
 951                lexer->result_symbol = EXTGLOB_PATTERN;
 952                return true;
 953            }
 954
 955            if (!iswalnum(lexer->lookahead) && lexer->lookahead != '(' && lexer->lookahead != '"' &&
 956                lexer->lookahead != '[' && lexer->lookahead != '?' && lexer->lookahead != '/' &&
 957                lexer->lookahead != '\\' && lexer->lookahead != '_' && lexer->lookahead != '*') {
 958                return false;
 959            }
 960
 961            typedef struct {
 962                bool done;
 963                bool saw_non_alphadot;
 964                uint32_t paren_depth;
 965                uint32_t bracket_depth;
 966                uint32_t brace_depth;
 967            } State;
 968
 969            State state = {false, was_non_alpha, scanner->last_glob_paren_depth, 0, 0};
 970            while (!state.done) {
 971                switch (lexer->lookahead) {
 972                    case '\0':
 973                        return false;
 974                    case '(':
 975                        state.paren_depth++;
 976                        break;
 977                    case '[':
 978                        state.bracket_depth++;
 979                        break;
 980                    case '{':
 981                        state.brace_depth++;
 982                        break;
 983                    case ')':
 984                        if (state.paren_depth == 0) {
 985                            state.done = true;
 986                        }
 987                        state.paren_depth--;
 988                        break;
 989                    case ']':
 990                        if (state.bracket_depth == 0) {
 991                            state.done = true;
 992                        }
 993                        state.bracket_depth--;
 994                        break;
 995                    case '}':
 996                        if (state.brace_depth == 0) {
 997                            state.done = true;
 998                        }
 999                        state.brace_depth--;
1000                        break;
1001                }
1002
1003                if (lexer->lookahead == '|') {
1004                    lexer->mark_end(lexer);
1005                    advance(lexer);
1006                    if (state.paren_depth == 0 && state.bracket_depth == 0 && state.brace_depth == 0) {
1007                        lexer->result_symbol = EXTGLOB_PATTERN;
1008                        return true;
1009                    }
1010                }
1011
1012                if (!state.done) {
1013                    bool was_space = iswspace(lexer->lookahead);
1014                    if (lexer->lookahead == '$') {
1015                        lexer->mark_end(lexer);
1016                        if (!iswalpha(lexer->lookahead) && lexer->lookahead != '.' && lexer->lookahead != '\\') {
1017                            state.saw_non_alphadot = true;
1018                        }
1019                        advance(lexer);
1020                        if (lexer->lookahead == '(' || lexer->lookahead == '{') {
1021                            lexer->result_symbol = EXTGLOB_PATTERN;
1022                            scanner->last_glob_paren_depth = state.paren_depth;
1023                            return state.saw_non_alphadot;
1024                        }
1025                    }
1026                    if (was_space) {
1027                        lexer->mark_end(lexer);
1028                        lexer->result_symbol = EXTGLOB_PATTERN;
1029                        scanner->last_glob_paren_depth = 0;
1030                        return state.saw_non_alphadot;
1031                    }
1032                    if (lexer->lookahead == '"') {
1033                        lexer->mark_end(lexer);
1034                        lexer->result_symbol = EXTGLOB_PATTERN;
1035                        scanner->last_glob_paren_depth = 0;
1036                        return state.saw_non_alphadot;
1037                    }
1038                    if (lexer->lookahead == '\\') {
1039                        if (!iswalpha(lexer->lookahead) && lexer->lookahead != '.' && lexer->lookahead != '\\') {
1040                            state.saw_non_alphadot = true;
1041                        }
1042                        advance(lexer);
1043                        if (iswspace(lexer->lookahead) || lexer->lookahead == '"') {
1044                            advance(lexer);
1045                        }
1046                    } else {
1047                        if (!iswalpha(lexer->lookahead) && lexer->lookahead != '.' && lexer->lookahead != '\\') {
1048                            state.saw_non_alphadot = true;
1049                        }
1050                        advance(lexer);
1051                    }
1052                    if (!was_space) {
1053                        lexer->mark_end(lexer);
1054                    }
1055                }
1056            }
1057
1058            lexer->result_symbol = EXTGLOB_PATTERN;
1059            scanner->last_glob_paren_depth = 0;
1060            return state.saw_non_alphadot;
1061        }
1062        scanner->last_glob_paren_depth = 0;
1063
1064        return false;
1065    }
1066
1067expansion_word:
1068    if (valid_symbols[EXPANSION_WORD]) {
1069        bool advanced_once = false;
1070        bool advance_once_space = false;
1071        for (;;) {
1072            if (lexer->lookahead == '\"') {
1073                return false;
1074            }
1075            if (lexer->lookahead == '$') {
1076                lexer->mark_end(lexer);
1077                advance(lexer);
1078                if (lexer->lookahead == '{' || lexer->lookahead == '(' || lexer->lookahead == '\'' ||
1079                    iswalnum(lexer->lookahead)) {
1080                    lexer->result_symbol = EXPANSION_WORD;
1081                    return advanced_once;
1082                }
1083                advanced_once = true;
1084            }
1085
1086            if (lexer->lookahead == '}') {
1087                lexer->mark_end(lexer);
1088                lexer->result_symbol = EXPANSION_WORD;
1089                return advanced_once || advance_once_space;
1090            }
1091
1092            if (lexer->lookahead == '(' && !(advanced_once || advance_once_space)) {
1093                lexer->mark_end(lexer);
1094                advance(lexer);
1095                while (lexer->lookahead != ')' && !lexer->eof(lexer)) {
1096                    // if we find a $( or ${ assume this is valid and is
1097                    // a garbage concatenation of some weird word + an
1098                    // expansion
1099                    // I wonder where this can fail
1100                    if (lexer->lookahead == '$') {
1101                        lexer->mark_end(lexer);
1102                        advance(lexer);
1103                        if (lexer->lookahead == '{' || lexer->lookahead == '(' || lexer->lookahead == '\'' ||
1104                            iswalnum(lexer->lookahead)) {
1105                            lexer->result_symbol = EXPANSION_WORD;
1106                            return advanced_once;
1107                        }
1108                        advanced_once = true;
1109                    } else {
1110                        advanced_once = advanced_once || !iswspace(lexer->lookahead);
1111                        advance_once_space = advance_once_space || iswspace(lexer->lookahead);
1112                        advance(lexer);
1113                    }
1114                }
1115                lexer->mark_end(lexer);
1116                if (lexer->lookahead == ')') {
1117                    advanced_once = true;
1118                    advance(lexer);
1119                    lexer->mark_end(lexer);
1120                    if (lexer->lookahead == '}') {
1121                        return false;
1122                    }
1123                } else {
1124                    return false;
1125                }
1126            }
1127
1128            if (lexer->lookahead == '\'') {
1129                return false;
1130            }
1131
1132            if (lexer->eof(lexer)) {
1133                return false;
1134            }
1135            advanced_once = advanced_once || !iswspace(lexer->lookahead);
1136            advance_once_space = advance_once_space || iswspace(lexer->lookahead);
1137            advance(lexer);
1138        }
1139    }
1140
1141brace_start:
1142    if (valid_symbols[BRACE_START] && !in_error_recovery(valid_symbols)) {
1143        while (iswspace(lexer->lookahead)) {
1144            skip(lexer);
1145        }
1146
1147        if (lexer->lookahead != '{') {
1148            return false;
1149        }
1150
1151        advance(lexer);
1152        lexer->mark_end(lexer);
1153
1154        while (isdigit(lexer->lookahead)) {
1155            advance(lexer);
1156        }
1157
1158        if (lexer->lookahead != '.') {
1159            return false;
1160        }
1161        advance(lexer);
1162
1163        if (lexer->lookahead != '.') {
1164            return false;
1165        }
1166        advance(lexer);
1167
1168        while (isdigit(lexer->lookahead)) {
1169            advance(lexer);
1170        }
1171
1172        if (lexer->lookahead != '}') {
1173            return false;
1174        }
1175
1176        lexer->result_symbol = BRACE_START;
1177        return true;
1178    }
1179
1180    return false;
1181}
1182
1183void *tree_sitter_bash_external_scanner_create() {
1184    Scanner *scanner = calloc(1, sizeof(Scanner));
1185    array_init(&scanner->heredocs);
1186    return scanner;
1187}
1188
1189bool tree_sitter_bash_external_scanner_scan(void *payload, TSLexer *lexer, const bool *valid_symbols) {
1190    Scanner *scanner = (Scanner *)payload;
1191    return scan(scanner, lexer, valid_symbols);
1192}
1193
/*
 * Serialize entry point: forwards to serialize() with `payload` used as the
 * Scanner and `state` as the destination buffer.
 *
 * Returns the unsigned value produced by serialize().
 */
unsigned tree_sitter_bash_external_scanner_serialize(void *payload, char *state) {
    // void * converts implicitly to the Scanner * that serialize() takes.
    return serialize(payload, state);
}
1198
/*
 * Deserialize entry point: forwards to deserialize() with `payload` used as
 * the Scanner, passing along the `state` buffer and its `length`.
 */
void tree_sitter_bash_external_scanner_deserialize(void *payload, const char *state, unsigned length) {
    // void * converts implicitly to the Scanner * that deserialize() takes.
    deserialize(payload, state, length);
}
1203
1204void tree_sitter_bash_external_scanner_destroy(void *payload) {
1205    Scanner *scanner = (Scanner *)payload;
1206    for (size_t i = 0; i < scanner->heredocs.size; i++) {
1207        Heredoc *heredoc = array_get(&scanner->heredocs, i);
1208        array_delete(&heredoc->current_leading_word);
1209        array_delete(&heredoc->delimiter);
1210    }
1211    array_delete(&scanner->heredocs);
1212    free(scanner);
1213}