1#include "parser.h"
   2#include <assert.h>
   3#include <ctype.h>
   4#include <string.h>
   5#include <wchar.h>
   6#include <wctype.h>
   7
// Token kinds produced by this external scanner. For an explanation of the
// individual tokens see grammar.js.
//
// The order of the enumerators is significant within this file:
// `paragraph_interrupt_symbols` is a positional table with one entry per
// enumerator, and `parse_atx_heading` derives its result as
// `ATX_H1_MARKER + (level - 1)`, so the six ATX markers must stay contiguous
// and in ascending order.
typedef enum {
    LINE_ENDING,
    SOFT_LINE_ENDING,
    BLOCK_CLOSE,
    BLOCK_CONTINUATION,
    BLOCK_QUOTE_START,
    INDENTED_CHUNK_START,
    ATX_H1_MARKER,
    ATX_H2_MARKER,
    ATX_H3_MARKER,
    ATX_H4_MARKER,
    ATX_H5_MARKER,
    ATX_H6_MARKER,
    SETEXT_H1_UNDERLINE,
    SETEXT_H2_UNDERLINE,
    THEMATIC_BREAK,
    LIST_MARKER_MINUS,
    LIST_MARKER_PLUS,
    LIST_MARKER_STAR,
    LIST_MARKER_PARENTHESIS,
    LIST_MARKER_DOT,
    LIST_MARKER_MINUS_DONT_INTERRUPT,
    LIST_MARKER_PLUS_DONT_INTERRUPT,
    LIST_MARKER_STAR_DONT_INTERRUPT,
    LIST_MARKER_PARENTHESIS_DONT_INTERRUPT,
    LIST_MARKER_DOT_DONT_INTERRUPT,
    FENCED_CODE_BLOCK_START_BACKTICK,
    FENCED_CODE_BLOCK_START_TILDE,
    BLANK_LINE_START,
    FENCED_CODE_BLOCK_END_BACKTICK,
    FENCED_CODE_BLOCK_END_TILDE,
    HTML_BLOCK_1_START,
    HTML_BLOCK_1_END,
    HTML_BLOCK_2_START,
    HTML_BLOCK_3_START,
    HTML_BLOCK_4_START,
    HTML_BLOCK_5_START,
    HTML_BLOCK_6_START,
    HTML_BLOCK_7_START,
    CLOSE_BLOCK,
    NO_INDENTED_CHUNK,
    ERROR,
    TRIGGER_ERROR,
    TOKEN_EOF,
    MINUS_METADATA,
    PLUS_METADATA,
    PIPE_TABLE_START,
    PIPE_TABLE_LINE_ENDING,
} TokenType;
  58
// Description of a block on the block stack.
//
// LIST_ITEM is a list item with minimal indentation (content begins at indent
// level 2) while LIST_ITEM_MAX_INDENTATION represents a list item with maximal
// indentation without being considered an indented code block.
//
// The list item enumerators must stay contiguous and in ascending order:
// `list_item_indentation` computes the required indentation as
// `block - LIST_ITEM + 2`, and the list marker parsers build the block value
// as `LIST_ITEM + <extra indentation>`.
//
// ANONYMOUS represents any block whose close is not handled by the external
// scanner.
//
// Values of this type are copied byte-wise by `serialize` and `deserialize`.
typedef enum {
    BLOCK_QUOTE,
    INDENTED_CODE_BLOCK,
    LIST_ITEM,
    LIST_ITEM_1_INDENTATION,
    LIST_ITEM_2_INDENTATION,
    LIST_ITEM_3_INDENTATION,
    LIST_ITEM_4_INDENTATION,
    LIST_ITEM_5_INDENTATION,
    LIST_ITEM_6_INDENTATION,
    LIST_ITEM_7_INDENTATION,
    LIST_ITEM_8_INDENTATION,
    LIST_ITEM_9_INDENTATION,
    LIST_ITEM_10_INDENTATION,
    LIST_ITEM_11_INDENTATION,
    LIST_ITEM_12_INDENTATION,
    LIST_ITEM_13_INDENTATION,
    LIST_ITEM_14_INDENTATION,
    LIST_ITEM_MAX_INDENTATION,
    FENCED_CODE_BLOCK,
    ANONYMOUS,
} Block;
  89
  90// Determines if a character is punctuation as defined by the markdown spec.
  91static bool is_punctuation(char chr) {
  92    return (chr >= '!' && chr <= '/') || (chr >= ':' && chr <= '@') ||
  93           (chr >= '[' && chr <= '`') || (chr >= '{' && chr <= '~');
  94}
  95
  96// Returns the indentation level which lines of a list item should have at
  97// minimum. Should only be called with blocks for which `is_list_item` returns
  98// true.
  99static uint8_t list_item_indentation(Block block) {
 100    return (uint8_t)(block - LIST_ITEM + 2);
 101}
 102
// Number of entries in HTML_TAG_NAMES_RULE_1; must match the initializer.
#define NUM_HTML_TAG_NAMES_RULE_1 3

// Lower-case tag names for HTML block rule 1 (going by the identifier; the
// code that consults this table is not part of this section).
static const char *const HTML_TAG_NAMES_RULE_1[NUM_HTML_TAG_NAMES_RULE_1] = {
    "pre", "script", "style"};
 107
// Number of entries in HTML_TAG_NAMES_RULE_7; must match the initializer.
#define NUM_HTML_TAG_NAMES_RULE_7 62

// Lower-case tag names for HTML block rule 7 (going by the identifier; the
// code that consults this table is not part of this section). The names are
// listed in alphabetical order.
static const char *const HTML_TAG_NAMES_RULE_7[NUM_HTML_TAG_NAMES_RULE_7] = {
    "address",  "article",    "aside",  "base",     "basefont", "blockquote",
    "body",     "caption",    "center", "col",      "colgroup", "dd",
    "details",  "dialog",     "dir",    "div",      "dl",       "dt",
    "fieldset", "figcaption", "figure", "footer",   "form",     "frame",
    "frameset", "h1",         "h2",     "h3",       "h4",       "h5",
    "h6",       "head",       "header", "hr",       "html",     "iframe",
    "legend",   "li",         "link",   "main",     "menu",     "menuitem",
    "nav",      "noframes",   "ol",     "optgroup", "option",   "p",
    "param",    "section",    "source", "summary",  "table",    "tbody",
    "td",       "tfoot",      "th",     "thead",    "title",    "tr",
    "track",    "ul"};
 122
// One flag per `TokenType` enumerator, in enumerator order (the table is
// positional, so it must be kept in sync with the enum by hand). For an
// explanation of the tokens see grammar.js.
// NOTE(review): judging by the name, a `true` entry marks a token that is
// allowed to interrupt a paragraph; the code consulting the table is outside
// this section - confirm against the caller.
static const bool paragraph_interrupt_symbols[] = {
    false, // LINE_ENDING,
    false, // SOFT_LINE_ENDING,
    false, // BLOCK_CLOSE,
    false, // BLOCK_CONTINUATION,
    true,  // BLOCK_QUOTE_START,
    false, // INDENTED_CHUNK_START,
    true,  // ATX_H1_MARKER,
    true,  // ATX_H2_MARKER,
    true,  // ATX_H3_MARKER,
    true,  // ATX_H4_MARKER,
    true,  // ATX_H5_MARKER,
    true,  // ATX_H6_MARKER,
    true,  // SETEXT_H1_UNDERLINE,
    true,  // SETEXT_H2_UNDERLINE,
    true,  // THEMATIC_BREAK,
    true,  // LIST_MARKER_MINUS,
    true,  // LIST_MARKER_PLUS,
    true,  // LIST_MARKER_STAR,
    true,  // LIST_MARKER_PARENTHESIS,
    true,  // LIST_MARKER_DOT,
    false, // LIST_MARKER_MINUS_DONT_INTERRUPT,
    false, // LIST_MARKER_PLUS_DONT_INTERRUPT,
    false, // LIST_MARKER_STAR_DONT_INTERRUPT,
    false, // LIST_MARKER_PARENTHESIS_DONT_INTERRUPT,
    false, // LIST_MARKER_DOT_DONT_INTERRUPT,
    true,  // FENCED_CODE_BLOCK_START_BACKTICK,
    true,  // FENCED_CODE_BLOCK_START_TILDE,
    true,  // BLANK_LINE_START,
    false, // FENCED_CODE_BLOCK_END_BACKTICK,
    false, // FENCED_CODE_BLOCK_END_TILDE,
    true,  // HTML_BLOCK_1_START,
    false, // HTML_BLOCK_1_END,
    true,  // HTML_BLOCK_2_START,
    true,  // HTML_BLOCK_3_START,
    true,  // HTML_BLOCK_4_START,
    true,  // HTML_BLOCK_5_START,
    true,  // HTML_BLOCK_6_START,
    false, // HTML_BLOCK_7_START,
    false, // CLOSE_BLOCK,
    false, // NO_INDENTED_CHUNK,
    false, // ERROR,
    false, // TRIGGER_ERROR,
    false, // TOKEN_EOF,
    false, // MINUS_METADATA,
    false, // PLUS_METADATA,
    true,  // PIPE_TABLE_START,
    false, // PIPE_TABLE_LINE_ENDING,
};
 173
// State bitflags used with `Scanner.state`. Each constant is a single bit so
// the flags can be combined in the one-byte `state` field. Bits 2 and 3 are
// not assigned in this section.

// Currently matching (at the beginning of a line)
static const uint8_t STATE_MATCHING = 0x1 << 0;
// Last line break was inside a paragraph
static const uint8_t STATE_WAS_SOFT_LINE_BREAK = 0x1 << 1;
// Block should be closed after next line break
static const uint8_t STATE_CLOSE_BLOCK = 0x1 << 4;
 182
 183static size_t roundup_32(size_t x) {
 184    x--;
 185
 186    x |= x >> 1;
 187    x |= x >> 2;
 188    x |= x >> 4;
 189    x |= x >> 8;
 190    x |= x >> 16;
 191
 192    x++;
 193
 194    return x;
 195}
 196
// Complete state of the external scanner. Everything except `simulate` and
// the `capacity` bookkeeping is written out by `serialize` and restored by
// `deserialize`.
typedef struct {
    // A stack of open blocks in the current parse state
    struct {
        // Number of blocks currently on the stack
        size_t size;
        // Number of `Block` entries that `items` has room for
        size_t capacity;
        // Heap array holding the stack; grown with realloc in `push_block`
        Block *items;
    } open_blocks;

    // Parser state flags
    uint8_t state;
    // Number of blocks that have been matched so far. Only changes during
    // matching and is reset after every line ending
    uint8_t matched;
    // Consumed but "unused" indentation. Sometimes a tab needs to be "split" to
    // be used in multiple tokens.
    uint8_t indentation;
    // The current column. Used to decide how many spaces a tab should equal
    uint8_t column;
    // The delimiter length of the currently open fenced code block
    uint8_t fenced_code_block_delimiter_length;

    // While set, scanning must not commit anything: `mark_end` is not
    // forwarded to the lexer and the parse functions skip `push_block`.
    bool simulate;
} Scanner;
 220
 221static void push_block(Scanner *s, Block b) {
 222    if (s->open_blocks.size == s->open_blocks.capacity) {
 223        s->open_blocks.capacity =
 224            s->open_blocks.capacity ? s->open_blocks.capacity << 1 : 8;
 225        void *tmp = realloc(s->open_blocks.items,
 226                            sizeof(Block) * s->open_blocks.capacity);
 227        assert(tmp != NULL);
 228        s->open_blocks.items = tmp;
 229    }
 230
 231    s->open_blocks.items[s->open_blocks.size++] = b;
 232}
 233
 234static inline Block pop_block(Scanner *s) {
 235    return s->open_blocks.items[--s->open_blocks.size];
 236}
 237
 238// Write the whole state of a Scanner to a byte buffer
 239static unsigned serialize(Scanner *s, char *buffer) {
 240    unsigned size = 0;
 241    buffer[size++] = (char)s->state;
 242    buffer[size++] = (char)s->matched;
 243    buffer[size++] = (char)s->indentation;
 244    buffer[size++] = (char)s->column;
 245    buffer[size++] = (char)s->fenced_code_block_delimiter_length;
 246    size_t blocks_count = s->open_blocks.size;
 247    if (blocks_count > 0) {
 248        memcpy(&buffer[size], s->open_blocks.items,
 249               blocks_count * sizeof(Block));
 250        size += blocks_count * sizeof(Block);
 251    }
 252    return size;
 253}
 254
 255// Read the whole state of a Scanner from a byte buffer
 256// `serizalize` and `deserialize` should be fully symmetric.
 257static void deserialize(Scanner *s, const char *buffer, unsigned length) {
 258    s->open_blocks.size = 0;
 259    s->open_blocks.capacity = 0;
 260    s->state = 0;
 261    s->matched = 0;
 262    s->indentation = 0;
 263    s->column = 0;
 264    s->fenced_code_block_delimiter_length = 0;
 265    if (length > 0) {
 266        size_t size = 0;
 267        s->state = (uint8_t)buffer[size++];
 268        s->matched = (uint8_t)buffer[size++];
 269        s->indentation = (uint8_t)buffer[size++];
 270        s->column = (uint8_t)buffer[size++];
 271        s->fenced_code_block_delimiter_length = (uint8_t)buffer[size++];
 272        size_t blocks_size = length - size;
 273        if (blocks_size > 0) {
 274            size_t blocks_count = blocks_size / sizeof(Block);
 275
 276            // ensure open blocks has enough room
 277            if (s->open_blocks.capacity < blocks_count) {
 278              size_t capacity = roundup_32(blocks_count);
 279              void *tmp = realloc(s->open_blocks.items,
 280                            sizeof(Block) * capacity);
 281              assert(tmp != NULL);
 282              s->open_blocks.items = tmp;
 283              s->open_blocks.capacity = capacity;
 284            }
 285            memcpy(s->open_blocks.items, &buffer[size], blocks_size);
 286            s->open_blocks.size = blocks_count;
 287        }
 288    }
 289}
 290
 291static void mark_end(Scanner *s, TSLexer *lexer) {
 292    if (!s->simulate) {
 293        lexer->mark_end(lexer);
 294    }
 295}
 296
 297// Convenience function to emit the error token. This is done to stop invalid
 298// parse branches. Specifically:
 299// 1. When encountering a newline after a line break that ended a paragraph, and
 300// no new block
 301//    has been opened.
 302// 2. When encountering a new block after a soft line break.
 303// 3. When a `$._trigger_error` token is valid, which is used to stop parse
 304// branches through
 305//    normal tree-sitter grammar rules.
 306//
 307// See also the `$._soft_line_break` and `$._paragraph_end_newline` tokens in
 308// grammar.js
 309static bool error(TSLexer *lexer) {
 310    lexer->result_symbol = ERROR;
 311    return true;
 312}
 313
 314// Advance the lexer one character
 315// Also keeps track of the current column, counting tabs as spaces with tab stop
 316// 4 See https://github.github.com/gfm/#tabs
 317static size_t advance(Scanner *s, TSLexer *lexer) {
 318    size_t size = 1;
 319    if (lexer->lookahead == '\t') {
 320        size = 4 - s->column;
 321        s->column = 0;
 322    } else {
 323        s->column = (s->column + 1) % 4;
 324    }
 325    lexer->advance(lexer, false);
 326    return size;
 327}
 328
 329// Try to match the given block, i.e. consume all tokens that belong to the
 330// block. These are
 331// 1. indentation for list items and indented code blocks
 332// 2. '>' for block quotes
 333// Returns true if the block is matched and false otherwise
 334static bool match(Scanner *s, TSLexer *lexer, Block block) {
 335    switch (block) {
 336        case INDENTED_CODE_BLOCK:
 337            while (s->indentation < 4) {
 338                if (lexer->lookahead == ' ' || lexer->lookahead == '\t') {
 339                    s->indentation += advance(s, lexer);
 340                } else {
 341                    break;
 342                }
 343            }
 344            if (s->indentation >= 4 && lexer->lookahead != '\n' &&
 345                lexer->lookahead != '\r') {
 346                s->indentation -= 4;
 347                return true;
 348            }
 349            break;
 350        case LIST_ITEM:
 351        case LIST_ITEM_1_INDENTATION:
 352        case LIST_ITEM_2_INDENTATION:
 353        case LIST_ITEM_3_INDENTATION:
 354        case LIST_ITEM_4_INDENTATION:
 355        case LIST_ITEM_5_INDENTATION:
 356        case LIST_ITEM_6_INDENTATION:
 357        case LIST_ITEM_7_INDENTATION:
 358        case LIST_ITEM_8_INDENTATION:
 359        case LIST_ITEM_9_INDENTATION:
 360        case LIST_ITEM_10_INDENTATION:
 361        case LIST_ITEM_11_INDENTATION:
 362        case LIST_ITEM_12_INDENTATION:
 363        case LIST_ITEM_13_INDENTATION:
 364        case LIST_ITEM_14_INDENTATION:
 365        case LIST_ITEM_MAX_INDENTATION:
 366            while (s->indentation < list_item_indentation(block)) {
 367                if (lexer->lookahead == ' ' || lexer->lookahead == '\t') {
 368                    s->indentation += advance(s, lexer);
 369                } else {
 370                    break;
 371                }
 372            }
 373            if (s->indentation >= list_item_indentation(block)) {
 374                s->indentation -= list_item_indentation(block);
 375                return true;
 376            }
 377            if (lexer->lookahead == '\n' || lexer->lookahead == '\r') {
 378                s->indentation = 0;
 379                return true;
 380            }
 381            break;
 382        case BLOCK_QUOTE:
 383            while (lexer->lookahead == ' ' || lexer->lookahead == '\t') {
 384                s->indentation += advance(s, lexer);
 385            }
 386            if (lexer->lookahead == '>') {
 387                advance(s, lexer);
 388                s->indentation = 0;
 389                if (lexer->lookahead == ' ' || lexer->lookahead == '\t') {
 390                    s->indentation += advance(s, lexer) - 1;
 391                }
 392                return true;
 393            }
 394            break;
 395        case FENCED_CODE_BLOCK:
 396        case ANONYMOUS:
 397            return true;
 398    }
 399    return false;
 400}
 401
 402static bool parse_fenced_code_block(Scanner *s, const char delimiter,
 403                                    TSLexer *lexer, const bool *valid_symbols) {
 404    // count the number of backticks
 405    uint8_t level = 0;
 406    while (lexer->lookahead == delimiter) {
 407        advance(s, lexer);
 408        level++;
 409    }
 410    mark_end(s, lexer);
 411    // If this is able to close a fenced code block then that is the only valid
 412    // interpretation. It can only close a fenced code block if the number of
 413    // backticks is at least the number of backticks of the opening delimiter.
 414    // Also it cannot be indented more than 3 spaces.
 415    if ((delimiter == '`' ? valid_symbols[FENCED_CODE_BLOCK_END_BACKTICK]
 416                          : valid_symbols[FENCED_CODE_BLOCK_END_TILDE]) &&
 417        s->indentation < 4 && level >= s->fenced_code_block_delimiter_length &&
 418        (lexer->lookahead == '\n' || lexer->lookahead == '\r')) {
 419        s->fenced_code_block_delimiter_length = 0;
 420        lexer->result_symbol = delimiter == '`' ? FENCED_CODE_BLOCK_END_BACKTICK
 421                                                : FENCED_CODE_BLOCK_END_TILDE;
 422        return true;
 423    }
 424    // If this could be the start of a fenced code block, check if the info
 425    // string contains any backticks.
 426    if ((delimiter == '`' ? valid_symbols[FENCED_CODE_BLOCK_START_BACKTICK]
 427                          : valid_symbols[FENCED_CODE_BLOCK_START_TILDE]) &&
 428        level >= 3) {
 429        bool info_string_has_backtick = false;
 430        if (delimiter == '`') {
 431            while (lexer->lookahead != '\n' && lexer->lookahead != '\r' &&
 432                   !lexer->eof(lexer)) {
 433                if (lexer->lookahead == '`') {
 434                    info_string_has_backtick = true;
 435                    break;
 436                }
 437                advance(s, lexer);
 438            }
 439        }
 440        // If it does not then choose to interpret this as the start of a fenced
 441        // code block.
 442        if (!info_string_has_backtick) {
 443            lexer->result_symbol = delimiter == '`'
 444                                       ? FENCED_CODE_BLOCK_START_BACKTICK
 445                                       : FENCED_CODE_BLOCK_START_TILDE;
 446            if (!s->simulate)
 447                push_block(s, FENCED_CODE_BLOCK);
 448            // Remember the length of the delimiter for later, since we need it
 449            // to decide whether a sequence of backticks can close the block.
 450            s->fenced_code_block_delimiter_length = level;
 451            s->indentation = 0;
 452            return true;
 453        }
 454    }
 455    return false;
 456}
 457
// Scans a line that starts with '*' and decides whether it is a thematic
// break or a star list marker.
//
// The first character is consumed unconditionally, so this presumably
// expects the lookahead to be '*' on entry (the caller is not part of this
// section).
//
// On success `lexer->result_symbol` is set to THEMATIC_BREAK,
// LIST_MARKER_STAR or LIST_MARKER_STAR_DONT_INTERRUPT and true is returned.
// For a list marker the matching list item block is pushed unless the
// scanner is simulating. Returns false if neither token applies.
static bool parse_star(Scanner *s, TSLexer *lexer, const bool *valid_symbols) {
    // Consume the first star; a token ends here at the earliest.
    advance(s, lexer);
    mark_end(s, lexer);
    // Count the number of stars, permitting whitespace between them.
    size_t star_count = 1;
    // Also remember how many columns of whitespace directly follow the first
    // star (as reported by `advance`, so a tab may count for several).
    uint8_t extra_indentation = 0;
    for (;;) {
        if (lexer->lookahead == '*') {
            if (star_count == 1 && extra_indentation >= 1 &&
                valid_symbols[LIST_MARKER_STAR]) {
                // If we get to this point then the token has to be at least
                // this long. We need to call `mark_end` here in case we decide
                // later that this is a list item.
                mark_end(s, lexer);
            }
            star_count++;
            advance(s, lexer);
        } else if (lexer->lookahead == ' ' || lexer->lookahead == '\t') {
            if (star_count == 1) {
                extra_indentation += advance(s, lexer);
            } else {
                advance(s, lexer);
            }
        } else {
            break;
        }
    }
    bool line_end = lexer->lookahead == '\n' || lexer->lookahead == '\r';
    bool dont_interrupt = false;
    if (star_count == 1 && line_end) {
        // A lone star at the end of the line counts as a marker followed by
        // one space.
        extra_indentation = 1;
        // line is empty so don't interrupt paragraphs if this is a list marker
        dont_interrupt = s->matched == s->open_blocks.size;
    }
    // If there were at least 3 stars then this could be a thematic break
    bool thematic_break = star_count >= 3 && line_end;
    // If there was a star and at least one space after that star then this
    // could be a list marker.
    bool list_marker_star = star_count >= 1 && extra_indentation >= 1;
    if (valid_symbols[THEMATIC_BREAK] && thematic_break && s->indentation < 4) {
        // If a thematic break is valid then it takes precedence
        lexer->result_symbol = THEMATIC_BREAK;
        mark_end(s, lexer);
        s->indentation = 0;
        return true;
    }
    if ((dont_interrupt ? valid_symbols[LIST_MARKER_STAR_DONT_INTERRUPT]
                        : valid_symbols[LIST_MARKER_STAR]) &&
        list_marker_star) {
        // List markers take precedence over emphasis markers
        // If star_count > 1 then we already called mark_end at the right point.
        // Otherwise the token should go until this point.
        if (star_count == 1) {
            mark_end(s, lexer);
        }
        // Not counting one space...
        extra_indentation--;
        // ... check if the list item begins with an indented code block
        if (extra_indentation <= 3) {
            // If not then calculate the indentation level of the list item
            // content as indentation of list marker + indentation after list
            // marker - 1
            extra_indentation += s->indentation;
            s->indentation = 0;
        } else {
            // Otherwise the indentation level is just the indentation of the
            // list marker. We keep the indentation after the list marker for
            // later blocks.
            uint8_t temp = s->indentation;
            s->indentation = extra_indentation;
            extra_indentation = temp;
        }
        // The block value encodes the content indentation of the list item.
        if (!s->simulate)
            push_block(s, (Block)(LIST_ITEM + extra_indentation));
        lexer->result_symbol =
            dont_interrupt ? LIST_MARKER_STAR_DONT_INTERRUPT : LIST_MARKER_STAR;
        return true;
    }
    return false;
}
 540
 541static bool parse_thematic_break_underscore(Scanner *s, TSLexer *lexer,
 542                                            const bool *valid_symbols) {
 543    advance(s, lexer);
 544    mark_end(s, lexer);
 545    size_t underscore_count = 1;
 546    for (;;) {
 547        if (lexer->lookahead == '_') {
 548            underscore_count++;
 549            advance(s, lexer);
 550        } else if (lexer->lookahead == ' ' || lexer->lookahead == '\t') {
 551            advance(s, lexer);
 552        } else {
 553            break;
 554        }
 555    }
 556    bool line_end = lexer->lookahead == '\n' || lexer->lookahead == '\r';
 557    if (underscore_count >= 3 && line_end && valid_symbols[THEMATIC_BREAK]) {
 558        lexer->result_symbol = THEMATIC_BREAK;
 559        mark_end(s, lexer);
 560        s->indentation = 0;
 561        return true;
 562    }
 563    return false;
 564}
 565
 566static bool parse_block_quote(Scanner *s, TSLexer *lexer,
 567                              const bool *valid_symbols) {
 568    if (valid_symbols[BLOCK_QUOTE_START]) {
 569        advance(s, lexer);
 570        s->indentation = 0;
 571        if (lexer->lookahead == ' ' || lexer->lookahead == '\t') {
 572            s->indentation += advance(s, lexer) - 1;
 573        }
 574        lexer->result_symbol = BLOCK_QUOTE_START;
 575        if (!s->simulate)
 576            push_block(s, BLOCK_QUOTE);
 577        return true;
 578    }
 579    return false;
 580}
 581
 582static bool parse_atx_heading(Scanner *s, TSLexer *lexer,
 583                              const bool *valid_symbols) {
 584    if (valid_symbols[ATX_H1_MARKER] && s->indentation <= 3) {
 585        mark_end(s, lexer);
 586        uint16_t level = 0;
 587        while (lexer->lookahead == '#' && level <= 6) {
 588            advance(s, lexer);
 589            level++;
 590        }
 591        if (level <= 6 &&
 592            (lexer->lookahead == ' ' || lexer->lookahead == '\t' ||
 593             lexer->lookahead == '\n' || lexer->lookahead == '\r')) {
 594            lexer->result_symbol = ATX_H1_MARKER + (level - 1);
 595            s->indentation = 0;
 596            mark_end(s, lexer);
 597            return true;
 598        }
 599    }
 600    return false;
 601}
 602
 603static bool parse_setext_underline(Scanner *s, TSLexer *lexer,
 604                                   const bool *valid_symbols) {
 605    if (valid_symbols[SETEXT_H1_UNDERLINE] &&
 606        s->matched == s->open_blocks.size) {
 607        mark_end(s, lexer);
 608        while (lexer->lookahead == '=') {
 609            advance(s, lexer);
 610        }
 611        while (lexer->lookahead == ' ' || lexer->lookahead == '\t') {
 612            advance(s, lexer);
 613        }
 614        if (lexer->lookahead == '\n' || lexer->lookahead == '\r') {
 615            lexer->result_symbol = SETEXT_H1_UNDERLINE;
 616            mark_end(s, lexer);
 617            return true;
 618        }
 619    }
 620    return false;
 621}
 622
 623static bool parse_plus(Scanner *s, TSLexer *lexer, const bool *valid_symbols) {
 624    if (s->indentation <= 3 &&
 625        (valid_symbols[LIST_MARKER_PLUS] ||
 626         valid_symbols[LIST_MARKER_PLUS_DONT_INTERRUPT] ||
 627         valid_symbols[PLUS_METADATA])) {
 628        advance(s, lexer);
 629        if (valid_symbols[PLUS_METADATA] && lexer->lookahead == '+') {
 630            advance(s, lexer);
 631            if (lexer->lookahead != '+') {
 632                return false;
 633            }
 634            advance(s, lexer);
 635            while (lexer->lookahead == ' ' || lexer->lookahead == '\t') {
 636                advance(s, lexer);
 637            }
 638            if (lexer->lookahead != '\n' && lexer->lookahead != '\r') {
 639                return false;
 640            }
 641            for (;;) {
 642                // advance over newline
 643                if (lexer->lookahead == '\r') {
 644                    advance(s, lexer);
 645                    if (lexer->lookahead == '\n') {
 646                        advance(s, lexer);
 647                    }
 648                } else {
 649                    advance(s, lexer);
 650                }
 651                // check for pluses
 652                size_t plus_count = 0;
 653                while (lexer->lookahead == '+') {
 654                    plus_count++;
 655                    advance(s, lexer);
 656                }
 657                if (plus_count == 3) {
 658                    // if exactly 3 check if next symbol (after eventual
 659                    // whitespace) is newline
 660                    while (lexer->lookahead == ' ' ||
 661                           lexer->lookahead == '\t') {
 662                        advance(s, lexer);
 663                    }
 664                    if (lexer->lookahead == '\r' || lexer->lookahead == '\n') {
 665                        // if so also consume newline
 666                        if (lexer->lookahead == '\r') {
 667                            advance(s, lexer);
 668                            if (lexer->lookahead == '\n') {
 669                                advance(s, lexer);
 670                            }
 671                        } else {
 672                            advance(s, lexer);
 673                        }
 674                        mark_end(s, lexer);
 675                        lexer->result_symbol = PLUS_METADATA;
 676                        return true;
 677                    }
 678                }
 679                // otherwise consume rest of line
 680                while (lexer->lookahead != '\n' && lexer->lookahead != '\r' &&
 681                       !lexer->eof(lexer)) {
 682                    advance(s, lexer);
 683                }
 684                // if end of file is reached, then this is not metadata
 685                if (lexer->eof(lexer)) {
 686                    break;
 687                }
 688            }
 689        } else {
 690            uint8_t extra_indentation = 0;
 691            while (lexer->lookahead == ' ' || lexer->lookahead == '\t') {
 692                extra_indentation += advance(s, lexer);
 693            }
 694            bool dont_interrupt = false;
 695            if (lexer->lookahead == '\r' || lexer->lookahead == '\n') {
 696                extra_indentation = 1;
 697                dont_interrupt = true;
 698            }
 699            dont_interrupt =
 700                dont_interrupt && s->matched == s->open_blocks.size;
 701            if (extra_indentation >= 1 &&
 702                (dont_interrupt ? valid_symbols[LIST_MARKER_PLUS_DONT_INTERRUPT]
 703                                : valid_symbols[LIST_MARKER_PLUS])) {
 704                lexer->result_symbol = dont_interrupt
 705                                           ? LIST_MARKER_PLUS_DONT_INTERRUPT
 706                                           : LIST_MARKER_PLUS;
 707                extra_indentation--;
 708                if (extra_indentation <= 3) {
 709                    extra_indentation += s->indentation;
 710                    s->indentation = 0;
 711                } else {
 712                    uint8_t temp = s->indentation;
 713                    s->indentation = extra_indentation;
 714                    extra_indentation = temp;
 715                }
 716                if (!s->simulate)
 717                    push_block(s, (Block)(LIST_ITEM + extra_indentation));
 718                return true;
 719            }
 720        }
 721    }
 722    return false;
 723}
 724
 725static bool parse_ordered_list_marker(Scanner *s, TSLexer *lexer,
 726                                      const bool *valid_symbols) {
 727    if (s->indentation <= 3 &&
 728        (valid_symbols[LIST_MARKER_PARENTHESIS] ||
 729         valid_symbols[LIST_MARKER_DOT] ||
 730         valid_symbols[LIST_MARKER_PARENTHESIS_DONT_INTERRUPT] ||
 731         valid_symbols[LIST_MARKER_DOT_DONT_INTERRUPT])) {
 732        size_t digits = 1;
 733        bool dont_interrupt = lexer->lookahead != '1';
 734        advance(s, lexer);
 735        while (isdigit(lexer->lookahead)) {
 736            dont_interrupt = true;
 737            digits++;
 738            advance(s, lexer);
 739        }
 740        if (digits >= 1 && digits <= 9) {
 741            bool dot = false;
 742            bool parenthesis = false;
 743            if (lexer->lookahead == '.') {
 744                advance(s, lexer);
 745                dot = true;
 746            } else if (lexer->lookahead == ')') {
 747                advance(s, lexer);
 748                parenthesis = true;
 749            }
 750            if (dot || parenthesis) {
 751                uint8_t extra_indentation = 0;
 752                while (lexer->lookahead == ' ' || lexer->lookahead == '\t') {
 753                    extra_indentation += advance(s, lexer);
 754                }
 755                bool line_end =
 756                    lexer->lookahead == '\n' || lexer->lookahead == '\r';
 757                if (line_end) {
 758                    extra_indentation = 1;
 759                    dont_interrupt = true;
 760                }
 761                dont_interrupt =
 762                    dont_interrupt && s->matched == s->open_blocks.size;
 763                if (extra_indentation >= 1 &&
 764                    (dot ? (dont_interrupt
 765                                ? valid_symbols[LIST_MARKER_DOT_DONT_INTERRUPT]
 766                                : valid_symbols[LIST_MARKER_DOT])
 767                         : (dont_interrupt
 768                                ? valid_symbols
 769                                      [LIST_MARKER_PARENTHESIS_DONT_INTERRUPT]
 770                                : valid_symbols[LIST_MARKER_PARENTHESIS]))) {
 771                    lexer->result_symbol =
 772                        dot ? LIST_MARKER_DOT : LIST_MARKER_PARENTHESIS;
 773                    extra_indentation--;
 774                    if (extra_indentation <= 3) {
 775                        extra_indentation += s->indentation;
 776                        s->indentation = 0;
 777                    } else {
 778                        uint8_t temp = s->indentation;
 779                        s->indentation = extra_indentation;
 780                        extra_indentation = temp;
 781                    }
 782                    if (!s->simulate)
 783                        push_block(
 784                            s, (Block)(LIST_ITEM + extra_indentation + digits));
 785                    return true;
 786                }
 787            }
 788        }
 789    }
 790    return false;
 791}
 792
 793static bool parse_minus(Scanner *s, TSLexer *lexer, const bool *valid_symbols) {
 794    if (s->indentation <= 3 &&
 795        (valid_symbols[LIST_MARKER_MINUS] ||
 796         valid_symbols[LIST_MARKER_MINUS_DONT_INTERRUPT] ||
 797         valid_symbols[SETEXT_H2_UNDERLINE] || valid_symbols[THEMATIC_BREAK] ||
 798         valid_symbols[MINUS_METADATA])) {
 799        mark_end(s, lexer);
 800        bool whitespace_after_minus = false;
 801        bool minus_after_whitespace = false;
 802        size_t minus_count = 0;
 803        uint8_t extra_indentation = 0;
 804
 805        for (;;) {
 806            if (lexer->lookahead == '-') {
 807                if (minus_count == 1 && extra_indentation >= 1) {
 808                    mark_end(s, lexer);
 809                }
 810                minus_count++;
 811                advance(s, lexer);
 812                minus_after_whitespace = whitespace_after_minus;
 813            } else if (lexer->lookahead == ' ' || lexer->lookahead == '\t') {
 814                if (minus_count == 1) {
 815                    extra_indentation += advance(s, lexer);
 816                } else {
 817                    advance(s, lexer);
 818                }
 819                whitespace_after_minus = true;
 820            } else {
 821                break;
 822            }
 823        }
 824        bool line_end = lexer->lookahead == '\n' || lexer->lookahead == '\r';
 825        bool dont_interrupt = false;
 826        if (minus_count == 1 && line_end) {
 827            extra_indentation = 1;
 828            dont_interrupt = true;
 829        }
 830        dont_interrupt = dont_interrupt && s->matched == s->open_blocks.size;
 831        bool thematic_break = minus_count >= 3 && line_end;
 832        bool underline =
 833            minus_count >= 1 && !minus_after_whitespace && line_end &&
 834            s->matched ==
 835                s->open_blocks
 836                    .size; // setext heading can not break lazy continuation
 837        bool list_marker_minus = minus_count >= 1 && extra_indentation >= 1;
 838        bool success = false;
 839        if (valid_symbols[SETEXT_H2_UNDERLINE] && underline) {
 840            lexer->result_symbol = SETEXT_H2_UNDERLINE;
 841            mark_end(s, lexer);
 842            s->indentation = 0;
 843            success = true;
 844        } else if (valid_symbols[THEMATIC_BREAK] &&
 845                   thematic_break) { // underline is false if list_marker_minus
 846                                     // is true
 847            lexer->result_symbol = THEMATIC_BREAK;
 848            mark_end(s, lexer);
 849            s->indentation = 0;
 850            success = true;
 851        } else if ((dont_interrupt
 852                        ? valid_symbols[LIST_MARKER_MINUS_DONT_INTERRUPT]
 853                        : valid_symbols[LIST_MARKER_MINUS]) &&
 854                   list_marker_minus) {
 855            if (minus_count == 1) {
 856                mark_end(s, lexer);
 857            }
 858            extra_indentation--;
 859            if (extra_indentation <= 3) {
 860                extra_indentation += s->indentation;
 861                s->indentation = 0;
 862            } else {
 863                uint8_t temp = s->indentation;
 864                s->indentation = extra_indentation;
 865                extra_indentation = temp;
 866            }
 867            if (!s->simulate)
 868                push_block(s, (Block)(LIST_ITEM + extra_indentation));
 869            lexer->result_symbol = dont_interrupt
 870                                       ? LIST_MARKER_MINUS_DONT_INTERRUPT
 871                                       : LIST_MARKER_MINUS;
 872            return true;
 873        }
 874        if (minus_count == 3 && (!minus_after_whitespace) && line_end &&
 875            valid_symbols[MINUS_METADATA]) {
 876            for (;;) {
 877                // advance over newline
 878                if (lexer->lookahead == '\r') {
 879                    advance(s, lexer);
 880                    if (lexer->lookahead == '\n') {
 881                        advance(s, lexer);
 882                    }
 883                } else {
 884                    advance(s, lexer);
 885                }
 886                // check for minuses
 887                minus_count = 0;
 888                while (lexer->lookahead == '-') {
 889                    minus_count++;
 890                    advance(s, lexer);
 891                }
 892                if (minus_count == 3) {
 893                    // if exactly 3 check if next symbol (after eventual
 894                    // whitespace) is newline
 895                    while (lexer->lookahead == ' ' ||
 896                           lexer->lookahead == '\t') {
 897                        advance(s, lexer);
 898                    }
 899                    if (lexer->lookahead == '\r' || lexer->lookahead == '\n') {
 900                        // if so also consume newline
 901                        if (lexer->lookahead == '\r') {
 902                            advance(s, lexer);
 903                            if (lexer->lookahead == '\n') {
 904                                advance(s, lexer);
 905                            }
 906                        } else {
 907                            advance(s, lexer);
 908                        }
 909                        mark_end(s, lexer);
 910                        lexer->result_symbol = MINUS_METADATA;
 911                        return true;
 912                    }
 913                }
 914                // otherwise consume rest of line
 915                while (lexer->lookahead != '\n' && lexer->lookahead != '\r' &&
 916                       !lexer->eof(lexer)) {
 917                    advance(s, lexer);
 918                }
 919                // if end of file is reached, then this is not metadata
 920                if (lexer->eof(lexer)) {
 921                    break;
 922                }
 923            }
 924        }
 925        if (success) {
 926            return true;
 927        }
 928    }
 929    return false;
 930}
 931
 932static bool parse_html_block(Scanner *s, TSLexer *lexer,
 933                             const bool *valid_symbols) {
 934    if (!(valid_symbols[HTML_BLOCK_1_START] ||
 935          valid_symbols[HTML_BLOCK_1_END] ||
 936          valid_symbols[HTML_BLOCK_2_START] ||
 937          valid_symbols[HTML_BLOCK_3_START] ||
 938          valid_symbols[HTML_BLOCK_4_START] ||
 939          valid_symbols[HTML_BLOCK_5_START] ||
 940          valid_symbols[HTML_BLOCK_6_START] ||
 941          valid_symbols[HTML_BLOCK_7_START])) {
 942        return false;
 943    }
 944    advance(s, lexer);
 945    if (lexer->lookahead == '?' && valid_symbols[HTML_BLOCK_3_START]) {
 946        advance(s, lexer);
 947        lexer->result_symbol = HTML_BLOCK_3_START;
 948        if (!s->simulate)
 949            push_block(s, ANONYMOUS);
 950        return true;
 951    }
 952    if (lexer->lookahead == '!') {
 953        // could be block 2
 954        advance(s, lexer);
 955        if (lexer->lookahead == '-') {
 956            advance(s, lexer);
 957            if (lexer->lookahead == '-' && valid_symbols[HTML_BLOCK_2_START]) {
 958                advance(s, lexer);
 959                lexer->result_symbol = HTML_BLOCK_2_START;
 960                if (!s->simulate)
 961                    push_block(s, ANONYMOUS);
 962                return true;
 963            }
 964        } else if ('A' <= lexer->lookahead && lexer->lookahead <= 'Z' &&
 965                   valid_symbols[HTML_BLOCK_4_START]) {
 966            advance(s, lexer);
 967            lexer->result_symbol = HTML_BLOCK_4_START;
 968            if (!s->simulate)
 969                push_block(s, ANONYMOUS);
 970            return true;
 971        } else if (lexer->lookahead == '[') {
 972            advance(s, lexer);
 973            if (lexer->lookahead == 'C') {
 974                advance(s, lexer);
 975                if (lexer->lookahead == 'D') {
 976                    advance(s, lexer);
 977                    if (lexer->lookahead == 'A') {
 978                        advance(s, lexer);
 979                        if (lexer->lookahead == 'T') {
 980                            advance(s, lexer);
 981                            if (lexer->lookahead == 'A') {
 982                                advance(s, lexer);
 983                                if (lexer->lookahead == '[' &&
 984                                    valid_symbols[HTML_BLOCK_5_START]) {
 985                                    advance(s, lexer);
 986                                    lexer->result_symbol = HTML_BLOCK_5_START;
 987                                    if (!s->simulate)
 988                                        push_block(s, ANONYMOUS);
 989                                    return true;
 990                                }
 991                            }
 992                        }
 993                    }
 994                }
 995            }
 996        }
 997    }
 998    bool starting_slash = lexer->lookahead == '/';
 999    if (starting_slash) {
1000        advance(s, lexer);
1001    }
1002    char name[11];
1003    size_t name_length = 0;
1004    while (iswalpha((wint_t)lexer->lookahead)) {
1005        if (name_length < 10) {
1006            name[name_length++] = (char)towlower((wint_t)lexer->lookahead);
1007        } else {
1008            name_length = 12;
1009        }
1010        advance(s, lexer);
1011    }
1012    if (name_length == 0) {
1013        return false;
1014    }
1015    bool tag_closed = false;
1016    if (name_length < 11) {
1017        name[name_length] = 0;
1018        bool next_symbol_valid =
1019            lexer->lookahead == ' ' || lexer->lookahead == '\t' ||
1020            lexer->lookahead == '\n' || lexer->lookahead == '\r' ||
1021            lexer->lookahead == '>';
1022        if (next_symbol_valid) {
1023            // try block 1 names
1024            for (size_t i = 0; i < NUM_HTML_TAG_NAMES_RULE_1; i++) {
1025                if (strcmp(name, HTML_TAG_NAMES_RULE_1[i]) == 0) {
1026                    if (starting_slash) {
1027                        if (valid_symbols[HTML_BLOCK_1_END]) {
1028                            lexer->result_symbol = HTML_BLOCK_1_END;
1029                            return true;
1030                        }
1031                    } else if (valid_symbols[HTML_BLOCK_1_START]) {
1032                        lexer->result_symbol = HTML_BLOCK_1_START;
1033                        if (!s->simulate)
1034                            push_block(s, ANONYMOUS);
1035                        return true;
1036                    }
1037                }
1038            }
1039        }
1040        if (!next_symbol_valid && lexer->lookahead == '/') {
1041            advance(s, lexer);
1042            if (lexer->lookahead == '>') {
1043                advance(s, lexer);
1044                tag_closed = true;
1045            }
1046        }
1047        if (next_symbol_valid || tag_closed) {
1048            // try block 2 names
1049            for (size_t i = 0; i < NUM_HTML_TAG_NAMES_RULE_7; i++) {
1050                if (strcmp(name, HTML_TAG_NAMES_RULE_7[i]) == 0 &&
1051                    valid_symbols[HTML_BLOCK_6_START]) {
1052                    lexer->result_symbol = HTML_BLOCK_6_START;
1053                    if (!s->simulate)
1054                        push_block(s, ANONYMOUS);
1055                    return true;
1056                }
1057            }
1058        }
1059    }
1060
1061    if (!valid_symbols[HTML_BLOCK_7_START]) {
1062        return false;
1063    }
1064
1065    if (!tag_closed) {
1066        // tag name (continued)
1067        while (iswalnum((wint_t)lexer->lookahead) || lexer->lookahead == '-') {
1068            advance(s, lexer);
1069        }
1070        if (!starting_slash) {
1071            // attributes
1072            bool had_whitespace = false;
1073            for (;;) {
1074                // whitespace
1075                while (lexer->lookahead == ' ' || lexer->lookahead == '\t') {
1076                    had_whitespace = true;
1077                    advance(s, lexer);
1078                }
1079                if (lexer->lookahead == '/') {
1080                    advance(s, lexer);
1081                    break;
1082                }
1083                if (lexer->lookahead == '>') {
1084                    break;
1085                }
1086                // attribute name
1087                if (!had_whitespace) {
1088                    return false;
1089                }
1090                if (!iswalpha((wint_t)lexer->lookahead) &&
1091                    lexer->lookahead != '_' && lexer->lookahead != ':') {
1092                    return false;
1093                }
1094                had_whitespace = false;
1095                advance(s, lexer);
1096                while (iswalnum((wint_t)lexer->lookahead) ||
1097                       lexer->lookahead == '_' || lexer->lookahead == '.' ||
1098                       lexer->lookahead == ':' || lexer->lookahead == '-') {
1099                    advance(s, lexer);
1100                }
1101                // attribute value specification
1102                // optional whitespace
1103                while (lexer->lookahead == ' ' || lexer->lookahead == '\t') {
1104                    had_whitespace = true;
1105                    advance(s, lexer);
1106                }
1107                // =
1108                if (lexer->lookahead == '=') {
1109                    advance(s, lexer);
1110                    had_whitespace = false;
1111                    // optional whitespace
1112                    while (lexer->lookahead == ' ' ||
1113                           lexer->lookahead == '\t') {
1114                        advance(s, lexer);
1115                    }
1116                    // attribute value
1117                    if (lexer->lookahead == '\'' || lexer->lookahead == '"') {
1118                        char delimiter = (char)lexer->lookahead;
1119                        advance(s, lexer);
1120                        while (lexer->lookahead != delimiter &&
1121                               lexer->lookahead != '\n' &&
1122                               lexer->lookahead != '\r' && !lexer->eof(lexer)) {
1123                            advance(s, lexer);
1124                        }
1125                        if (lexer->lookahead != delimiter) {
1126                            return false;
1127                        }
1128                        advance(s, lexer);
1129                    } else {
1130                        // unquoted attribute value
1131                        bool had_one = false;
1132                        while (lexer->lookahead != ' ' &&
1133                               lexer->lookahead != '\t' &&
1134                               lexer->lookahead != '"' &&
1135                               lexer->lookahead != '\'' &&
1136                               lexer->lookahead != '=' &&
1137                               lexer->lookahead != '<' &&
1138                               lexer->lookahead != '>' &&
1139                               lexer->lookahead != '`' &&
1140                               lexer->lookahead != '\n' &&
1141                               lexer->lookahead != '\r' && !lexer->eof(lexer)) {
1142                            advance(s, lexer);
1143                            had_one = true;
1144                        }
1145                        if (!had_one) {
1146                            return false;
1147                        }
1148                    }
1149                }
1150            }
1151        } else {
1152            while (lexer->lookahead == ' ' || lexer->lookahead == '\t') {
1153                advance(s, lexer);
1154            }
1155        }
1156        if (lexer->lookahead != '>') {
1157            return false;
1158        }
1159        advance(s, lexer);
1160    }
1161    while (lexer->lookahead == ' ' || lexer->lookahead == '\t') {
1162        advance(s, lexer);
1163    }
1164    if (lexer->lookahead == '\r' || lexer->lookahead == '\n') {
1165        lexer->result_symbol = HTML_BLOCK_7_START;
1166        if (!s->simulate)
1167            push_block(s, ANONYMOUS);
1168        return true;
1169    }
1170    return false;
1171}
1172
1173static bool parse_pipe_table(Scanner *s, TSLexer *lexer,
1174                             const bool *valid_symbols) {
1175
1176    // unused
1177    (void)(valid_symbols);
1178
1179    // PIPE_TABLE_START is zero width
1180    mark_end(s, lexer);
1181    // count number of cells
1182    size_t cell_count = 0;
1183    // also remember if we see starting and ending pipes, as empty headers have
1184    // to have both
1185    bool starting_pipe = false;
1186    bool ending_pipe = false;
1187    bool empty = true;
1188    if (lexer->lookahead == '|') {
1189        starting_pipe = true;
1190        advance(s, lexer);
1191    }
1192    while (lexer->lookahead != '\r' && lexer->lookahead != '\n' &&
1193           !lexer->eof(lexer)) {
1194        if (lexer->lookahead == '|') {
1195            cell_count++;
1196            ending_pipe = true;
1197            advance(s, lexer);
1198        } else {
1199            if (lexer->lookahead != ' ' && lexer->lookahead != '\t') {
1200                ending_pipe = false;
1201            }
1202            if (lexer->lookahead == '\\') {
1203                advance(s, lexer);
1204                if (is_punctuation((char)lexer->lookahead)) {
1205                    advance(s, lexer);
1206                }
1207            } else {
1208                advance(s, lexer);
1209            }
1210        }
1211    }
1212    if (empty && cell_count == 0 && !(starting_pipe && ending_pipe)) {
1213        return false;
1214    }
1215    if (!ending_pipe) {
1216        cell_count++;
1217    }
1218
1219    // check the following line for a delimiter row
1220    // parse a newline
1221    if (lexer->lookahead == '\n') {
1222        advance(s, lexer);
1223    } else if (lexer->lookahead == '\r') {
1224        advance(s, lexer);
1225        if (lexer->lookahead == '\n') {
1226            advance(s, lexer);
1227        }
1228    } else {
1229        return false;
1230    }
1231    s->indentation = 0;
1232    s->column = 0;
1233    for (;;) {
1234        if (lexer->lookahead == ' ' || lexer->lookahead == '\t') {
1235            s->indentation += advance(s, lexer);
1236        } else {
1237            break;
1238        }
1239    }
1240    s->simulate = true;
1241    uint8_t matched_temp = 0;
1242    while (matched_temp < (uint8_t)s->open_blocks.size) {
1243        if (match(s, lexer, s->open_blocks.items[matched_temp])) {
1244            matched_temp++;
1245        } else {
1246            return false;
1247        }
1248    }
1249
1250    // check if delimiter row has the same number of cells and at least one pipe
1251    size_t delimiter_cell_count = 0;
1252    if (lexer->lookahead == '|') {
1253        advance(s, lexer);
1254    }
1255    for (;;) {
1256        while (lexer->lookahead == ' ' || lexer->lookahead == '\t') {
1257            advance(s, lexer);
1258        }
1259        if (lexer->lookahead == '|') {
1260            delimiter_cell_count++;
1261            advance(s, lexer);
1262            continue;
1263        }
1264        if (lexer->lookahead == ':') {
1265            advance(s, lexer);
1266            if (lexer->lookahead != '-') {
1267                return false;
1268            }
1269        }
1270        bool had_one_minus = false;
1271        while (lexer->lookahead == '-') {
1272            had_one_minus = true;
1273            advance(s, lexer);
1274        }
1275        if (had_one_minus) {
1276            delimiter_cell_count++;
1277        }
1278        if (lexer->lookahead == ':') {
1279            if (!had_one_minus) {
1280                return false;
1281            }
1282            advance(s, lexer);
1283        }
1284        while (lexer->lookahead == ' ' || lexer->lookahead == '\t') {
1285            advance(s, lexer);
1286        }
1287        if (lexer->lookahead == '|') {
1288            if (!had_one_minus) {
1289                delimiter_cell_count++;
1290            }
1291            advance(s, lexer);
1292            continue;
1293        }
1294        if (lexer->lookahead != '\r' && lexer->lookahead != '\n') {
1295            return false;
1296        } else {
1297            break;
1298        }
1299    }
1300    // if the cell counts are not equal then this is not a table
1301    if (cell_count != delimiter_cell_count) {
1302        return false;
1303    }
1304
1305    lexer->result_symbol = PIPE_TABLE_START;
1306    return true;
1307}
1308
1309static bool scan(Scanner *s, TSLexer *lexer, const bool *valid_symbols) {
1310    // A normal tree-sitter rule decided that the current branch is invalid and
1311    // now "requests" an error to stop the branch
1312    if (valid_symbols[TRIGGER_ERROR]) {
1313        return error(lexer);
1314    }
1315
1316    // Close the inner most block after the next line break as requested. See
1317    // `$._close_block` in grammar.js
1318    if (valid_symbols[CLOSE_BLOCK]) {
1319        s->state |= STATE_CLOSE_BLOCK;
1320        lexer->result_symbol = CLOSE_BLOCK;
1321        return true;
1322    }
1323
1324    // if we are at the end of the file and there are still open blocks close
1325    // them all
1326    if (lexer->eof(lexer)) {
1327        if (valid_symbols[TOKEN_EOF]) {
1328            lexer->result_symbol = TOKEN_EOF;
1329            return true;
1330        }
1331        if (s->open_blocks.size > 0) {
1332            lexer->result_symbol = BLOCK_CLOSE;
1333            if (!s->simulate)
1334                pop_block(s);
1335            return true;
1336        }
1337        return false;
1338    }
1339
1340    if (!(s->state & STATE_MATCHING)) {
1341        // Parse any preceeding whitespace and remember its length. This makes a
1342        // lot of parsing quite a bit easier.
1343        for (;;) {
1344            if (lexer->lookahead == ' ' || lexer->lookahead == '\t') {
1345                s->indentation += advance(s, lexer);
1346            } else {
1347                break;
1348            }
1349        }
1350        // We are not matching. This is where the parsing logic for most
1351        // "normal" token is. Most importantly parsing logic for the start of
1352        // new blocks.
1353        if (valid_symbols[INDENTED_CHUNK_START] &&
1354            !valid_symbols[NO_INDENTED_CHUNK]) {
1355            if (s->indentation >= 4 && lexer->lookahead != '\n' &&
1356                lexer->lookahead != '\r') {
1357                lexer->result_symbol = INDENTED_CHUNK_START;
1358                if (!s->simulate)
1359                    push_block(s, INDENTED_CODE_BLOCK);
1360                s->indentation -= 4;
1361                return true;
1362            }
1363        }
1364        // Decide which tokens to consider based on the first non-whitespace
1365        // character
1366        switch (lexer->lookahead) {
1367            case '\r':
1368            case '\n':
1369                if (valid_symbols[BLANK_LINE_START]) {
1370                    // A blank line token is actually just 0 width, so do not
1371                    // consume the characters
1372                    lexer->result_symbol = BLANK_LINE_START;
1373                    return true;
1374                }
1375                break;
1376            case '`':
1377                // A backtick could mark the beginning or ending of a fenced
1378                // code block.
1379                return parse_fenced_code_block(s, '`', lexer, valid_symbols);
1380            case '~':
1381                // A tilde could mark the beginning or ending of a fenced code
1382                // block.
1383                return parse_fenced_code_block(s, '~', lexer, valid_symbols);
1384            case '*':
1385                // A star could either mark  a list item or a thematic break.
1386                // This code is similar to the code for '_' and '+'.
1387                return parse_star(s, lexer, valid_symbols);
1388            case '_':
1389                return parse_thematic_break_underscore(s, lexer, valid_symbols);
1390            case '>':
1391                // A '>' could mark the beginning of a block quote
1392                return parse_block_quote(s, lexer, valid_symbols);
1393            case '#':
1394                // A '#' could mark a atx heading
1395                return parse_atx_heading(s, lexer, valid_symbols);
1396            case '=':
1397                // A '=' could mark a setext underline
1398                return parse_setext_underline(s, lexer, valid_symbols);
1399            case '+':
1400                // A '+' could be a list marker
1401                return parse_plus(s, lexer, valid_symbols);
1402            case '0':
1403            case '1':
1404            case '2':
1405            case '3':
1406            case '4':
1407            case '5':
1408            case '6':
1409            case '7':
1410            case '8':
1411            case '9':
1412                // A number could be a list marker (if followed by a dot or a
1413                // parenthesis)
1414                return parse_ordered_list_marker(s, lexer, valid_symbols);
1415            case '-':
1416                // A minus could mark a list marker, a thematic break or a
1417                // setext underline
1418                return parse_minus(s, lexer, valid_symbols);
1419            case '<':
1420                // A < could mark the beginning of a html block
1421                return parse_html_block(s, lexer, valid_symbols);
1422        }
1423        if (lexer->lookahead != '\r' && lexer->lookahead != '\n' &&
1424            valid_symbols[PIPE_TABLE_START]) {
1425            return parse_pipe_table(s, lexer, valid_symbols);
1426        }
1427    } else { // we are in the state of trying to match all currently open blocks
1428        bool partial_success = false;
1429        while (s->matched < (uint8_t)s->open_blocks.size) {
1430            if (s->matched == (uint8_t)s->open_blocks.size - 1 &&
1431                (s->state & STATE_CLOSE_BLOCK)) {
1432                if (!partial_success)
1433                    s->state &= ~STATE_CLOSE_BLOCK;
1434                break;
1435            }
1436            if (match(s, lexer, s->open_blocks.items[s->matched])) {
1437                partial_success = true;
1438                s->matched++;
1439            } else {
1440                if (s->state & STATE_WAS_SOFT_LINE_BREAK) {
1441                    s->state &= (~STATE_MATCHING);
1442                }
1443                break;
1444            }
1445        }
1446        if (partial_success) {
1447            if (s->matched == s->open_blocks.size) {
1448                s->state &= (~STATE_MATCHING);
1449            }
1450            lexer->result_symbol = BLOCK_CONTINUATION;
1451            return true;
1452        }
1453
1454        if (!(s->state & STATE_WAS_SOFT_LINE_BREAK)) {
1455            lexer->result_symbol = BLOCK_CLOSE;
1456            pop_block(s);
1457            if (s->matched == s->open_blocks.size) {
1458                s->state &= (~STATE_MATCHING);
1459            }
1460            return true;
1461        }
1462    }
1463
1464    // The parser just encountered a line break. Setup the state correspondingly
1465    if ((valid_symbols[LINE_ENDING] || valid_symbols[SOFT_LINE_ENDING] ||
1466         valid_symbols[PIPE_TABLE_LINE_ENDING]) &&
1467        (lexer->lookahead == '\n' || lexer->lookahead == '\r')) {
1468        if (lexer->lookahead == '\r') {
1469            advance(s, lexer);
1470            if (lexer->lookahead == '\n') {
1471                advance(s, lexer);
1472            }
1473        } else {
1474            advance(s, lexer);
1475        }
1476        s->indentation = 0;
1477        s->column = 0;
1478        if (!(s->state & STATE_CLOSE_BLOCK) &&
1479            (valid_symbols[SOFT_LINE_ENDING] ||
1480             valid_symbols[PIPE_TABLE_LINE_ENDING])) {
1481            lexer->mark_end(lexer);
1482            for (;;) {
1483                if (lexer->lookahead == ' ' || lexer->lookahead == '\t') {
1484                    s->indentation += advance(s, lexer);
1485                } else {
1486                    break;
1487                }
1488            }
1489            s->simulate = true;
1490            uint8_t matched_temp = s->matched;
1491            s->matched = 0;
1492            bool one_will_be_matched = false;
1493            while (s->matched < (uint8_t)s->open_blocks.size) {
1494                if (match(s, lexer, s->open_blocks.items[s->matched])) {
1495                    s->matched++;
1496                    one_will_be_matched = true;
1497                } else {
1498                    break;
1499                }
1500            }
1501            bool all_will_be_matched = s->matched == s->open_blocks.size;
1502            if (!lexer->eof(lexer) &&
1503                !scan(s, lexer, paragraph_interrupt_symbols)) {
1504                s->matched = matched_temp;
1505                // If the last line break ended a paragraph and no new block
1506                // opened, the last line break should have been a soft line
1507                // break Reset the counter for matched blocks
1508                s->matched = 0;
1509                s->indentation = 0;
1510                s->column = 0;
1511                // If there is at least one open block, we should be in the
1512                // matching state. Also set the matching flag if a
1513                // `$._soft_line_break_marker` can be emitted so it does get
1514                // emitted.
1515                if (one_will_be_matched) {
1516                    s->state |= STATE_MATCHING;
1517                } else {
1518                    s->state &= (~STATE_MATCHING);
1519                }
1520                if (valid_symbols[PIPE_TABLE_LINE_ENDING]) {
1521                    if (all_will_be_matched) {
1522                        lexer->result_symbol = PIPE_TABLE_LINE_ENDING;
1523                        return true;
1524                    }
1525                } else {
1526                    lexer->result_symbol = SOFT_LINE_ENDING;
1527                    // reset some state variables
1528                    s->state |= STATE_WAS_SOFT_LINE_BREAK;
1529                    return true;
1530                }
1531            } else {
1532                s->matched = matched_temp;
1533            }
1534            s->indentation = 0;
1535            s->column = 0;
1536        }
1537        if (valid_symbols[LINE_ENDING]) {
1538            // If the last line break ended a paragraph and no new block opened,
1539            // the last line break should have been a soft line break Reset the
1540            // counter for matched blocks
1541            s->matched = 0;
1542            // If there is at least one open block, we should be in the matching
1543            // state. Also set the matching flag if a
1544            // `$._soft_line_break_marker` can be emitted so it does get
1545            // emitted.
1546            if (s->open_blocks.size > 0) {
1547                s->state |= STATE_MATCHING;
1548            } else {
1549                s->state &= (~STATE_MATCHING);
1550            }
1551            // reset some state variables
1552            s->state &= (~STATE_WAS_SOFT_LINE_BREAK);
1553            lexer->result_symbol = LINE_ENDING;
1554            return true;
1555        }
1556    }
1557    return false;
1558}
1559
1560void *tree_sitter_markdown_external_scanner_create(void) {
1561    Scanner *s = (Scanner *)malloc(sizeof(Scanner));
1562    s->open_blocks.items = (Block *)calloc(1, sizeof(Block));
1563#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)
1564    _Static_assert(ATX_H6_MARKER == ATX_H1_MARKER + 5, "");
1565#else
1566    assert(ATX_H6_MARKER == ATX_H1_MARKER + 5);
1567#endif
1568    deserialize(s, NULL, 0);
1569
1570    return s;
1571}
1572
1573bool tree_sitter_markdown_external_scanner_scan(void *payload, TSLexer *lexer,
1574                                                const bool *valid_symbols) {
1575    Scanner *scanner = (Scanner *)payload;
1576    scanner->simulate = false;
1577    return scan(scanner, lexer, valid_symbols);
1578}
1579
1580unsigned tree_sitter_markdown_external_scanner_serialize(void *payload,
1581                                                         char *buffer) {
1582    Scanner *scanner = (Scanner *)payload;
1583    return serialize(scanner, buffer);
1584}
1585
1586void tree_sitter_markdown_external_scanner_deserialize(void *payload,
1587                                                       char *buffer,
1588                                                       unsigned length) {
1589    Scanner *scanner = (Scanner *)payload;
1590    deserialize(scanner, buffer, length);
1591}
1592
1593void tree_sitter_markdown_external_scanner_destroy(void *payload) {
1594    Scanner *scanner = (Scanner *)payload;
1595    free(scanner->open_blocks.items);
1596    free(scanner);
1597}