aboutsummaryrefslogtreecommitdiff
path: root/vendor/github.com/mitjafelicijan/go-tree-sitter/markdown/tree-sitter-markdown/scanner.c
diff options
context:
space:
mode:
Diffstat (limited to 'vendor/github.com/mitjafelicijan/go-tree-sitter/markdown/tree-sitter-markdown/scanner.c')
-rw-r--r--vendor/github.com/mitjafelicijan/go-tree-sitter/markdown/tree-sitter-markdown/scanner.c1597
1 files changed, 1597 insertions, 0 deletions
diff --git a/vendor/github.com/mitjafelicijan/go-tree-sitter/markdown/tree-sitter-markdown/scanner.c b/vendor/github.com/mitjafelicijan/go-tree-sitter/markdown/tree-sitter-markdown/scanner.c
new file mode 100644
index 0000000..748fe17
--- /dev/null
+++ b/vendor/github.com/mitjafelicijan/go-tree-sitter/markdown/tree-sitter-markdown/scanner.c
@@ -0,0 +1,1597 @@
#include "parser.h"
#include <assert.h>
#include <ctype.h>
#include <stdlib.h>
#include <string.h>
#include <wchar.h>
#include <wctype.h>
7
// External tokens produced by this scanner. The order must stay in sync with
// the `externals` list in grammar.js (see there for what each token means) and
// with the `paragraph_interrupt_symbols` table below.
typedef enum {
    // Line structure
    LINE_ENDING = 0,
    SOFT_LINE_ENDING,
    BLOCK_CLOSE,
    BLOCK_CONTINUATION,

    // Container / leaf block openers
    BLOCK_QUOTE_START,
    INDENTED_CHUNK_START,

    // ATX headings, one token per level (kept contiguous: H1 + (level - 1))
    ATX_H1_MARKER,
    ATX_H2_MARKER,
    ATX_H3_MARKER,
    ATX_H4_MARKER,
    ATX_H5_MARKER,
    ATX_H6_MARKER,

    // Setext underlines and thematic breaks
    SETEXT_H1_UNDERLINE,
    SETEXT_H2_UNDERLINE,
    THEMATIC_BREAK,

    // List markers
    LIST_MARKER_MINUS,
    LIST_MARKER_PLUS,
    LIST_MARKER_STAR,
    LIST_MARKER_PARENTHESIS,
    LIST_MARKER_DOT,

    // List markers emitted when the marker is not allowed to interrupt a
    // paragraph
    LIST_MARKER_MINUS_DONT_INTERRUPT,
    LIST_MARKER_PLUS_DONT_INTERRUPT,
    LIST_MARKER_STAR_DONT_INTERRUPT,
    LIST_MARKER_PARENTHESIS_DONT_INTERRUPT,
    LIST_MARKER_DOT_DONT_INTERRUPT,

    // Fenced code blocks and blank lines
    FENCED_CODE_BLOCK_START_BACKTICK,
    FENCED_CODE_BLOCK_START_TILDE,
    BLANK_LINE_START,
    FENCED_CODE_BLOCK_END_BACKTICK,
    FENCED_CODE_BLOCK_END_TILDE,

    // HTML blocks
    HTML_BLOCK_1_START,
    HTML_BLOCK_1_END,
    HTML_BLOCK_2_START,
    HTML_BLOCK_3_START,
    HTML_BLOCK_4_START,
    HTML_BLOCK_5_START,
    HTML_BLOCK_6_START,
    HTML_BLOCK_7_START,

    // Control tokens
    CLOSE_BLOCK,
    NO_INDENTED_CHUNK,
    ERROR,
    TRIGGER_ERROR,
    TOKEN_EOF,

    // Metadata fences and pipe tables
    MINUS_METADATA,
    PLUS_METADATA,
    PIPE_TABLE_START,
    PIPE_TABLE_LINE_ENDING,
} TokenType;
58
// Kind of a block on the scanner's stack of open blocks.
//
// LIST_ITEM is a list item with minimal indentation (its content begins at
// indentation level 2). Each following LIST_ITEM_<n>_INDENTATION value adds
// one more column, up to LIST_ITEM_MAX_INDENTATION, which is a list item with
// the maximal indentation that is not yet considered an indented code block.
// The list item values must stay contiguous because the required indentation
// is derived from the distance to LIST_ITEM.
//
// ANONYMOUS represents any block whose close is not handled by the external
// scanner.
typedef enum {
    BLOCK_QUOTE = 0,
    INDENTED_CODE_BLOCK,

    // List items, ordered by required content indentation
    LIST_ITEM,
    LIST_ITEM_1_INDENTATION,
    LIST_ITEM_2_INDENTATION,
    LIST_ITEM_3_INDENTATION,
    LIST_ITEM_4_INDENTATION,
    LIST_ITEM_5_INDENTATION,
    LIST_ITEM_6_INDENTATION,
    LIST_ITEM_7_INDENTATION,
    LIST_ITEM_8_INDENTATION,
    LIST_ITEM_9_INDENTATION,
    LIST_ITEM_10_INDENTATION,
    LIST_ITEM_11_INDENTATION,
    LIST_ITEM_12_INDENTATION,
    LIST_ITEM_13_INDENTATION,
    LIST_ITEM_14_INDENTATION,
    LIST_ITEM_MAX_INDENTATION,

    FENCED_CODE_BLOCK,
    ANONYMOUS,
} Block;
89
// Tells whether `chr` is one of the ASCII punctuation characters recognised by
// the markdown spec: the ranges '!'..'/', ':'..'@', '['..'`' and '{'..'~'.
static bool is_punctuation(char chr) {
    static const char punctuation[] = "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~";
    // strchr would also match the terminating NUL, so rule that out first.
    if (chr == '\0') {
        return false;
    }
    return strchr(punctuation, chr) != NULL;
}
95
96// Returns the indentation level which lines of a list item should have at
97// minimum. Should only be called with blocks for which `is_list_item` returns
98// true.
99static uint8_t list_item_indentation(Block block) {
100 return (uint8_t)(block - LIST_ITEM + 2);
101}
102
// Lower-case tag names that open (or, with a leading '/', close) an HTML
// block of kind 1.
#define NUM_HTML_TAG_NAMES_RULE_1 3

static const char *const HTML_TAG_NAMES_RULE_1[NUM_HTML_TAG_NAMES_RULE_1] = {
    "pre",
    "script",
    "style",
};
107
// Lower-case names of block-level HTML tags, in alphabetical order.
// Despite the RULE_7 suffix, parse_html_block consults this table to decide
// whether to emit HTML_BLOCK_6_START.
#define NUM_HTML_TAG_NAMES_RULE_7 62

static const char *const HTML_TAG_NAMES_RULE_7[NUM_HTML_TAG_NAMES_RULE_7] = {
    // a - c
    "address", "article", "aside",
    "base", "basefont", "blockquote", "body",
    "caption", "center", "col", "colgroup",
    // d - f
    "dd", "details", "dialog", "dir", "div", "dl", "dt",
    "fieldset", "figcaption", "figure", "footer", "form", "frame", "frameset",
    // h - i
    "h1", "h2", "h3", "h4", "h5", "h6", "head", "header", "hr", "html",
    "iframe",
    // l - p
    "legend", "li", "link",
    "main", "menu", "menuitem",
    "nav", "noframes",
    "ol", "optgroup", "option",
    "p", "param",
    // s - u
    "section", "source", "summary",
    "table", "tbody", "td", "tfoot", "th", "thead", "title", "tr", "track",
    "ul",
};
122
123// For explanation of the tokens see grammar.js
124static const bool paragraph_interrupt_symbols[] = {
125 false, // LINE_ENDING,
126 false, // SOFT_LINE_ENDING,
127 false, // BLOCK_CLOSE,
128 false, // BLOCK_CONTINUATION,
129 true, // BLOCK_QUOTE_START,
130 false, // INDENTED_CHUNK_START,
131 true, // ATX_H1_MARKER,
132 true, // ATX_H2_MARKER,
133 true, // ATX_H3_MARKER,
134 true, // ATX_H4_MARKER,
135 true, // ATX_H5_MARKER,
136 true, // ATX_H6_MARKER,
137 true, // SETEXT_H1_UNDERLINE,
138 true, // SETEXT_H2_UNDERLINE,
139 true, // THEMATIC_BREAK,
140 true, // LIST_MARKER_MINUS,
141 true, // LIST_MARKER_PLUS,
142 true, // LIST_MARKER_STAR,
143 true, // LIST_MARKER_PARENTHESIS,
144 true, // LIST_MARKER_DOT,
145 false, // LIST_MARKER_MINUS_DONT_INTERRUPT,
146 false, // LIST_MARKER_PLUS_DONT_INTERRUPT,
147 false, // LIST_MARKER_STAR_DONT_INTERRUPT,
148 false, // LIST_MARKER_PARENTHESIS_DONT_INTERRUPT,
149 false, // LIST_MARKER_DOT_DONT_INTERRUPT,
150 true, // FENCED_CODE_BLOCK_START_BACKTICK,
151 true, // FENCED_CODE_BLOCK_START_TILDE,
152 true, // BLANK_LINE_START,
153 false, // FENCED_CODE_BLOCK_END_BACKTICK,
154 false, // FENCED_CODE_BLOCK_END_TILDE,
155 true, // HTML_BLOCK_1_START,
156 false, // HTML_BLOCK_1_END,
157 true, // HTML_BLOCK_2_START,
158 true, // HTML_BLOCK_3_START,
159 true, // HTML_BLOCK_4_START,
160 true, // HTML_BLOCK_5_START,
161 true, // HTML_BLOCK_6_START,
162 false, // HTML_BLOCK_7_START,
163 false, // CLOSE_BLOCK,
164 false, // NO_INDENTED_CHUNK,
165 false, // ERROR,
166 false, // TRIGGER_ERROR,
167 false, // EOF,
168 false, // MINUS_METADATA,
169 false, // PLUS_METADATA,
170 true, // PIPE_TABLE_START,
171 false, // PIPE_TABLE_LINE_ENDING,
172};
173
// Bit flags stored in `Scanner.state`. Bits 2 and 3 are not assigned here.

// Bit 0: currently matching open blocks (at the beginning of a line)
static const uint8_t STATE_MATCHING = 0x01;
// Bit 1: the last line break was inside a paragraph
static const uint8_t STATE_WAS_SOFT_LINE_BREAK = 0x02;
// Bit 4: the block should be closed after the next line break
static const uint8_t STATE_CLOSE_BLOCK = 0x10;
182
// Rounds `x` up to the next power of two by smearing the highest set bit of
// `x - 1` into all lower positions and adding one. Only the low 32 bits are
// smeared. An input of 0 yields 0 (the decrement wraps around and the final
// increment wraps back).
static size_t roundup_32(size_t x) {
    size_t bits = x - 1;
    for (unsigned shift = 1; shift <= 16; shift <<= 1) {
        bits |= bits >> shift;
    }
    return bits + 1;
}
196
197typedef struct {
198 // A stack of open blocks in the current parse state
199 struct {
200 size_t size;
201 size_t capacity;
202 Block *items;
203 } open_blocks;
204
205 // Parser state flags
206 uint8_t state;
207 // Number of blocks that have been matched so far. Only changes during
208 // matching and is reset after every line ending
209 uint8_t matched;
210 // Consumed but "unused" indentation. Sometimes a tab needs to be "split" to
211 // be used in multiple tokens.
212 uint8_t indentation;
213 // The current column. Used to decide how many spaces a tab should equal
214 uint8_t column;
215 // The delimiter length of the currently open fenced code block
216 uint8_t fenced_code_block_delimiter_length;
217
218 bool simulate;
219} Scanner;
220
221static void push_block(Scanner *s, Block b) {
222 if (s->open_blocks.size == s->open_blocks.capacity) {
223 s->open_blocks.capacity =
224 s->open_blocks.capacity ? s->open_blocks.capacity << 1 : 8;
225 void *tmp = realloc(s->open_blocks.items,
226 sizeof(Block) * s->open_blocks.capacity);
227 assert(tmp != NULL);
228 s->open_blocks.items = tmp;
229 }
230
231 s->open_blocks.items[s->open_blocks.size++] = b;
232}
233
234static inline Block pop_block(Scanner *s) {
235 return s->open_blocks.items[--s->open_blocks.size];
236}
237
238// Write the whole state of a Scanner to a byte buffer
239static unsigned serialize(Scanner *s, char *buffer) {
240 unsigned size = 0;
241 buffer[size++] = (char)s->state;
242 buffer[size++] = (char)s->matched;
243 buffer[size++] = (char)s->indentation;
244 buffer[size++] = (char)s->column;
245 buffer[size++] = (char)s->fenced_code_block_delimiter_length;
246 size_t blocks_count = s->open_blocks.size;
247 if (blocks_count > 0) {
248 memcpy(&buffer[size], s->open_blocks.items,
249 blocks_count * sizeof(Block));
250 size += blocks_count * sizeof(Block);
251 }
252 return size;
253}
254
255// Read the whole state of a Scanner from a byte buffer
256// `serizalize` and `deserialize` should be fully symmetric.
257static void deserialize(Scanner *s, const char *buffer, unsigned length) {
258 s->open_blocks.size = 0;
259 s->open_blocks.capacity = 0;
260 s->state = 0;
261 s->matched = 0;
262 s->indentation = 0;
263 s->column = 0;
264 s->fenced_code_block_delimiter_length = 0;
265 if (length > 0) {
266 size_t size = 0;
267 s->state = (uint8_t)buffer[size++];
268 s->matched = (uint8_t)buffer[size++];
269 s->indentation = (uint8_t)buffer[size++];
270 s->column = (uint8_t)buffer[size++];
271 s->fenced_code_block_delimiter_length = (uint8_t)buffer[size++];
272 size_t blocks_size = length - size;
273 if (blocks_size > 0) {
274 size_t blocks_count = blocks_size / sizeof(Block);
275
276 // ensure open blocks has enough room
277 if (s->open_blocks.capacity < blocks_count) {
278 size_t capacity = roundup_32(blocks_count);
279 void *tmp = realloc(s->open_blocks.items,
280 sizeof(Block) * capacity);
281 assert(tmp != NULL);
282 s->open_blocks.items = tmp;
283 s->open_blocks.capacity = capacity;
284 }
285 memcpy(s->open_blocks.items, &buffer[size], blocks_size);
286 s->open_blocks.size = blocks_count;
287 }
288 }
289}
290
291static void mark_end(Scanner *s, TSLexer *lexer) {
292 if (!s->simulate) {
293 lexer->mark_end(lexer);
294 }
295}
296
297// Convenience function to emit the error token. This is done to stop invalid
298// parse branches. Specifically:
299// 1. When encountering a newline after a line break that ended a paragraph, and
300// no new block
301// has been opened.
302// 2. When encountering a new block after a soft line break.
303// 3. When a `$._trigger_error` token is valid, which is used to stop parse
304// branches through
305// normal tree-sitter grammar rules.
306//
307// See also the `$._soft_line_break` and `$._paragraph_end_newline` tokens in
308// grammar.js
309static bool error(TSLexer *lexer) {
310 lexer->result_symbol = ERROR;
311 return true;
312}
313
314// Advance the lexer one character
315// Also keeps track of the current column, counting tabs as spaces with tab stop
316// 4 See https://github.github.com/gfm/#tabs
317static size_t advance(Scanner *s, TSLexer *lexer) {
318 size_t size = 1;
319 if (lexer->lookahead == '\t') {
320 size = 4 - s->column;
321 s->column = 0;
322 } else {
323 s->column = (s->column + 1) % 4;
324 }
325 lexer->advance(lexer, false);
326 return size;
327}
328
329// Try to match the given block, i.e. consume all tokens that belong to the
330// block. These are
331// 1. indentation for list items and indented code blocks
332// 2. '>' for block quotes
333// Returns true if the block is matched and false otherwise
334static bool match(Scanner *s, TSLexer *lexer, Block block) {
335 switch (block) {
336 case INDENTED_CODE_BLOCK:
337 while (s->indentation < 4) {
338 if (lexer->lookahead == ' ' || lexer->lookahead == '\t') {
339 s->indentation += advance(s, lexer);
340 } else {
341 break;
342 }
343 }
344 if (s->indentation >= 4 && lexer->lookahead != '\n' &&
345 lexer->lookahead != '\r') {
346 s->indentation -= 4;
347 return true;
348 }
349 break;
350 case LIST_ITEM:
351 case LIST_ITEM_1_INDENTATION:
352 case LIST_ITEM_2_INDENTATION:
353 case LIST_ITEM_3_INDENTATION:
354 case LIST_ITEM_4_INDENTATION:
355 case LIST_ITEM_5_INDENTATION:
356 case LIST_ITEM_6_INDENTATION:
357 case LIST_ITEM_7_INDENTATION:
358 case LIST_ITEM_8_INDENTATION:
359 case LIST_ITEM_9_INDENTATION:
360 case LIST_ITEM_10_INDENTATION:
361 case LIST_ITEM_11_INDENTATION:
362 case LIST_ITEM_12_INDENTATION:
363 case LIST_ITEM_13_INDENTATION:
364 case LIST_ITEM_14_INDENTATION:
365 case LIST_ITEM_MAX_INDENTATION:
366 while (s->indentation < list_item_indentation(block)) {
367 if (lexer->lookahead == ' ' || lexer->lookahead == '\t') {
368 s->indentation += advance(s, lexer);
369 } else {
370 break;
371 }
372 }
373 if (s->indentation >= list_item_indentation(block)) {
374 s->indentation -= list_item_indentation(block);
375 return true;
376 }
377 if (lexer->lookahead == '\n' || lexer->lookahead == '\r') {
378 s->indentation = 0;
379 return true;
380 }
381 break;
382 case BLOCK_QUOTE:
383 while (lexer->lookahead == ' ' || lexer->lookahead == '\t') {
384 s->indentation += advance(s, lexer);
385 }
386 if (lexer->lookahead == '>') {
387 advance(s, lexer);
388 s->indentation = 0;
389 if (lexer->lookahead == ' ' || lexer->lookahead == '\t') {
390 s->indentation += advance(s, lexer) - 1;
391 }
392 return true;
393 }
394 break;
395 case FENCED_CODE_BLOCK:
396 case ANONYMOUS:
397 return true;
398 }
399 return false;
400}
401
402static bool parse_fenced_code_block(Scanner *s, const char delimiter,
403 TSLexer *lexer, const bool *valid_symbols) {
404 // count the number of backticks
405 uint8_t level = 0;
406 while (lexer->lookahead == delimiter) {
407 advance(s, lexer);
408 level++;
409 }
410 mark_end(s, lexer);
411 // If this is able to close a fenced code block then that is the only valid
412 // interpretation. It can only close a fenced code block if the number of
413 // backticks is at least the number of backticks of the opening delimiter.
414 // Also it cannot be indented more than 3 spaces.
415 if ((delimiter == '`' ? valid_symbols[FENCED_CODE_BLOCK_END_BACKTICK]
416 : valid_symbols[FENCED_CODE_BLOCK_END_TILDE]) &&
417 s->indentation < 4 && level >= s->fenced_code_block_delimiter_length &&
418 (lexer->lookahead == '\n' || lexer->lookahead == '\r')) {
419 s->fenced_code_block_delimiter_length = 0;
420 lexer->result_symbol = delimiter == '`' ? FENCED_CODE_BLOCK_END_BACKTICK
421 : FENCED_CODE_BLOCK_END_TILDE;
422 return true;
423 }
424 // If this could be the start of a fenced code block, check if the info
425 // string contains any backticks.
426 if ((delimiter == '`' ? valid_symbols[FENCED_CODE_BLOCK_START_BACKTICK]
427 : valid_symbols[FENCED_CODE_BLOCK_START_TILDE]) &&
428 level >= 3) {
429 bool info_string_has_backtick = false;
430 if (delimiter == '`') {
431 while (lexer->lookahead != '\n' && lexer->lookahead != '\r' &&
432 !lexer->eof(lexer)) {
433 if (lexer->lookahead == '`') {
434 info_string_has_backtick = true;
435 break;
436 }
437 advance(s, lexer);
438 }
439 }
440 // If it does not then choose to interpret this as the start of a fenced
441 // code block.
442 if (!info_string_has_backtick) {
443 lexer->result_symbol = delimiter == '`'
444 ? FENCED_CODE_BLOCK_START_BACKTICK
445 : FENCED_CODE_BLOCK_START_TILDE;
446 if (!s->simulate)
447 push_block(s, FENCED_CODE_BLOCK);
448 // Remember the length of the delimiter for later, since we need it
449 // to decide whether a sequence of backticks can close the block.
450 s->fenced_code_block_delimiter_length = level;
451 s->indentation = 0;
452 return true;
453 }
454 }
455 return false;
456}
457
458static bool parse_star(Scanner *s, TSLexer *lexer, const bool *valid_symbols) {
459 advance(s, lexer);
460 mark_end(s, lexer);
461 // Otherwise count the number of stars permitting whitespaces between them.
462 size_t star_count = 1;
463 // Also remember how many stars there are before the first whitespace...
464 // ...and how many spaces follow the first star.
465 uint8_t extra_indentation = 0;
466 for (;;) {
467 if (lexer->lookahead == '*') {
468 if (star_count == 1 && extra_indentation >= 1 &&
469 valid_symbols[LIST_MARKER_STAR]) {
470 // If we get to this point then the token has to be at least
471 // this long. We need to call `mark_end` here in case we decide
472 // later that this is a list item.
473 mark_end(s, lexer);
474 }
475 star_count++;
476 advance(s, lexer);
477 } else if (lexer->lookahead == ' ' || lexer->lookahead == '\t') {
478 if (star_count == 1) {
479 extra_indentation += advance(s, lexer);
480 } else {
481 advance(s, lexer);
482 }
483 } else {
484 break;
485 }
486 }
487 bool line_end = lexer->lookahead == '\n' || lexer->lookahead == '\r';
488 bool dont_interrupt = false;
489 if (star_count == 1 && line_end) {
490 extra_indentation = 1;
491 // line is empty so don't interrupt paragraphs if this is a list marker
492 dont_interrupt = s->matched == s->open_blocks.size;
493 }
494 // If there were at least 3 stars then this could be a thematic break
495 bool thematic_break = star_count >= 3 && line_end;
496 // If there was a star and at least one space after that star then this
497 // could be a list marker.
498 bool list_marker_star = star_count >= 1 && extra_indentation >= 1;
499 if (valid_symbols[THEMATIC_BREAK] && thematic_break && s->indentation < 4) {
500 // If a thematic break is valid then it takes precedence
501 lexer->result_symbol = THEMATIC_BREAK;
502 mark_end(s, lexer);
503 s->indentation = 0;
504 return true;
505 }
506 if ((dont_interrupt ? valid_symbols[LIST_MARKER_STAR_DONT_INTERRUPT]
507 : valid_symbols[LIST_MARKER_STAR]) &&
508 list_marker_star) {
509 // List markers take precedence over emphasis markers
510 // If star_count > 1 then we already called mark_end at the right point.
511 // Otherwise the token should go until this point.
512 if (star_count == 1) {
513 mark_end(s, lexer);
514 }
515 // Not counting one space...
516 extra_indentation--;
517 // ... check if the list item begins with an indented code block
518 if (extra_indentation <= 3) {
519 // If not then calculate the indentation level of the list item
520 // content as indentation of list marker + indentation after list
521 // marker - 1
522 extra_indentation += s->indentation;
523 s->indentation = 0;
524 } else {
525 // Otherwise the indentation level is just the indentation of the
526 // list marker. We keep the indentation after the list marker for
527 // later blocks.
528 uint8_t temp = s->indentation;
529 s->indentation = extra_indentation;
530 extra_indentation = temp;
531 }
532 if (!s->simulate)
533 push_block(s, (Block)(LIST_ITEM + extra_indentation));
534 lexer->result_symbol =
535 dont_interrupt ? LIST_MARKER_STAR_DONT_INTERRUPT : LIST_MARKER_STAR;
536 return true;
537 }
538 return false;
539}
540
541static bool parse_thematic_break_underscore(Scanner *s, TSLexer *lexer,
542 const bool *valid_symbols) {
543 advance(s, lexer);
544 mark_end(s, lexer);
545 size_t underscore_count = 1;
546 for (;;) {
547 if (lexer->lookahead == '_') {
548 underscore_count++;
549 advance(s, lexer);
550 } else if (lexer->lookahead == ' ' || lexer->lookahead == '\t') {
551 advance(s, lexer);
552 } else {
553 break;
554 }
555 }
556 bool line_end = lexer->lookahead == '\n' || lexer->lookahead == '\r';
557 if (underscore_count >= 3 && line_end && valid_symbols[THEMATIC_BREAK]) {
558 lexer->result_symbol = THEMATIC_BREAK;
559 mark_end(s, lexer);
560 s->indentation = 0;
561 return true;
562 }
563 return false;
564}
565
566static bool parse_block_quote(Scanner *s, TSLexer *lexer,
567 const bool *valid_symbols) {
568 if (valid_symbols[BLOCK_QUOTE_START]) {
569 advance(s, lexer);
570 s->indentation = 0;
571 if (lexer->lookahead == ' ' || lexer->lookahead == '\t') {
572 s->indentation += advance(s, lexer) - 1;
573 }
574 lexer->result_symbol = BLOCK_QUOTE_START;
575 if (!s->simulate)
576 push_block(s, BLOCK_QUOTE);
577 return true;
578 }
579 return false;
580}
581
582static bool parse_atx_heading(Scanner *s, TSLexer *lexer,
583 const bool *valid_symbols) {
584 if (valid_symbols[ATX_H1_MARKER] && s->indentation <= 3) {
585 mark_end(s, lexer);
586 uint16_t level = 0;
587 while (lexer->lookahead == '#' && level <= 6) {
588 advance(s, lexer);
589 level++;
590 }
591 if (level <= 6 &&
592 (lexer->lookahead == ' ' || lexer->lookahead == '\t' ||
593 lexer->lookahead == '\n' || lexer->lookahead == '\r')) {
594 lexer->result_symbol = ATX_H1_MARKER + (level - 1);
595 s->indentation = 0;
596 mark_end(s, lexer);
597 return true;
598 }
599 }
600 return false;
601}
602
603static bool parse_setext_underline(Scanner *s, TSLexer *lexer,
604 const bool *valid_symbols) {
605 if (valid_symbols[SETEXT_H1_UNDERLINE] &&
606 s->matched == s->open_blocks.size) {
607 mark_end(s, lexer);
608 while (lexer->lookahead == '=') {
609 advance(s, lexer);
610 }
611 while (lexer->lookahead == ' ' || lexer->lookahead == '\t') {
612 advance(s, lexer);
613 }
614 if (lexer->lookahead == '\n' || lexer->lookahead == '\r') {
615 lexer->result_symbol = SETEXT_H1_UNDERLINE;
616 mark_end(s, lexer);
617 return true;
618 }
619 }
620 return false;
621}
622
623static bool parse_plus(Scanner *s, TSLexer *lexer, const bool *valid_symbols) {
624 if (s->indentation <= 3 &&
625 (valid_symbols[LIST_MARKER_PLUS] ||
626 valid_symbols[LIST_MARKER_PLUS_DONT_INTERRUPT] ||
627 valid_symbols[PLUS_METADATA])) {
628 advance(s, lexer);
629 if (valid_symbols[PLUS_METADATA] && lexer->lookahead == '+') {
630 advance(s, lexer);
631 if (lexer->lookahead != '+') {
632 return false;
633 }
634 advance(s, lexer);
635 while (lexer->lookahead == ' ' || lexer->lookahead == '\t') {
636 advance(s, lexer);
637 }
638 if (lexer->lookahead != '\n' && lexer->lookahead != '\r') {
639 return false;
640 }
641 for (;;) {
642 // advance over newline
643 if (lexer->lookahead == '\r') {
644 advance(s, lexer);
645 if (lexer->lookahead == '\n') {
646 advance(s, lexer);
647 }
648 } else {
649 advance(s, lexer);
650 }
651 // check for pluses
652 size_t plus_count = 0;
653 while (lexer->lookahead == '+') {
654 plus_count++;
655 advance(s, lexer);
656 }
657 if (plus_count == 3) {
658 // if exactly 3 check if next symbol (after eventual
659 // whitespace) is newline
660 while (lexer->lookahead == ' ' ||
661 lexer->lookahead == '\t') {
662 advance(s, lexer);
663 }
664 if (lexer->lookahead == '\r' || lexer->lookahead == '\n') {
665 // if so also consume newline
666 if (lexer->lookahead == '\r') {
667 advance(s, lexer);
668 if (lexer->lookahead == '\n') {
669 advance(s, lexer);
670 }
671 } else {
672 advance(s, lexer);
673 }
674 mark_end(s, lexer);
675 lexer->result_symbol = PLUS_METADATA;
676 return true;
677 }
678 }
679 // otherwise consume rest of line
680 while (lexer->lookahead != '\n' && lexer->lookahead != '\r' &&
681 !lexer->eof(lexer)) {
682 advance(s, lexer);
683 }
684 // if end of file is reached, then this is not metadata
685 if (lexer->eof(lexer)) {
686 break;
687 }
688 }
689 } else {
690 uint8_t extra_indentation = 0;
691 while (lexer->lookahead == ' ' || lexer->lookahead == '\t') {
692 extra_indentation += advance(s, lexer);
693 }
694 bool dont_interrupt = false;
695 if (lexer->lookahead == '\r' || lexer->lookahead == '\n') {
696 extra_indentation = 1;
697 dont_interrupt = true;
698 }
699 dont_interrupt =
700 dont_interrupt && s->matched == s->open_blocks.size;
701 if (extra_indentation >= 1 &&
702 (dont_interrupt ? valid_symbols[LIST_MARKER_PLUS_DONT_INTERRUPT]
703 : valid_symbols[LIST_MARKER_PLUS])) {
704 lexer->result_symbol = dont_interrupt
705 ? LIST_MARKER_PLUS_DONT_INTERRUPT
706 : LIST_MARKER_PLUS;
707 extra_indentation--;
708 if (extra_indentation <= 3) {
709 extra_indentation += s->indentation;
710 s->indentation = 0;
711 } else {
712 uint8_t temp = s->indentation;
713 s->indentation = extra_indentation;
714 extra_indentation = temp;
715 }
716 if (!s->simulate)
717 push_block(s, (Block)(LIST_ITEM + extra_indentation));
718 return true;
719 }
720 }
721 }
722 return false;
723}
724
725static bool parse_ordered_list_marker(Scanner *s, TSLexer *lexer,
726 const bool *valid_symbols) {
727 if (s->indentation <= 3 &&
728 (valid_symbols[LIST_MARKER_PARENTHESIS] ||
729 valid_symbols[LIST_MARKER_DOT] ||
730 valid_symbols[LIST_MARKER_PARENTHESIS_DONT_INTERRUPT] ||
731 valid_symbols[LIST_MARKER_DOT_DONT_INTERRUPT])) {
732 size_t digits = 1;
733 bool dont_interrupt = lexer->lookahead != '1';
734 advance(s, lexer);
735 while (isdigit(lexer->lookahead)) {
736 dont_interrupt = true;
737 digits++;
738 advance(s, lexer);
739 }
740 if (digits >= 1 && digits <= 9) {
741 bool dot = false;
742 bool parenthesis = false;
743 if (lexer->lookahead == '.') {
744 advance(s, lexer);
745 dot = true;
746 } else if (lexer->lookahead == ')') {
747 advance(s, lexer);
748 parenthesis = true;
749 }
750 if (dot || parenthesis) {
751 uint8_t extra_indentation = 0;
752 while (lexer->lookahead == ' ' || lexer->lookahead == '\t') {
753 extra_indentation += advance(s, lexer);
754 }
755 bool line_end =
756 lexer->lookahead == '\n' || lexer->lookahead == '\r';
757 if (line_end) {
758 extra_indentation = 1;
759 dont_interrupt = true;
760 }
761 dont_interrupt =
762 dont_interrupt && s->matched == s->open_blocks.size;
763 if (extra_indentation >= 1 &&
764 (dot ? (dont_interrupt
765 ? valid_symbols[LIST_MARKER_DOT_DONT_INTERRUPT]
766 : valid_symbols[LIST_MARKER_DOT])
767 : (dont_interrupt
768 ? valid_symbols
769 [LIST_MARKER_PARENTHESIS_DONT_INTERRUPT]
770 : valid_symbols[LIST_MARKER_PARENTHESIS]))) {
771 lexer->result_symbol =
772 dot ? LIST_MARKER_DOT : LIST_MARKER_PARENTHESIS;
773 extra_indentation--;
774 if (extra_indentation <= 3) {
775 extra_indentation += s->indentation;
776 s->indentation = 0;
777 } else {
778 uint8_t temp = s->indentation;
779 s->indentation = extra_indentation;
780 extra_indentation = temp;
781 }
782 if (!s->simulate)
783 push_block(
784 s, (Block)(LIST_ITEM + extra_indentation + digits));
785 return true;
786 }
787 }
788 }
789 }
790 return false;
791}
792
793static bool parse_minus(Scanner *s, TSLexer *lexer, const bool *valid_symbols) {
794 if (s->indentation <= 3 &&
795 (valid_symbols[LIST_MARKER_MINUS] ||
796 valid_symbols[LIST_MARKER_MINUS_DONT_INTERRUPT] ||
797 valid_symbols[SETEXT_H2_UNDERLINE] || valid_symbols[THEMATIC_BREAK] ||
798 valid_symbols[MINUS_METADATA])) {
799 mark_end(s, lexer);
800 bool whitespace_after_minus = false;
801 bool minus_after_whitespace = false;
802 size_t minus_count = 0;
803 uint8_t extra_indentation = 0;
804
805 for (;;) {
806 if (lexer->lookahead == '-') {
807 if (minus_count == 1 && extra_indentation >= 1) {
808 mark_end(s, lexer);
809 }
810 minus_count++;
811 advance(s, lexer);
812 minus_after_whitespace = whitespace_after_minus;
813 } else if (lexer->lookahead == ' ' || lexer->lookahead == '\t') {
814 if (minus_count == 1) {
815 extra_indentation += advance(s, lexer);
816 } else {
817 advance(s, lexer);
818 }
819 whitespace_after_minus = true;
820 } else {
821 break;
822 }
823 }
824 bool line_end = lexer->lookahead == '\n' || lexer->lookahead == '\r';
825 bool dont_interrupt = false;
826 if (minus_count == 1 && line_end) {
827 extra_indentation = 1;
828 dont_interrupt = true;
829 }
830 dont_interrupt = dont_interrupt && s->matched == s->open_blocks.size;
831 bool thematic_break = minus_count >= 3 && line_end;
832 bool underline =
833 minus_count >= 1 && !minus_after_whitespace && line_end &&
834 s->matched ==
835 s->open_blocks
836 .size; // setext heading can not break lazy continuation
837 bool list_marker_minus = minus_count >= 1 && extra_indentation >= 1;
838 bool success = false;
839 if (valid_symbols[SETEXT_H2_UNDERLINE] && underline) {
840 lexer->result_symbol = SETEXT_H2_UNDERLINE;
841 mark_end(s, lexer);
842 s->indentation = 0;
843 success = true;
844 } else if (valid_symbols[THEMATIC_BREAK] &&
845 thematic_break) { // underline is false if list_marker_minus
846 // is true
847 lexer->result_symbol = THEMATIC_BREAK;
848 mark_end(s, lexer);
849 s->indentation = 0;
850 success = true;
851 } else if ((dont_interrupt
852 ? valid_symbols[LIST_MARKER_MINUS_DONT_INTERRUPT]
853 : valid_symbols[LIST_MARKER_MINUS]) &&
854 list_marker_minus) {
855 if (minus_count == 1) {
856 mark_end(s, lexer);
857 }
858 extra_indentation--;
859 if (extra_indentation <= 3) {
860 extra_indentation += s->indentation;
861 s->indentation = 0;
862 } else {
863 uint8_t temp = s->indentation;
864 s->indentation = extra_indentation;
865 extra_indentation = temp;
866 }
867 if (!s->simulate)
868 push_block(s, (Block)(LIST_ITEM + extra_indentation));
869 lexer->result_symbol = dont_interrupt
870 ? LIST_MARKER_MINUS_DONT_INTERRUPT
871 : LIST_MARKER_MINUS;
872 return true;
873 }
874 if (minus_count == 3 && (!minus_after_whitespace) && line_end &&
875 valid_symbols[MINUS_METADATA]) {
876 for (;;) {
877 // advance over newline
878 if (lexer->lookahead == '\r') {
879 advance(s, lexer);
880 if (lexer->lookahead == '\n') {
881 advance(s, lexer);
882 }
883 } else {
884 advance(s, lexer);
885 }
886 // check for minuses
887 minus_count = 0;
888 while (lexer->lookahead == '-') {
889 minus_count++;
890 advance(s, lexer);
891 }
892 if (minus_count == 3) {
893 // if exactly 3 check if next symbol (after eventual
894 // whitespace) is newline
895 while (lexer->lookahead == ' ' ||
896 lexer->lookahead == '\t') {
897 advance(s, lexer);
898 }
899 if (lexer->lookahead == '\r' || lexer->lookahead == '\n') {
900 // if so also consume newline
901 if (lexer->lookahead == '\r') {
902 advance(s, lexer);
903 if (lexer->lookahead == '\n') {
904 advance(s, lexer);
905 }
906 } else {
907 advance(s, lexer);
908 }
909 mark_end(s, lexer);
910 lexer->result_symbol = MINUS_METADATA;
911 return true;
912 }
913 }
914 // otherwise consume rest of line
915 while (lexer->lookahead != '\n' && lexer->lookahead != '\r' &&
916 !lexer->eof(lexer)) {
917 advance(s, lexer);
918 }
919 // if end of file is reached, then this is not metadata
920 if (lexer->eof(lexer)) {
921 break;
922 }
923 }
924 }
925 if (success) {
926 return true;
927 }
928 }
929 return false;
930}
931
932static bool parse_html_block(Scanner *s, TSLexer *lexer,
933 const bool *valid_symbols) {
934 if (!(valid_symbols[HTML_BLOCK_1_START] ||
935 valid_symbols[HTML_BLOCK_1_END] ||
936 valid_symbols[HTML_BLOCK_2_START] ||
937 valid_symbols[HTML_BLOCK_3_START] ||
938 valid_symbols[HTML_BLOCK_4_START] ||
939 valid_symbols[HTML_BLOCK_5_START] ||
940 valid_symbols[HTML_BLOCK_6_START] ||
941 valid_symbols[HTML_BLOCK_7_START])) {
942 return false;
943 }
944 advance(s, lexer);
945 if (lexer->lookahead == '?' && valid_symbols[HTML_BLOCK_3_START]) {
946 advance(s, lexer);
947 lexer->result_symbol = HTML_BLOCK_3_START;
948 if (!s->simulate)
949 push_block(s, ANONYMOUS);
950 return true;
951 }
952 if (lexer->lookahead == '!') {
953 // could be block 2
954 advance(s, lexer);
955 if (lexer->lookahead == '-') {
956 advance(s, lexer);
957 if (lexer->lookahead == '-' && valid_symbols[HTML_BLOCK_2_START]) {
958 advance(s, lexer);
959 lexer->result_symbol = HTML_BLOCK_2_START;
960 if (!s->simulate)
961 push_block(s, ANONYMOUS);
962 return true;
963 }
964 } else if ('A' <= lexer->lookahead && lexer->lookahead <= 'Z' &&
965 valid_symbols[HTML_BLOCK_4_START]) {
966 advance(s, lexer);
967 lexer->result_symbol = HTML_BLOCK_4_START;
968 if (!s->simulate)
969 push_block(s, ANONYMOUS);
970 return true;
971 } else if (lexer->lookahead == '[') {
972 advance(s, lexer);
973 if (lexer->lookahead == 'C') {
974 advance(s, lexer);
975 if (lexer->lookahead == 'D') {
976 advance(s, lexer);
977 if (lexer->lookahead == 'A') {
978 advance(s, lexer);
979 if (lexer->lookahead == 'T') {
980 advance(s, lexer);
981 if (lexer->lookahead == 'A') {
982 advance(s, lexer);
983 if (lexer->lookahead == '[' &&
984 valid_symbols[HTML_BLOCK_5_START]) {
985 advance(s, lexer);
986 lexer->result_symbol = HTML_BLOCK_5_START;
987 if (!s->simulate)
988 push_block(s, ANONYMOUS);
989 return true;
990 }
991 }
992 }
993 }
994 }
995 }
996 }
997 }
998 bool starting_slash = lexer->lookahead == '/';
999 if (starting_slash) {
1000 advance(s, lexer);
1001 }
1002 char name[11];
1003 size_t name_length = 0;
1004 while (iswalpha((wint_t)lexer->lookahead)) {
1005 if (name_length < 10) {
1006 name[name_length++] = (char)towlower((wint_t)lexer->lookahead);
1007 } else {
1008 name_length = 12;
1009 }
1010 advance(s, lexer);
1011 }
1012 if (name_length == 0) {
1013 return false;
1014 }
1015 bool tag_closed = false;
1016 if (name_length < 11) {
1017 name[name_length] = 0;
1018 bool next_symbol_valid =
1019 lexer->lookahead == ' ' || lexer->lookahead == '\t' ||
1020 lexer->lookahead == '\n' || lexer->lookahead == '\r' ||
1021 lexer->lookahead == '>';
1022 if (next_symbol_valid) {
1023 // try block 1 names
1024 for (size_t i = 0; i < NUM_HTML_TAG_NAMES_RULE_1; i++) {
1025 if (strcmp(name, HTML_TAG_NAMES_RULE_1[i]) == 0) {
1026 if (starting_slash) {
1027 if (valid_symbols[HTML_BLOCK_1_END]) {
1028 lexer->result_symbol = HTML_BLOCK_1_END;
1029 return true;
1030 }
1031 } else if (valid_symbols[HTML_BLOCK_1_START]) {
1032 lexer->result_symbol = HTML_BLOCK_1_START;
1033 if (!s->simulate)
1034 push_block(s, ANONYMOUS);
1035 return true;
1036 }
1037 }
1038 }
1039 }
1040 if (!next_symbol_valid && lexer->lookahead == '/') {
1041 advance(s, lexer);
1042 if (lexer->lookahead == '>') {
1043 advance(s, lexer);
1044 tag_closed = true;
1045 }
1046 }
1047 if (next_symbol_valid || tag_closed) {
1048 // try block 2 names
1049 for (size_t i = 0; i < NUM_HTML_TAG_NAMES_RULE_7; i++) {
1050 if (strcmp(name, HTML_TAG_NAMES_RULE_7[i]) == 0 &&
1051 valid_symbols[HTML_BLOCK_6_START]) {
1052 lexer->result_symbol = HTML_BLOCK_6_START;
1053 if (!s->simulate)
1054 push_block(s, ANONYMOUS);
1055 return true;
1056 }
1057 }
1058 }
1059 }
1060
1061 if (!valid_symbols[HTML_BLOCK_7_START]) {
1062 return false;
1063 }
1064
1065 if (!tag_closed) {
1066 // tag name (continued)
1067 while (iswalnum((wint_t)lexer->lookahead) || lexer->lookahead == '-') {
1068 advance(s, lexer);
1069 }
1070 if (!starting_slash) {
1071 // attributes
1072 bool had_whitespace = false;
1073 for (;;) {
1074 // whitespace
1075 while (lexer->lookahead == ' ' || lexer->lookahead == '\t') {
1076 had_whitespace = true;
1077 advance(s, lexer);
1078 }
1079 if (lexer->lookahead == '/') {
1080 advance(s, lexer);
1081 break;
1082 }
1083 if (lexer->lookahead == '>') {
1084 break;
1085 }
1086 // attribute name
1087 if (!had_whitespace) {
1088 return false;
1089 }
1090 if (!iswalpha((wint_t)lexer->lookahead) &&
1091 lexer->lookahead != '_' && lexer->lookahead != ':') {
1092 return false;
1093 }
1094 had_whitespace = false;
1095 advance(s, lexer);
1096 while (iswalnum((wint_t)lexer->lookahead) ||
1097 lexer->lookahead == '_' || lexer->lookahead == '.' ||
1098 lexer->lookahead == ':' || lexer->lookahead == '-') {
1099 advance(s, lexer);
1100 }
1101 // attribute value specification
1102 // optional whitespace
1103 while (lexer->lookahead == ' ' || lexer->lookahead == '\t') {
1104 had_whitespace = true;
1105 advance(s, lexer);
1106 }
1107 // =
1108 if (lexer->lookahead == '=') {
1109 advance(s, lexer);
1110 had_whitespace = false;
1111 // optional whitespace
1112 while (lexer->lookahead == ' ' ||
1113 lexer->lookahead == '\t') {
1114 advance(s, lexer);
1115 }
1116 // attribute value
1117 if (lexer->lookahead == '\'' || lexer->lookahead == '"') {
1118 char delimiter = (char)lexer->lookahead;
1119 advance(s, lexer);
1120 while (lexer->lookahead != delimiter &&
1121 lexer->lookahead != '\n' &&
1122 lexer->lookahead != '\r' && !lexer->eof(lexer)) {
1123 advance(s, lexer);
1124 }
1125 if (lexer->lookahead != delimiter) {
1126 return false;
1127 }
1128 advance(s, lexer);
1129 } else {
1130 // unquoted attribute value
1131 bool had_one = false;
1132 while (lexer->lookahead != ' ' &&
1133 lexer->lookahead != '\t' &&
1134 lexer->lookahead != '"' &&
1135 lexer->lookahead != '\'' &&
1136 lexer->lookahead != '=' &&
1137 lexer->lookahead != '<' &&
1138 lexer->lookahead != '>' &&
1139 lexer->lookahead != '`' &&
1140 lexer->lookahead != '\n' &&
1141 lexer->lookahead != '\r' && !lexer->eof(lexer)) {
1142 advance(s, lexer);
1143 had_one = true;
1144 }
1145 if (!had_one) {
1146 return false;
1147 }
1148 }
1149 }
1150 }
1151 } else {
1152 while (lexer->lookahead == ' ' || lexer->lookahead == '\t') {
1153 advance(s, lexer);
1154 }
1155 }
1156 if (lexer->lookahead != '>') {
1157 return false;
1158 }
1159 advance(s, lexer);
1160 }
1161 while (lexer->lookahead == ' ' || lexer->lookahead == '\t') {
1162 advance(s, lexer);
1163 }
1164 if (lexer->lookahead == '\r' || lexer->lookahead == '\n') {
1165 lexer->result_symbol = HTML_BLOCK_7_START;
1166 if (!s->simulate)
1167 push_block(s, ANONYMOUS);
1168 return true;
1169 }
1170 return false;
1171}
1172
1173static bool parse_pipe_table(Scanner *s, TSLexer *lexer,
1174 const bool *valid_symbols) {
1175
1176 // unused
1177 (void)(valid_symbols);
1178
1179 // PIPE_TABLE_START is zero width
1180 mark_end(s, lexer);
1181 // count number of cells
1182 size_t cell_count = 0;
1183 // also remember if we see starting and ending pipes, as empty headers have
1184 // to have both
1185 bool starting_pipe = false;
1186 bool ending_pipe = false;
1187 bool empty = true;
1188 if (lexer->lookahead == '|') {
1189 starting_pipe = true;
1190 advance(s, lexer);
1191 }
1192 while (lexer->lookahead != '\r' && lexer->lookahead != '\n' &&
1193 !lexer->eof(lexer)) {
1194 if (lexer->lookahead == '|') {
1195 cell_count++;
1196 ending_pipe = true;
1197 advance(s, lexer);
1198 } else {
1199 if (lexer->lookahead != ' ' && lexer->lookahead != '\t') {
1200 ending_pipe = false;
1201 }
1202 if (lexer->lookahead == '\\') {
1203 advance(s, lexer);
1204 if (is_punctuation((char)lexer->lookahead)) {
1205 advance(s, lexer);
1206 }
1207 } else {
1208 advance(s, lexer);
1209 }
1210 }
1211 }
1212 if (empty && cell_count == 0 && !(starting_pipe && ending_pipe)) {
1213 return false;
1214 }
1215 if (!ending_pipe) {
1216 cell_count++;
1217 }
1218
1219 // check the following line for a delimiter row
1220 // parse a newline
1221 if (lexer->lookahead == '\n') {
1222 advance(s, lexer);
1223 } else if (lexer->lookahead == '\r') {
1224 advance(s, lexer);
1225 if (lexer->lookahead == '\n') {
1226 advance(s, lexer);
1227 }
1228 } else {
1229 return false;
1230 }
1231 s->indentation = 0;
1232 s->column = 0;
1233 for (;;) {
1234 if (lexer->lookahead == ' ' || lexer->lookahead == '\t') {
1235 s->indentation += advance(s, lexer);
1236 } else {
1237 break;
1238 }
1239 }
1240 s->simulate = true;
1241 uint8_t matched_temp = 0;
1242 while (matched_temp < (uint8_t)s->open_blocks.size) {
1243 if (match(s, lexer, s->open_blocks.items[matched_temp])) {
1244 matched_temp++;
1245 } else {
1246 return false;
1247 }
1248 }
1249
1250 // check if delimiter row has the same number of cells and at least one pipe
1251 size_t delimiter_cell_count = 0;
1252 if (lexer->lookahead == '|') {
1253 advance(s, lexer);
1254 }
1255 for (;;) {
1256 while (lexer->lookahead == ' ' || lexer->lookahead == '\t') {
1257 advance(s, lexer);
1258 }
1259 if (lexer->lookahead == '|') {
1260 delimiter_cell_count++;
1261 advance(s, lexer);
1262 continue;
1263 }
1264 if (lexer->lookahead == ':') {
1265 advance(s, lexer);
1266 if (lexer->lookahead != '-') {
1267 return false;
1268 }
1269 }
1270 bool had_one_minus = false;
1271 while (lexer->lookahead == '-') {
1272 had_one_minus = true;
1273 advance(s, lexer);
1274 }
1275 if (had_one_minus) {
1276 delimiter_cell_count++;
1277 }
1278 if (lexer->lookahead == ':') {
1279 if (!had_one_minus) {
1280 return false;
1281 }
1282 advance(s, lexer);
1283 }
1284 while (lexer->lookahead == ' ' || lexer->lookahead == '\t') {
1285 advance(s, lexer);
1286 }
1287 if (lexer->lookahead == '|') {
1288 if (!had_one_minus) {
1289 delimiter_cell_count++;
1290 }
1291 advance(s, lexer);
1292 continue;
1293 }
1294 if (lexer->lookahead != '\r' && lexer->lookahead != '\n') {
1295 return false;
1296 } else {
1297 break;
1298 }
1299 }
1300 // if the cell counts are not equal then this is not a table
1301 if (cell_count != delimiter_cell_count) {
1302 return false;
1303 }
1304
1305 lexer->result_symbol = PIPE_TABLE_START;
1306 return true;
1307}
1308
1309static bool scan(Scanner *s, TSLexer *lexer, const bool *valid_symbols) {
1310 // A normal tree-sitter rule decided that the current branch is invalid and
1311 // now "requests" an error to stop the branch
1312 if (valid_symbols[TRIGGER_ERROR]) {
1313 return error(lexer);
1314 }
1315
1316 // Close the inner most block after the next line break as requested. See
1317 // `$._close_block` in grammar.js
1318 if (valid_symbols[CLOSE_BLOCK]) {
1319 s->state |= STATE_CLOSE_BLOCK;
1320 lexer->result_symbol = CLOSE_BLOCK;
1321 return true;
1322 }
1323
1324 // if we are at the end of the file and there are still open blocks close
1325 // them all
1326 if (lexer->eof(lexer)) {
1327 if (valid_symbols[TOKEN_EOF]) {
1328 lexer->result_symbol = TOKEN_EOF;
1329 return true;
1330 }
1331 if (s->open_blocks.size > 0) {
1332 lexer->result_symbol = BLOCK_CLOSE;
1333 if (!s->simulate)
1334 pop_block(s);
1335 return true;
1336 }
1337 return false;
1338 }
1339
1340 if (!(s->state & STATE_MATCHING)) {
1341 // Parse any preceeding whitespace and remember its length. This makes a
1342 // lot of parsing quite a bit easier.
1343 for (;;) {
1344 if (lexer->lookahead == ' ' || lexer->lookahead == '\t') {
1345 s->indentation += advance(s, lexer);
1346 } else {
1347 break;
1348 }
1349 }
1350 // We are not matching. This is where the parsing logic for most
1351 // "normal" token is. Most importantly parsing logic for the start of
1352 // new blocks.
1353 if (valid_symbols[INDENTED_CHUNK_START] &&
1354 !valid_symbols[NO_INDENTED_CHUNK]) {
1355 if (s->indentation >= 4 && lexer->lookahead != '\n' &&
1356 lexer->lookahead != '\r') {
1357 lexer->result_symbol = INDENTED_CHUNK_START;
1358 if (!s->simulate)
1359 push_block(s, INDENTED_CODE_BLOCK);
1360 s->indentation -= 4;
1361 return true;
1362 }
1363 }
1364 // Decide which tokens to consider based on the first non-whitespace
1365 // character
1366 switch (lexer->lookahead) {
1367 case '\r':
1368 case '\n':
1369 if (valid_symbols[BLANK_LINE_START]) {
1370 // A blank line token is actually just 0 width, so do not
1371 // consume the characters
1372 lexer->result_symbol = BLANK_LINE_START;
1373 return true;
1374 }
1375 break;
1376 case '`':
1377 // A backtick could mark the beginning or ending of a fenced
1378 // code block.
1379 return parse_fenced_code_block(s, '`', lexer, valid_symbols);
1380 case '~':
1381 // A tilde could mark the beginning or ending of a fenced code
1382 // block.
1383 return parse_fenced_code_block(s, '~', lexer, valid_symbols);
1384 case '*':
1385 // A star could either mark a list item or a thematic break.
1386 // This code is similar to the code for '_' and '+'.
1387 return parse_star(s, lexer, valid_symbols);
1388 case '_':
1389 return parse_thematic_break_underscore(s, lexer, valid_symbols);
1390 case '>':
1391 // A '>' could mark the beginning of a block quote
1392 return parse_block_quote(s, lexer, valid_symbols);
1393 case '#':
1394 // A '#' could mark a atx heading
1395 return parse_atx_heading(s, lexer, valid_symbols);
1396 case '=':
1397 // A '=' could mark a setext underline
1398 return parse_setext_underline(s, lexer, valid_symbols);
1399 case '+':
1400 // A '+' could be a list marker
1401 return parse_plus(s, lexer, valid_symbols);
1402 case '0':
1403 case '1':
1404 case '2':
1405 case '3':
1406 case '4':
1407 case '5':
1408 case '6':
1409 case '7':
1410 case '8':
1411 case '9':
1412 // A number could be a list marker (if followed by a dot or a
1413 // parenthesis)
1414 return parse_ordered_list_marker(s, lexer, valid_symbols);
1415 case '-':
1416 // A minus could mark a list marker, a thematic break or a
1417 // setext underline
1418 return parse_minus(s, lexer, valid_symbols);
1419 case '<':
1420 // A < could mark the beginning of a html block
1421 return parse_html_block(s, lexer, valid_symbols);
1422 }
1423 if (lexer->lookahead != '\r' && lexer->lookahead != '\n' &&
1424 valid_symbols[PIPE_TABLE_START]) {
1425 return parse_pipe_table(s, lexer, valid_symbols);
1426 }
1427 } else { // we are in the state of trying to match all currently open blocks
1428 bool partial_success = false;
1429 while (s->matched < (uint8_t)s->open_blocks.size) {
1430 if (s->matched == (uint8_t)s->open_blocks.size - 1 &&
1431 (s->state & STATE_CLOSE_BLOCK)) {
1432 if (!partial_success)
1433 s->state &= ~STATE_CLOSE_BLOCK;
1434 break;
1435 }
1436 if (match(s, lexer, s->open_blocks.items[s->matched])) {
1437 partial_success = true;
1438 s->matched++;
1439 } else {
1440 if (s->state & STATE_WAS_SOFT_LINE_BREAK) {
1441 s->state &= (~STATE_MATCHING);
1442 }
1443 break;
1444 }
1445 }
1446 if (partial_success) {
1447 if (s->matched == s->open_blocks.size) {
1448 s->state &= (~STATE_MATCHING);
1449 }
1450 lexer->result_symbol = BLOCK_CONTINUATION;
1451 return true;
1452 }
1453
1454 if (!(s->state & STATE_WAS_SOFT_LINE_BREAK)) {
1455 lexer->result_symbol = BLOCK_CLOSE;
1456 pop_block(s);
1457 if (s->matched == s->open_blocks.size) {
1458 s->state &= (~STATE_MATCHING);
1459 }
1460 return true;
1461 }
1462 }
1463
1464 // The parser just encountered a line break. Setup the state correspondingly
1465 if ((valid_symbols[LINE_ENDING] || valid_symbols[SOFT_LINE_ENDING] ||
1466 valid_symbols[PIPE_TABLE_LINE_ENDING]) &&
1467 (lexer->lookahead == '\n' || lexer->lookahead == '\r')) {
1468 if (lexer->lookahead == '\r') {
1469 advance(s, lexer);
1470 if (lexer->lookahead == '\n') {
1471 advance(s, lexer);
1472 }
1473 } else {
1474 advance(s, lexer);
1475 }
1476 s->indentation = 0;
1477 s->column = 0;
1478 if (!(s->state & STATE_CLOSE_BLOCK) &&
1479 (valid_symbols[SOFT_LINE_ENDING] ||
1480 valid_symbols[PIPE_TABLE_LINE_ENDING])) {
1481 lexer->mark_end(lexer);
1482 for (;;) {
1483 if (lexer->lookahead == ' ' || lexer->lookahead == '\t') {
1484 s->indentation += advance(s, lexer);
1485 } else {
1486 break;
1487 }
1488 }
1489 s->simulate = true;
1490 uint8_t matched_temp = s->matched;
1491 s->matched = 0;
1492 bool one_will_be_matched = false;
1493 while (s->matched < (uint8_t)s->open_blocks.size) {
1494 if (match(s, lexer, s->open_blocks.items[s->matched])) {
1495 s->matched++;
1496 one_will_be_matched = true;
1497 } else {
1498 break;
1499 }
1500 }
1501 bool all_will_be_matched = s->matched == s->open_blocks.size;
1502 if (!lexer->eof(lexer) &&
1503 !scan(s, lexer, paragraph_interrupt_symbols)) {
1504 s->matched = matched_temp;
1505 // If the last line break ended a paragraph and no new block
1506 // opened, the last line break should have been a soft line
1507 // break Reset the counter for matched blocks
1508 s->matched = 0;
1509 s->indentation = 0;
1510 s->column = 0;
1511 // If there is at least one open block, we should be in the
1512 // matching state. Also set the matching flag if a
1513 // `$._soft_line_break_marker` can be emitted so it does get
1514 // emitted.
1515 if (one_will_be_matched) {
1516 s->state |= STATE_MATCHING;
1517 } else {
1518 s->state &= (~STATE_MATCHING);
1519 }
1520 if (valid_symbols[PIPE_TABLE_LINE_ENDING]) {
1521 if (all_will_be_matched) {
1522 lexer->result_symbol = PIPE_TABLE_LINE_ENDING;
1523 return true;
1524 }
1525 } else {
1526 lexer->result_symbol = SOFT_LINE_ENDING;
1527 // reset some state variables
1528 s->state |= STATE_WAS_SOFT_LINE_BREAK;
1529 return true;
1530 }
1531 } else {
1532 s->matched = matched_temp;
1533 }
1534 s->indentation = 0;
1535 s->column = 0;
1536 }
1537 if (valid_symbols[LINE_ENDING]) {
1538 // If the last line break ended a paragraph and no new block opened,
1539 // the last line break should have been a soft line break Reset the
1540 // counter for matched blocks
1541 s->matched = 0;
1542 // If there is at least one open block, we should be in the matching
1543 // state. Also set the matching flag if a
1544 // `$._soft_line_break_marker` can be emitted so it does get
1545 // emitted.
1546 if (s->open_blocks.size > 0) {
1547 s->state |= STATE_MATCHING;
1548 } else {
1549 s->state &= (~STATE_MATCHING);
1550 }
1551 // reset some state variables
1552 s->state &= (~STATE_WAS_SOFT_LINE_BREAK);
1553 lexer->result_symbol = LINE_ENDING;
1554 return true;
1555 }
1556 }
1557 return false;
1558}
1559
1560void *tree_sitter_markdown_external_scanner_create(void) {
1561 Scanner *s = (Scanner *)malloc(sizeof(Scanner));
1562 s->open_blocks.items = (Block *)calloc(1, sizeof(Block));
1563#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)
1564 _Static_assert(ATX_H6_MARKER == ATX_H1_MARKER + 5, "");
1565#else
1566 assert(ATX_H6_MARKER == ATX_H1_MARKER + 5);
1567#endif
1568 deserialize(s, NULL, 0);
1569
1570 return s;
1571}
1572
1573bool tree_sitter_markdown_external_scanner_scan(void *payload, TSLexer *lexer,
1574 const bool *valid_symbols) {
1575 Scanner *scanner = (Scanner *)payload;
1576 scanner->simulate = false;
1577 return scan(scanner, lexer, valid_symbols);
1578}
1579
1580unsigned tree_sitter_markdown_external_scanner_serialize(void *payload,
1581 char *buffer) {
1582 Scanner *scanner = (Scanner *)payload;
1583 return serialize(scanner, buffer);
1584}
1585
1586void tree_sitter_markdown_external_scanner_deserialize(void *payload,
1587 char *buffer,
1588 unsigned length) {
1589 Scanner *scanner = (Scanner *)payload;
1590 deserialize(scanner, buffer, length);
1591}
1592
1593void tree_sitter_markdown_external_scanner_destroy(void *payload) {
1594 Scanner *scanner = (Scanner *)payload;
1595 free(scanner->open_blocks.items);
1596 free(scanner);
1597}