diff options
| author | Mitja Felicijan <mitja.felicijan@gmail.com> | 2023-11-07 17:00:42 +0100 |
|---|---|---|
| committer | Mitja Felicijan <mitja.felicijan@gmail.com> | 2023-11-07 17:00:42 +0100 |
| commit | af9983cd6f0535b1088ac130089897bbb86f81e5 (patch) | |
| tree | e980b665395728d3f267b43f42de3eadc88f84a2 /vendor/tree-sitter-python/src/scanner.c | |
| parent | ac00e009ae5816dcea32ed131d72e812ec09c4b9 (diff) | |
| download | crep-af9983cd6f0535b1088ac130089897bbb86f81e5.tar.gz | |
Added tree-sitter-python vendor library
Diffstat (limited to 'vendor/tree-sitter-python/src/scanner.c')
| -rw-r--r-- | vendor/tree-sitter-python/src/scanner.c | 528 |
1 files changed, 528 insertions, 0 deletions
diff --git a/vendor/tree-sitter-python/src/scanner.c b/vendor/tree-sitter-python/src/scanner.c new file mode 100644 index 0000000..44058d9 --- /dev/null +++ b/vendor/tree-sitter-python/src/scanner.c @@ -0,0 +1,528 @@ +#include "tree_sitter/parser.h" + +#include <assert.h> +#include <stdint.h> +#include <stdio.h> +#include <string.h> + +#define MAX(a, b) ((a) > (b) ? (a) : (b)) + +#define VEC_RESIZE(vec, _cap) \ + void *tmp = realloc((vec).data, (_cap) * sizeof((vec).data[0])); \ + assert(tmp != NULL); \ + (vec).data = tmp; \ + (vec).cap = (_cap); + +#define VEC_GROW(vec, _cap) \ + if ((vec).cap < (_cap)) { \ + VEC_RESIZE((vec), (_cap)); \ + } + +#define VEC_PUSH(vec, el) \ + if ((vec).cap == (vec).len) { \ + VEC_RESIZE((vec), MAX(16, (vec).len * 2)); \ + } \ + (vec).data[(vec).len++] = (el); + +#define VEC_POP(vec) (vec).len--; + +#define VEC_NEW \ + { .len = 0, .cap = 0, .data = NULL } + +#define VEC_BACK(vec) ((vec).data[(vec).len - 1]) + +#define VEC_FREE(vec) \ + { \ + if ((vec).data != NULL) \ + free((vec).data); \ + } + +#define VEC_CLEAR(vec) (vec).len = 0; + +enum TokenType { + NEWLINE, + INDENT, + DEDENT, + STRING_START, + STRING_CONTENT, + ESCAPE_INTERPOLATION, + STRING_END, + COMMENT, + CLOSE_PAREN, + CLOSE_BRACKET, + CLOSE_BRACE, + EXCEPT, +}; + +typedef enum { + SingleQuote = 1 << 0, + DoubleQuote = 1 << 1, + BackQuote = 1 << 2, + Raw = 1 << 3, + Format = 1 << 4, + Triple = 1 << 5, + Bytes = 1 << 6, +} Flags; + +typedef struct { + char flags; +} Delimiter; + +static inline Delimiter new_delimiter() { return (Delimiter){0}; } + +static inline bool is_format(Delimiter *delimiter) { + return delimiter->flags & Format; +} + +static inline bool is_raw(Delimiter *delimiter) { + return delimiter->flags & Raw; +} + +static inline bool is_triple(Delimiter *delimiter) { + return delimiter->flags & Triple; +} + +static inline bool is_bytes(Delimiter *delimiter) { + return delimiter->flags & Bytes; +} + +static inline int32_t end_character(Delimiter *delimiter) { + if (delimiter->flags & SingleQuote) { + return '\''; + } + if (delimiter->flags & DoubleQuote) { + return '"'; + } + if (delimiter->flags & BackQuote) { + return '`'; + } + return 0; +} + +static inline void set_format(Delimiter *delimiter) { + delimiter->flags |= Format; +} + +static inline void set_raw(Delimiter *delimiter) { delimiter->flags |= Raw; } + +static inline void set_triple(Delimiter *delimiter) { + delimiter->flags |= Triple; +} + +static inline void set_bytes(Delimiter *delimiter) { + delimiter->flags |= Bytes; +} + +static inline void set_end_character(Delimiter *delimiter, int32_t character) { + switch (character) { + case '\'': + delimiter->flags |= SingleQuote; + break; + case '"': + delimiter->flags |= DoubleQuote; + break; + case '`': + delimiter->flags |= BackQuote; + break; + default: + assert(false); + } +} + +typedef struct { + uint32_t len; + uint32_t cap; + uint16_t *data; +} indent_vec; + +static indent_vec indent_vec_new() { + indent_vec vec = VEC_NEW; + vec.data = calloc(1, sizeof(uint16_t)); + vec.cap = 1; + return vec; +} + +typedef struct { + uint32_t len; + uint32_t cap; + Delimiter *data; +} delimiter_vec; + +static delimiter_vec delimiter_vec_new() { + delimiter_vec vec = VEC_NEW; + vec.data = calloc(1, sizeof(Delimiter)); + vec.cap = 1; + return vec; +} + +typedef struct { + indent_vec indents; + delimiter_vec delimiters; + bool inside_f_string; +} Scanner; + +static inline void advance(TSLexer *lexer) { lexer->advance(lexer, false); } + +static inline void skip(TSLexer *lexer) { lexer->advance(lexer, true); } + +bool tree_sitter_python_external_scanner_scan(void *payload, TSLexer *lexer, + const bool *valid_symbols) { + Scanner *scanner = (Scanner *)payload; + + bool error_recovery_mode = + valid_symbols[STRING_CONTENT] && valid_symbols[INDENT]; + bool within_brackets = valid_symbols[CLOSE_BRACE] || + valid_symbols[CLOSE_PAREN] || + valid_symbols[CLOSE_BRACKET]; + + bool advanced_once = false; + if (valid_symbols[ESCAPE_INTERPOLATION] && scanner->delimiters.len > 0 && + (lexer->lookahead == '{' || lexer->lookahead == '}') && + !error_recovery_mode) { + Delimiter delimiter = VEC_BACK(scanner->delimiters); + if (is_format(&delimiter)) { + lexer->mark_end(lexer); + bool is_left_brace = lexer->lookahead == '{'; + advance(lexer); + advanced_once = true; + if ((lexer->lookahead == '{' && is_left_brace) || + (lexer->lookahead == '}' && !is_left_brace)) { + advance(lexer); + lexer->mark_end(lexer); + lexer->result_symbol = ESCAPE_INTERPOLATION; + return true; + } + return false; + } + } + + if (valid_symbols[STRING_CONTENT] && scanner->delimiters.len > 0 && + !error_recovery_mode) { + Delimiter delimiter = VEC_BACK(scanner->delimiters); + int32_t end_char = end_character(&delimiter); + bool has_content = advanced_once; + while (lexer->lookahead) { + if ((advanced_once || lexer->lookahead == '{' || + lexer->lookahead == '}') && + is_format(&delimiter)) { + lexer->mark_end(lexer); + lexer->result_symbol = STRING_CONTENT; + return has_content; + } + if (lexer->lookahead == '\\') { + if (is_raw(&delimiter)) { + // Step over the backslash. + advance(lexer); + // Step over any escaped quotes. + if (lexer->lookahead == end_character(&delimiter) || + lexer->lookahead == '\\') { + advance(lexer); + } + // Step over newlines + if (lexer->lookahead == '\r') { + advance(lexer); + if (lexer->lookahead == '\n') { + advance(lexer); + } + } else if (lexer->lookahead == '\n') { + advance(lexer); + } + continue; + } + if (is_bytes(&delimiter)) { + lexer->mark_end(lexer); + advance(lexer); + if (lexer->lookahead == 'N' || lexer->lookahead == 'u' || + lexer->lookahead == 'U') { + // In bytes string, \N{...}, \uXXXX and \UXXXXXXXX are + // not escape sequences + // https://docs.python.org/3/reference/lexical_analysis.html#string-and-bytes-literals + advance(lexer); + } else { + lexer->result_symbol = STRING_CONTENT; + return has_content; + } + } else { + lexer->mark_end(lexer); + lexer->result_symbol = STRING_CONTENT; + return has_content; + } + } else if (lexer->lookahead == end_char) { + if (is_triple(&delimiter)) { + lexer->mark_end(lexer); + advance(lexer); + if (lexer->lookahead == end_char) { + advance(lexer); + if (lexer->lookahead == end_char) { + if (has_content) { + lexer->result_symbol = STRING_CONTENT; + } else { + advance(lexer); + lexer->mark_end(lexer); + VEC_POP(scanner->delimiters); + lexer->result_symbol = STRING_END; + scanner->inside_f_string = false; + } + return true; + } + lexer->mark_end(lexer); + lexer->result_symbol = STRING_CONTENT; + return true; + } + lexer->mark_end(lexer); + lexer->result_symbol = STRING_CONTENT; + return true; + } + if (has_content) { + lexer->result_symbol = STRING_CONTENT; + } else { + advance(lexer); + VEC_POP(scanner->delimiters); + lexer->result_symbol = STRING_END; + scanner->inside_f_string = false; + } + lexer->mark_end(lexer); + return true; + + } else if (lexer->lookahead == '\n' && has_content && + !is_triple(&delimiter)) { + return false; + } + advance(lexer); + has_content = true; + } + } + + lexer->mark_end(lexer); + + bool found_end_of_line = false; + uint32_t indent_length = 0; + int32_t first_comment_indent_length = -1; + for (;;) { + if (lexer->lookahead == '\n') { + found_end_of_line = true; + indent_length = 0; + skip(lexer); + } else if (lexer->lookahead == ' ') { + indent_length++; + skip(lexer); + } else if (lexer->lookahead == '\r' || lexer->lookahead == '\f') { + indent_length = 0; + skip(lexer); + } else if (lexer->lookahead == '\t') { + indent_length += 8; + skip(lexer); + } else if (lexer->lookahead == '#' && + (valid_symbols[INDENT] || valid_symbols[DEDENT] || + valid_symbols[NEWLINE] || valid_symbols[EXCEPT])) { + // If we haven't found an EOL yet, + // then this is a comment after an expression: + // foo = bar # comment + // Just return, since we don't want to generate an indent/dedent + // token. + if (!found_end_of_line) { + return false; + } + if (first_comment_indent_length == -1) { + first_comment_indent_length = (int32_t)indent_length; + } + while (lexer->lookahead && lexer->lookahead != '\n') { + skip(lexer); + } + skip(lexer); + indent_length = 0; + } else if (lexer->lookahead == '\\') { + skip(lexer); + if (lexer->lookahead == '\r') { + skip(lexer); + } + if (lexer->lookahead == '\n' || lexer->eof(lexer)) { + skip(lexer); + } else { + return false; + } + } else if (lexer->eof(lexer)) { + indent_length = 0; + found_end_of_line = true; + break; + } else { + break; + } + } + + if (found_end_of_line) { + if (scanner->indents.len > 0) { + uint16_t current_indent_length = VEC_BACK(scanner->indents); + + if (valid_symbols[INDENT] && + indent_length > current_indent_length) { + VEC_PUSH(scanner->indents, indent_length); + lexer->result_symbol = INDENT; + return true; + } + + bool next_tok_is_string_start = lexer->lookahead == '\"' || + lexer->lookahead == '\'' || + lexer->lookahead == '`'; + + if ((valid_symbols[DEDENT] || + (!valid_symbols[NEWLINE] && + !(valid_symbols[STRING_START] && next_tok_is_string_start) && + !within_brackets)) && + indent_length < current_indent_length && + !scanner->inside_f_string && + + // Wait to create a dedent token until we've consumed any + // comments + // whose indentation matches the current block. + first_comment_indent_length < (int32_t)current_indent_length) { + VEC_POP(scanner->indents); + lexer->result_symbol = DEDENT; + return true; + } + } + + if (valid_symbols[NEWLINE] && !error_recovery_mode) { + lexer->result_symbol = NEWLINE; + return true; + } + } + + if (first_comment_indent_length == -1 && valid_symbols[STRING_START]) { + Delimiter delimiter = new_delimiter(); + + bool has_flags = false; + while (lexer->lookahead) { + if (lexer->lookahead == 'f' || lexer->lookahead == 'F') { + set_format(&delimiter); + } else if (lexer->lookahead == 'r' || lexer->lookahead == 'R') { + set_raw(&delimiter); + } else if (lexer->lookahead == 'b' || lexer->lookahead == 'B') { + set_bytes(&delimiter); + } else if (lexer->lookahead != 'u' && lexer->lookahead != 'U') { + break; + } + has_flags = true; + advance(lexer); + } + + if (lexer->lookahead == '`') { + set_end_character(&delimiter, '`'); + advance(lexer); + lexer->mark_end(lexer); + } else if (lexer->lookahead == '\'') { + set_end_character(&delimiter, '\''); + advance(lexer); + lexer->mark_end(lexer); + if (lexer->lookahead == '\'') { + advance(lexer); + if (lexer->lookahead == '\'') { + advance(lexer); + lexer->mark_end(lexer); + set_triple(&delimiter); + } + } + } else if (lexer->lookahead == '"') { + set_end_character(&delimiter, '"'); + advance(lexer); + lexer->mark_end(lexer); + if (lexer->lookahead == '"') { + advance(lexer); + if (lexer->lookahead == '"') { + advance(lexer); + lexer->mark_end(lexer); + set_triple(&delimiter); + } + } + } + + if (end_character(&delimiter)) { + VEC_PUSH(scanner->delimiters, delimiter); + lexer->result_symbol = STRING_START; + scanner->inside_f_string = is_format(&delimiter); + return true; + } + if (has_flags) { + return false; + } + } + + return false; +} + +unsigned tree_sitter_python_external_scanner_serialize(void *payload, + char *buffer) { + Scanner *scanner = (Scanner *)payload; + + size_t size = 0; + + buffer[size++] = (char)scanner->inside_f_string; + + size_t delimiter_count = scanner->delimiters.len; + if (delimiter_count > UINT8_MAX) { + delimiter_count = UINT8_MAX; + } + buffer[size++] = (char)delimiter_count; + + if (delimiter_count > 0) { + memcpy(&buffer[size], scanner->delimiters.data, delimiter_count); + } + size += delimiter_count; + + int iter = 1; + for (; iter < scanner->indents.len && + size < TREE_SITTER_SERIALIZATION_BUFFER_SIZE; + ++iter) { + buffer[size++] = (char)scanner->indents.data[iter]; + } + + return size; +} + +void tree_sitter_python_external_scanner_deserialize(void *payload, + const char *buffer, + unsigned length) { + Scanner *scanner = (Scanner *)payload; + + VEC_CLEAR(scanner->delimiters); + VEC_CLEAR(scanner->indents); + VEC_PUSH(scanner->indents, 0); + + if (length > 0) { + size_t size = 0; + + scanner->inside_f_string = (bool)buffer[size++]; + + size_t delimiter_count = (uint8_t)buffer[size++]; + if (delimiter_count > 0) { + VEC_GROW(scanner->delimiters, delimiter_count); + scanner->delimiters.len = delimiter_count; + memcpy(scanner->delimiters.data, &buffer[size], delimiter_count); + size += delimiter_count; + } + + for (; size < length; size++) { + VEC_PUSH(scanner->indents, (unsigned char)buffer[size]); + } + } +} + +void *tree_sitter_python_external_scanner_create() { +#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) + _Static_assert(sizeof(Delimiter) == sizeof(char), ""); +#else + assert(sizeof(Delimiter) == sizeof(char)); +#endif + Scanner *scanner = calloc(1, sizeof(Scanner)); + scanner->indents = indent_vec_new(); + scanner->delimiters = delimiter_vec_new(); + tree_sitter_python_external_scanner_deserialize(scanner, NULL, 0); + return scanner; +} + +void tree_sitter_python_external_scanner_destroy(void *payload) { + Scanner *scanner = (Scanner *)payload; + VEC_FREE(scanner->indents); + VEC_FREE(scanner->delimiters); + free(scanner); +} |
