diff options
Diffstat (limited to 'vendor/tree-sitter-php/src/common/scanner.h')
| -rw-r--r-- | vendor/tree-sitter-php/src/common/scanner.h | 543 |
1 files changed, 543 insertions, 0 deletions
diff --git a/vendor/tree-sitter-php/src/common/scanner.h b/vendor/tree-sitter-php/src/common/scanner.h new file mode 100644 index 0000000..e16a21e --- /dev/null +++ b/vendor/tree-sitter-php/src/common/scanner.h @@ -0,0 +1,543 @@ +#include "tree_sitter/array.h" +#include "tree_sitter/parser.h" + +#include <string.h> +#include <wchar.h> +#include <wctype.h> + +enum TokenType { + AUTOMATIC_SEMICOLON, + ENCAPSED_STRING_CHARS, + ENCAPSED_STRING_CHARS_AFTER_VARIABLE, + EXECUTION_STRING_CHARS, + EXECUTION_STRING_CHARS_AFTER_VARIABLE, + ENCAPSED_STRING_CHARS_HEREDOC, + ENCAPSED_STRING_CHARS_AFTER_VARIABLE_HEREDOC, + EOF_TOKEN, + HEREDOC_START, + HEREDOC_END, + NOWDOC_STRING, + SENTINEL_ERROR, // Unused token used to indicate error recovery mode +}; + +typedef Array(int32_t) String; + +static inline bool string_eq(String *self, String *other) { + if (self->size != other->size) { + return false; + } + if (self->size == 0) { + return self->size == other->size; + } + return memcmp(self->contents, other->contents, self->size * sizeof(self->contents[0])) == 0; +} + +typedef struct { + bool end_word_indentation_allowed; + String word; +} Heredoc; + +#define heredoc_new() \ + { \ + .end_word_indentation_allowed = false, \ + .word = array_new(), \ + }; + +typedef struct { + bool has_leading_whitespace; + Array(Heredoc) heredocs; +} Scanner; + +typedef enum { Error, End } ScanContentResult; + +static inline void reset_heredoc(Heredoc *heredoc) { + array_delete(&heredoc->word); + heredoc->end_word_indentation_allowed = false; +} + +static inline void advance(TSLexer *lexer) { lexer->advance(lexer, false); } + +static inline void skip(TSLexer *lexer) { lexer->advance(lexer, true); } + +static unsigned serialize(Scanner *scanner, char *buffer) { + unsigned size = 0; + + buffer[size++] = (char)scanner->heredocs.size; + for (unsigned j = 0; j < scanner->heredocs.size; j++) { + Heredoc *heredoc = &scanner->heredocs.contents[j]; + unsigned word_size = heredoc->word.size * sizeof(heredoc->word.contents[0]); + if (size + 5 + word_size >= TREE_SITTER_SERIALIZATION_BUFFER_SIZE) { + return 0; + } + buffer[size++] = (char)heredoc->end_word_indentation_allowed; + memcpy(&buffer[size], &heredoc->word.size, sizeof(uint32_t)); + size += sizeof(uint32_t); + if (heredoc->word.size > 0) { + memcpy(&buffer[size], heredoc->word.contents, word_size); + size += word_size; + } + } + + return size; +} + +static void deserialize(Scanner *scanner, const char *buffer, unsigned length) { + unsigned size = 0; + scanner->has_leading_whitespace = false; + + for (uint32_t i = 0; i < scanner->heredocs.size; i++) { + reset_heredoc(array_get(&scanner->heredocs, i)); + } + + if (length == 0) { + return; + } + + uint8_t open_heredoc_count = buffer[size++]; + for (unsigned i = 0; i < open_heredoc_count; i++) { + Heredoc *heredoc = NULL; + if (i < scanner->heredocs.size) { + heredoc = array_get(&scanner->heredocs, i); + } else { + Heredoc new_heredoc = heredoc_new(); + array_push(&scanner->heredocs, new_heredoc); + heredoc = array_back(&scanner->heredocs); + } + + heredoc->end_word_indentation_allowed = buffer[size++]; + memcpy(&heredoc->word.size, &buffer[size], sizeof(uint32_t)); + size += sizeof(uint32_t); + unsigned word_size = heredoc->word.size * sizeof(heredoc->word.contents[0]); + if (word_size > 0) { + array_reserve(&heredoc->word, heredoc->word.size); + memcpy(heredoc->word.contents, &buffer[size], word_size); + size += word_size; + } + } + + assert(size == length); +} + +static inline bool scan_whitespace(TSLexer *lexer) { + for (;;) { + while (iswspace(lexer->lookahead)) { + advance(lexer); + } + + if (lexer->lookahead == '/') { + advance(lexer); + + if (lexer->lookahead == '/') { + advance(lexer); + while (lexer->lookahead != 0 && lexer->lookahead != '\n') { + advance(lexer); + } + } else { + return false; + } + } else { + return true; + } + } +} + +static inline bool is_valid_name_char(TSLexer *lexer) { + return iswalnum(lexer->lookahead) || lexer->lookahead == '_' || lexer->lookahead >= 0x80; +} + +static inline bool is_escapable_sequence(TSLexer *lexer) { + // Note: remember to also update the escape_sequence rule in the + // main grammar whenever changing this method + int32_t letter = lexer->lookahead; + + if (letter == 'n' || letter == 'r' || letter == 't' || letter == 'v' || letter == 'e' || letter == 'f' || + letter == '\\' || letter == '$' || letter == '"') { + return true; + } + + // Hex + if (letter == 'x') { + advance(lexer); + return iswxdigit(lexer->lookahead); + } + + // Unicode + if (letter == 'u') { + return true; // We handle the case where this is not really an escape + // sequence in grammar.js - this is needed to support the + // edge case "\u{$a}" in which case "\u" is to be + // interpreted as characters and {$a} as a variable + } + + // Octal + return iswdigit(lexer->lookahead) && lexer->lookahead >= '0' && lexer->lookahead <= '7'; +} + +static String scan_heredoc_word(TSLexer *lexer) { + String result = (String)array_new(); + + while (is_valid_name_char(lexer)) { + array_push(&result, lexer->lookahead); + advance(lexer); + } + + return result; +} + +static inline bool scan_nowdoc_string(Scanner *scanner, TSLexer *lexer) { + bool has_consumed_content = false; + if (scanner->heredocs.size == 0) { + return false; + } + + // While PHP requires the nowdoc end tag to be the very first on a new line, + // there may be an arbitrary amount of whitespace before the closing token + while (iswspace(lexer->lookahead)) { + advance(lexer); + has_consumed_content = true; + } + + bool end_tag_matched = false; + String heredoc_tag = array_back(&scanner->heredocs)->word; + + for (uint32_t i = 0; i < heredoc_tag.size; i++) { + if (lexer->lookahead != heredoc_tag.contents[i]) { + break; + } + advance(lexer); + has_consumed_content = true; + + end_tag_matched = (i == heredoc_tag.size - 1 && (iswspace(lexer->lookahead) || lexer->lookahead == ';' || + lexer->lookahead == ',' || lexer->lookahead == ')')); + } + + if (end_tag_matched) { + // There may be an arbitrary amount of white space after the end tag + while (iswspace(lexer->lookahead) && lexer->lookahead != '\r' && lexer->lookahead != '\n') { + advance(lexer); + has_consumed_content = true; + } + + // Return to allow the end tag parsing if we've encountered an end tag + // at a valid position + if (lexer->lookahead == ';' || lexer->lookahead == ',' || lexer->lookahead == ')' || lexer->lookahead == '\n' || + lexer->lookahead == '\r') { + // , and ) is needed to support heredoc in function arguments + return false; + } + } + + for (bool has_content = has_consumed_content;; has_content = true) { + lexer->mark_end(lexer); + + switch (lexer->lookahead) { + case '\n': + case '\r': + return has_content; + default: + if (lexer->eof(lexer)) { + return false; + } + advance(lexer); + } + } + + return false; +} + +static bool scan_encapsed_part_string(Scanner *scanner, TSLexer *lexer, bool is_after_variable, bool is_heredoc, + bool is_execution_string) { + bool has_consumed_content = false; + + if (is_heredoc && scanner->heredocs.size > 0) { + // While PHP requires the heredoc end tag to be the very first on a new + // line, there may be an arbitrary amount of whitespace before the + // closing token However, we should not consume \r or \n + while (iswspace(lexer->lookahead) && lexer->lookahead != '\r' && lexer->lookahead != '\n') { + advance(lexer); + has_consumed_content = true; + } + + String heredoc_tag = array_back(&scanner->heredocs)->word; + + bool end_tag_matched = false; + + for (uint32_t i = 0; i < heredoc_tag.size; i++) { + if (lexer->lookahead != heredoc_tag.contents[i]) { + break; + } + has_consumed_content = true; + advance(lexer); + + end_tag_matched = (i == heredoc_tag.size - 1 && (iswspace(lexer->lookahead) || lexer->lookahead == ';' || + lexer->lookahead == ',' || lexer->lookahead == ')')); + } + + if (end_tag_matched) { + // There may be an arbitrary amount of white space after the end tag + // However, we should not consume \r or \n + while (iswspace(lexer->lookahead) && lexer->lookahead != '\r' && lexer->lookahead != '\n') { + advance(lexer); + has_consumed_content = true; + } + + // Return to allow the end tag parsing if we've encountered an end + // tag at a valid position + if (lexer->lookahead == ';' || lexer->lookahead == ',' || lexer->lookahead == ')' || + lexer->lookahead == '\n' || lexer->lookahead == '\r') { + // , and ) is needed to support heredoc in function arguments + return false; + } + } + } + + for (bool has_content = has_consumed_content;; has_content = true) { + lexer->mark_end(lexer); + + switch (lexer->lookahead) { + case '"': + if (!is_heredoc && !is_execution_string) { + return has_content; + } + advance(lexer); + break; + case '`': + if (is_execution_string) { + return has_content; + } + advance(lexer); + break; + case '\n': + case '\r': + if (is_heredoc) { + return has_content; + } + advance(lexer); + break; + case '\\': + advance(lexer); + + // \{ should not be interpreted as an escape sequence, but both + // should be consumed as normal characters + if (lexer->lookahead == '{') { + advance(lexer); + break; + } + + if (is_execution_string && lexer->lookahead == '`') { + return has_content; + } + + if (is_heredoc && lexer->lookahead == '\\') { + advance(lexer); + break; + } + + if (is_escapable_sequence(lexer)) { + return has_content; + } + break; + case '$': + advance(lexer); + + if ((is_valid_name_char(lexer) && !iswdigit(lexer->lookahead)) || lexer->lookahead == '{') { + return has_content; + } + break; + case '-': + if (is_after_variable) { + advance(lexer); + if (lexer->lookahead == '>') { + advance(lexer); + if (is_valid_name_char(lexer)) { + return has_content; + } + break; + } + break; + } + case '[': + if (is_after_variable) { + return has_content; + } + advance(lexer); + break; + case '{': + advance(lexer); + if (lexer->lookahead == '$') { + return has_content; + } + break; + default: + if (lexer->eof(lexer)) { + return false; + } + advance(lexer); + } + + is_after_variable = false; + } + + return false; +} + +static bool scan(Scanner *scanner, TSLexer *lexer, const bool *valid_symbols) { + const bool is_error_recovery = valid_symbols[SENTINEL_ERROR]; + + if (is_error_recovery) { + return false; + } + + scanner->has_leading_whitespace = false; + + lexer->mark_end(lexer); + + if (valid_symbols[ENCAPSED_STRING_CHARS_AFTER_VARIABLE]) { + lexer->result_symbol = ENCAPSED_STRING_CHARS_AFTER_VARIABLE; + return scan_encapsed_part_string(scanner, lexer, + /* is_after_variable */ true, + /* is_heredoc */ false, + /* is_execution_string */ false); + } + + if (valid_symbols[ENCAPSED_STRING_CHARS]) { + lexer->result_symbol = ENCAPSED_STRING_CHARS; + return scan_encapsed_part_string(scanner, lexer, + /* is_after_variable */ false, + /* is_heredoc */ false, + /* is_execution_string */ false); + } + + if (valid_symbols[EXECUTION_STRING_CHARS_AFTER_VARIABLE]) { + lexer->result_symbol = EXECUTION_STRING_CHARS_AFTER_VARIABLE; + return scan_encapsed_part_string(scanner, lexer, + /* is_after_variable */ true, + /* is_heredoc */ false, + /* is_execution_string */ true); + } + + if (valid_symbols[EXECUTION_STRING_CHARS]) { + lexer->result_symbol = EXECUTION_STRING_CHARS; + return scan_encapsed_part_string(scanner, lexer, + /* is_after_variable */ false, + /* is_heredoc */ false, + /* is_execution_string */ true); + } + + if (valid_symbols[ENCAPSED_STRING_CHARS_AFTER_VARIABLE_HEREDOC]) { + lexer->result_symbol = ENCAPSED_STRING_CHARS_AFTER_VARIABLE_HEREDOC; + return scan_encapsed_part_string(scanner, lexer, + /* is_after_variable */ true, + /* is_heredoc */ true, + /* is_execution_string */ false); + } + + if (valid_symbols[ENCAPSED_STRING_CHARS_HEREDOC]) { + lexer->result_symbol = ENCAPSED_STRING_CHARS_HEREDOC; + return scan_encapsed_part_string(scanner, lexer, + /* is_after_variable */ false, + /* is_heredoc */ true, + /* is_execution_string */ false); + } + + if (valid_symbols[NOWDOC_STRING]) { + lexer->result_symbol = NOWDOC_STRING; + return scan_nowdoc_string(scanner, lexer); + } + + if (valid_symbols[HEREDOC_END]) { + lexer->result_symbol = HEREDOC_END; + if (scanner->heredocs.size == 0) { + return false; + } + + Heredoc heredoc = *array_back(&scanner->heredocs); + + while (iswspace(lexer->lookahead)) { + skip(lexer); + } + + String word = scan_heredoc_word(lexer); + if (!string_eq(&word, &heredoc.word)) { + array_delete(&word); + return false; + } + array_delete(&word); + + lexer->mark_end(lexer); + array_delete(&array_pop(&scanner->heredocs).word); + return true; + } + + if (!scan_whitespace(lexer)) { + return false; + } + + if (valid_symbols[EOF_TOKEN] && lexer->eof(lexer)) { + lexer->result_symbol = EOF_TOKEN; + return true; + } + + if (valid_symbols[HEREDOC_START]) { + lexer->result_symbol = HEREDOC_START; + Heredoc heredoc = heredoc_new(); + + while (iswspace(lexer->lookahead)) { + skip(lexer); + } + + heredoc.word = scan_heredoc_word(lexer); + if (heredoc.word.size == 0) { + array_delete(&heredoc.word); + return false; + } + lexer->mark_end(lexer); + + array_push(&scanner->heredocs, heredoc); + return true; + } + + if (valid_symbols[AUTOMATIC_SEMICOLON]) { + lexer->result_symbol = AUTOMATIC_SEMICOLON; + + if (lexer->lookahead != '?') { + return false; + } + + advance(lexer); + + return lexer->lookahead == '>'; + } + + return false; +} + +static inline void *external_scanner_create() { + Scanner *scanner = ts_calloc(1, sizeof(Scanner)); + array_init(&scanner->heredocs); + return scanner; +} + +static inline unsigned external_scanner_serialize(void *payload, char *buffer) { + Scanner *scanner = (Scanner *)payload; + return serialize(scanner, buffer); +} + +static inline void external_scanner_deserialize(void *payload, const char *buffer, unsigned length) { + Scanner *scanner = (Scanner *)payload; + deserialize(scanner, buffer, length); +} + +static inline bool external_scanner_scan(void *payload, TSLexer *lexer, const bool *valid_symbols) { + Scanner *scanner = (Scanner *)payload; + return scan(scanner, lexer, valid_symbols); +} + +static inline void external_scanner_destroy(void *payload) { + Scanner *scanner = (Scanner *)payload; + for (size_t i = 0; i < scanner->heredocs.size; i++) { + array_delete(&scanner->heredocs.contents[i].word); + } + array_delete(&scanner->heredocs); + ts_free(scanner); +} |
