summaryrefslogtreecommitdiff
path: root/vendor/tree-sitter-php/src/common/scanner.h
diff options
context:
space:
mode:
Diffstat (limited to 'vendor/tree-sitter-php/src/common/scanner.h')
-rw-r--r--vendor/tree-sitter-php/src/common/scanner.h543
1 files changed, 543 insertions, 0 deletions
diff --git a/vendor/tree-sitter-php/src/common/scanner.h b/vendor/tree-sitter-php/src/common/scanner.h
new file mode 100644
index 0000000..e16a21e
--- /dev/null
+++ b/vendor/tree-sitter-php/src/common/scanner.h
@@ -0,0 +1,543 @@
+#include "tree_sitter/array.h"
+#include "tree_sitter/parser.h"
+
+#include <string.h>
+#include <wchar.h>
+#include <wctype.h>
+
+enum TokenType { // Symbols scan() stores in lexer->result_symbol; each value also indexes valid_symbols[]
+ AUTOMATIC_SEMICOLON, // accepted by scan() when the lookahead is '?' followed by '>'
+ ENCAPSED_STRING_CHARS, // string content scanned with is_heredoc = false, is_execution_string = false (ends at '"')
+ ENCAPSED_STRING_CHARS_AFTER_VARIABLE, // same as above, scanned with is_after_variable = true
+ EXECUTION_STRING_CHARS, // string content scanned with is_execution_string = true (ends at '`')
+ EXECUTION_STRING_CHARS_AFTER_VARIABLE, // same as above, scanned with is_after_variable = true
+ ENCAPSED_STRING_CHARS_HEREDOC, // string content scanned with is_heredoc = true (stops at line breaks)
+ ENCAPSED_STRING_CHARS_AFTER_VARIABLE_HEREDOC, // same as above, scanned with is_after_variable = true
+ EOF_TOKEN, // accepted when lexer->eof() holds after scan_whitespace()
+ HEREDOC_START, // word read by scan_heredoc_word(); pushes a Heredoc onto Scanner.heredocs
+ HEREDOC_END, // word equal to the last pushed heredoc's word; pops that Heredoc
+ NOWDOC_STRING, // produced by scan_nowdoc_string()
+ SENTINEL_ERROR, // Unused token used to indicate error recovery mode
+};
+
+typedef Array(int32_t) String; // growable array of int32_t; filled from lexer->lookahead values in scan_heredoc_word()
+
+/**
+ * Reports whether two strings hold the same sequence of elements.
+ *
+ * @param self  first string
+ * @param other second string
+ * @return true when both have the same size and identical contents
+ */
+static inline bool string_eq(String *self, String *other) {
+    if (self->size != other->size) {
+        return false;
+    }
+
+    // Sizes are equal from here on; two empty strings match without
+    // handing their contents pointers to memcmp.
+    if (self->size == 0) {
+        return true;
+    }
+
+    const size_t byte_count = self->size * sizeof(self->contents[0]);
+    return memcmp(self->contents, other->contents, byte_count) == 0;
+}
+
+typedef struct { // One open heredoc/nowdoc, as tracked in Scanner.heredocs
+ bool end_word_indentation_allowed; // stored by serialize()/deserialize(); only ever assigned false in the visible code
+ String word; // the word read by scan_heredoc_word() at HEREDOC_START; the end tag must equal it
+} Heredoc;
+
+// Brace-enclosed initializer for a Heredoc with indentation disallowed and an
+// empty word. The expansion deliberately carries no trailing semicolon: the
+// use site supplies it, as in `Heredoc h = heredoc_new();`, so no stray empty
+// statement is produced.
+#define heredoc_new() \
+    { \
+        .end_word_indentation_allowed = false, \
+        .word = array_new(), \
+    }
+
+typedef struct { // Scanner state; allocated in external_scanner_create() and freed in external_scanner_destroy()
+ bool has_leading_whitespace; // reset to false by deserialize() and scan(); never read in the visible code
+ Array(Heredoc) heredocs; // open heredocs used as a stack: pushed on HEREDOC_START, popped on HEREDOC_END
+} Scanner;
+
+typedef enum { Error, End } ScanContentResult; // NOTE(review): not referenced anywhere else in the visible file
+
+/**
+ * Returns a heredoc entry to its empty state: the indentation flag is cleared
+ * and the storage of its word is released through array_delete().
+ */
+static inline void reset_heredoc(Heredoc *heredoc) {
+    heredoc->end_word_indentation_allowed = false;
+    array_delete(&heredoc->word);
+}
+
+// Moves the lexer past the lookahead character, passing false as the lexer's
+// second (skip) argument.
+static inline void advance(TSLexer *lexer) {
+    lexer->advance(lexer, false);
+}
+
+// Moves the lexer past the lookahead character, passing true as the lexer's
+// second (skip) argument.
+static inline void skip(TSLexer *lexer) {
+    lexer->advance(lexer, true);
+}
+
+/**
+ * Writes the scanner state into buffer.
+ *
+ * Layout: one byte holding the number of open heredocs, then for every
+ * heredoc one byte for end_word_indentation_allowed, the word's element count
+ * copied as a raw uint32_t, and finally the word's elements.
+ *
+ * @param scanner state to store
+ * @param buffer  destination of TREE_SITTER_SERIALIZATION_BUFFER_SIZE bytes
+ * @return number of bytes written, or 0 when the state does not fit
+ */
+static unsigned serialize(Scanner *scanner, char *buffer) {
+    // Bytes stored ahead of each word: one flag byte plus the uint32_t count.
+    enum { HEREDOC_HEADER_BYTES = 5 };
+
+    unsigned written = 0;
+    buffer[written] = (char)scanner->heredocs.size;
+    written += 1;
+
+    for (unsigned index = 0; index < scanner->heredocs.size; index++) {
+        Heredoc *entry = &scanner->heredocs.contents[index];
+        unsigned word_bytes = entry->word.size * sizeof(entry->word.contents[0]);
+
+        // Give up on the whole state rather than write a truncated entry.
+        if (written + HEREDOC_HEADER_BYTES + word_bytes >= TREE_SITTER_SERIALIZATION_BUFFER_SIZE) {
+            return 0;
+        }
+
+        buffer[written] = (char)entry->end_word_indentation_allowed;
+        written += 1;
+
+        memcpy(&buffer[written], &entry->word.size, sizeof(uint32_t));
+        written += sizeof(uint32_t);
+
+        if (entry->word.size > 0) {
+            memcpy(&buffer[written], entry->word.contents, word_bytes);
+            written += word_bytes;
+        }
+    }
+
+    return written;
+}
+
+/**
+ * Restores the scanner state from a buffer produced by serialize().
+ *
+ * Layout: one byte holding the number of open heredocs, then for every
+ * heredoc one byte for end_word_indentation_allowed, the word's element count
+ * copied as a raw uint32_t, and finally the word's elements.
+ *
+ * Every heredoc held before the call is released and removed first, so the
+ * stack afterwards contains exactly the serialized entries. A zero-length
+ * buffer therefore yields a scanner with no open heredoc, and a buffer with
+ * fewer heredocs than currently open leaves no stale entry behind.
+ *
+ * @param scanner scanner whose state is replaced
+ * @param buffer  serialized state; not read when length is 0
+ * @param length  number of bytes in buffer
+ */
+static void deserialize(Scanner *scanner, const char *buffer, unsigned length) {
+    unsigned size = 0;
+    scanner->has_leading_whitespace = false;
+
+    // Free the words of the previous state, then empty the stack itself.
+    // Without the clear, entries with freed (empty) words would remain and
+    // array_back() could hand one of them to the end-tag matching code.
+    for (uint32_t i = 0; i < scanner->heredocs.size; i++) {
+        reset_heredoc(array_get(&scanner->heredocs, i));
+    }
+    array_clear(&scanner->heredocs);
+
+    if (length == 0) {
+        return;
+    }
+
+    uint8_t open_heredoc_count = (uint8_t)buffer[size++];
+    for (unsigned i = 0; i < open_heredoc_count; i++) {
+        Heredoc new_heredoc = heredoc_new();
+        array_push(&scanner->heredocs, new_heredoc);
+        Heredoc *heredoc = array_back(&scanner->heredocs);
+
+        heredoc->end_word_indentation_allowed = buffer[size++];
+
+        // The element count is stored as raw bytes; copy it straight into the
+        // array header and then make room for that many elements.
+        memcpy(&heredoc->word.size, &buffer[size], sizeof(uint32_t));
+        size += sizeof(uint32_t);
+
+        unsigned word_size = heredoc->word.size * sizeof(heredoc->word.contents[0]);
+        if (word_size > 0) {
+            array_reserve(&heredoc->word, heredoc->word.size);
+            memcpy(heredoc->word.contents, &buffer[size], word_size);
+            size += word_size;
+        }
+    }
+
+    // The buffer must have been consumed exactly.
+    assert(size == length);
+}
+
+/**
+ * Advances over whitespace and `//` line comments.
+ *
+ * @return true when scanning stopped at a character other than '/';
+ *         false when a '/' was consumed that did not start a `//` comment
+ */
+static inline bool scan_whitespace(TSLexer *lexer) {
+    for (;;) {
+        while (iswspace(lexer->lookahead)) {
+            advance(lexer);
+        }
+
+        if (lexer->lookahead != '/') {
+            return true;
+        }
+        advance(lexer);
+
+        if (lexer->lookahead != '/') {
+            return false;
+        }
+        advance(lexer);
+
+        // Inside a line comment: run to the end of the line or of the input.
+        while (lexer->lookahead != 0 && lexer->lookahead != '\n') {
+            advance(lexer);
+        }
+    }
+}
+
+// True when the lookahead may appear in a name: an alphanumeric character,
+// an underscore, or any code point from 0x80 upwards.
+static inline bool is_valid_name_char(TSLexer *lexer) {
+    const int32_t current = lexer->lookahead;
+    if (current == '_' || current >= 0x80) {
+        return true;
+    }
+    return iswalnum(current);
+}
+
+/**
+ * Decides whether the lookahead, taken as the character following a
+ * backslash, begins an escape sequence.
+ *
+ * Note: remember to also update the escape_sequence rule in the main grammar
+ * whenever changing this method.
+ *
+ * For 'x' the lexer is advanced by one character before the hex digit test.
+ */
+static inline bool is_escapable_sequence(TSLexer *lexer) {
+    const int32_t letter = lexer->lookahead;
+
+    switch (letter) {
+        case 'n':
+        case 'r':
+        case 't':
+        case 'v':
+        case 'e':
+        case 'f':
+        case '\\':
+        case '$':
+        case '"':
+            return true;
+
+        case 'x':
+            // Hex: only an escape when a hex digit follows the 'x'.
+            advance(lexer);
+            return iswxdigit(lexer->lookahead);
+
+        case 'u':
+            // Unicode: we handle the case where this is not really an escape
+            // sequence in grammar.js - this is needed to support the edge
+            // case "\u{$a}" in which case "\u" is to be interpreted as
+            // characters and {$a} as a variable.
+            return true;
+
+        default:
+            // Octal
+            return iswdigit(letter) && letter >= '0' && letter <= '7';
+    }
+}
+
+/**
+ * Collects consecutive name characters starting at the lookahead.
+ *
+ * @return the collected word; it is empty when the lookahead is not a name
+ *         character. The caller is responsible for releasing it.
+ */
+static String scan_heredoc_word(TSLexer *lexer) {
+    String word = (String)array_new();
+
+    for (; is_valid_name_char(lexer); advance(lexer)) {
+        array_push(&word, lexer->lookahead);
+    }
+
+    return word;
+}
+
+/**
+ * Scans one stretch of nowdoc content, up to the next line break.
+ *
+ * @return false when no heredoc is open, when the end tag of the last opened
+ *         heredoc is found at a valid position, or when the input ends before
+ *         a line break; otherwise whether any character was consumed before
+ *         the line break
+ */
+static inline bool scan_nowdoc_string(Scanner *scanner, TSLexer *lexer) {
+    if (scanner->heredocs.size == 0) {
+        return false;
+    }
+
+    bool consumed_any = false;
+
+    // While PHP requires the nowdoc end tag to be the very first on a new line,
+    // there may be an arbitrary amount of whitespace before the closing token
+    while (iswspace(lexer->lookahead)) {
+        advance(lexer);
+        consumed_any = true;
+    }
+
+    String tag = array_back(&scanner->heredocs)->word;
+
+    // Walk the tag while the input agrees with it. The tag counts as closed
+    // only if its final character was matched and a delimiter follows.
+    bool tag_closed = false;
+    for (uint32_t i = 0; i < tag.size && lexer->lookahead == tag.contents[i]; i++) {
+        advance(lexer);
+        consumed_any = true;
+
+        const bool is_last = (i + 1 == tag.size);
+        tag_closed = is_last && (iswspace(lexer->lookahead) || lexer->lookahead == ';' || lexer->lookahead == ',' ||
+                                 lexer->lookahead == ')');
+    }
+
+    if (tag_closed) {
+        // There may be an arbitrary amount of white space after the end tag
+        while (iswspace(lexer->lookahead) && lexer->lookahead != '\r' && lexer->lookahead != '\n') {
+            advance(lexer);
+            consumed_any = true;
+        }
+
+        // Return to allow the end tag parsing if we've encountered an end tag
+        // at a valid position
+        switch (lexer->lookahead) {
+            case ';':
+            case ',':
+            case ')':
+            case '\n':
+            case '\r':
+                // , and ) is needed to support heredoc in function arguments
+                return false;
+            default:
+                break;
+        }
+    }
+
+    // Consume the remainder of the line; the token end is marked before each
+    // character so the line break itself stays outside the token.
+    bool has_content = consumed_any;
+    for (;;) {
+        lexer->mark_end(lexer);
+
+        if (lexer->lookahead == '\n' || lexer->lookahead == '\r') {
+            return has_content;
+        }
+        if (lexer->eof(lexer)) {
+            return false;
+        }
+
+        advance(lexer);
+        has_content = true;
+    }
+}
+
+/**
+ * Scans a run of literal characters inside a double-quoted string, an
+ * execution (backtick) string or a heredoc body, stopping in front of anything
+ * that has to be tokenized separately (closing delimiter, escape sequence,
+ * variable, member or index access directly after a variable, `{$`).
+ *
+ * The token end is marked before each character is examined, so the character
+ * that stops the scan is never part of the token.
+ *
+ * @param scanner             scanner state; its last heredoc supplies the end tag
+ * @param lexer               lexer to read from
+ * @param is_after_variable   true when the previous token was a variable; only
+ *                            affects the first character examined
+ * @param is_heredoc          true when scanning a heredoc body
+ * @param is_execution_string true when scanning a backtick string
+ * @return false when a heredoc end tag is found at a valid position or the
+ *         input ends; otherwise whether any character was consumed before the
+ *         stopping point
+ */
+static bool scan_encapsed_part_string(Scanner *scanner, TSLexer *lexer, bool is_after_variable, bool is_heredoc,
+                                      bool is_execution_string) {
+    bool consumed_any = false;
+
+    if (is_heredoc && scanner->heredocs.size > 0) {
+        // While PHP requires the heredoc end tag to be the very first on a new
+        // line, there may be an arbitrary amount of whitespace before the
+        // closing token. However, we should not consume \r or \n
+        while (iswspace(lexer->lookahead) && lexer->lookahead != '\r' && lexer->lookahead != '\n') {
+            advance(lexer);
+            consumed_any = true;
+        }
+
+        String tag = array_back(&scanner->heredocs)->word;
+
+        // Walk the tag while the input agrees with it. The tag counts as
+        // closed only if its final character was matched and a delimiter
+        // follows.
+        bool tag_closed = false;
+        for (uint32_t i = 0; i < tag.size && lexer->lookahead == tag.contents[i]; i++) {
+            consumed_any = true;
+            advance(lexer);
+
+            const bool is_last = (i + 1 == tag.size);
+            tag_closed = is_last && (iswspace(lexer->lookahead) || lexer->lookahead == ';' ||
+                                     lexer->lookahead == ',' || lexer->lookahead == ')');
+        }
+
+        if (tag_closed) {
+            // There may be an arbitrary amount of white space after the end tag
+            // However, we should not consume \r or \n
+            while (iswspace(lexer->lookahead) && lexer->lookahead != '\r' && lexer->lookahead != '\n') {
+                advance(lexer);
+                consumed_any = true;
+            }
+
+            // Return to allow the end tag parsing if we've encountered an end
+            // tag at a valid position
+            switch (lexer->lookahead) {
+                case ';':
+                case ',':
+                case ')':
+                case '\n':
+                case '\r':
+                    // , and ) is needed to support heredoc in function arguments
+                    return false;
+                default:
+                    break;
+            }
+        }
+    }
+
+    bool has_content = consumed_any;
+    for (;;) {
+        lexer->mark_end(lexer);
+        const int32_t current = lexer->lookahead;
+
+        if (current == '"') {
+            // Closes a plain double-quoted string; ordinary text elsewhere.
+            if (!is_heredoc && !is_execution_string) {
+                return has_content;
+            }
+            advance(lexer);
+        } else if (current == '`') {
+            // Closes an execution string; ordinary text elsewhere.
+            if (is_execution_string) {
+                return has_content;
+            }
+            advance(lexer);
+        } else if (current == '\n' || current == '\r') {
+            // Heredoc content is handed out line by line.
+            if (is_heredoc) {
+                return has_content;
+            }
+            advance(lexer);
+        } else if (current == '\\') {
+            advance(lexer);
+
+            if (lexer->lookahead == '{') {
+                // \{ should not be interpreted as an escape sequence, but both
+                // should be consumed as normal characters
+                advance(lexer);
+            } else if (is_execution_string && lexer->lookahead == '`') {
+                return has_content;
+            } else if (is_heredoc && lexer->lookahead == '\\') {
+                advance(lexer);
+            } else if (is_escapable_sequence(lexer)) {
+                return has_content;
+            }
+        } else if (current == '$') {
+            advance(lexer);
+
+            // "${", or '$' followed by a name character that is not a digit,
+            // starts a variable.
+            if (lexer->lookahead == '{') {
+                return has_content;
+            }
+            if (is_valid_name_char(lexer) && !iswdigit(lexer->lookahead)) {
+                return has_content;
+            }
+        } else if (current == '-') {
+            // The '-' is consumed in every case. Only directly after a
+            // variable can "->name" be a member access that ends this token.
+            advance(lexer);
+            if (is_after_variable && lexer->lookahead == '>') {
+                advance(lexer);
+                if (is_valid_name_char(lexer)) {
+                    return has_content;
+                }
+            }
+        } else if (current == '[') {
+            // Directly after a variable '[' opens an index access.
+            if (is_after_variable) {
+                return has_content;
+            }
+            advance(lexer);
+        } else if (current == '{') {
+            advance(lexer);
+            if (lexer->lookahead == '$') {
+                return has_content;
+            }
+        } else {
+            if (lexer->eof(lexer)) {
+                return false;
+            }
+            advance(lexer);
+        }
+
+        // Only the very first character examined can follow a variable.
+        is_after_variable = false;
+        has_content = true;
+    }
+}
+
+/**
+ * Entry point of the scanner: tries the symbols marked in valid_symbols in a
+ * fixed order and reports the first one it can produce.
+ *
+ * Order: the six string-content symbols, NOWDOC_STRING, HEREDOC_END, then
+ * (after whitespace and `//` comments have been consumed) EOF_TOKEN,
+ * HEREDOC_START and AUTOMATIC_SEMICOLON. Once a symbol is found valid, its
+ * scanner decides the outcome; later symbols are not tried.
+ *
+ * @param scanner       scanner state; the heredoc stack may be pushed or popped
+ * @param lexer         lexer to read from; result_symbol is set on the way
+ * @param valid_symbols indexed by enum TokenType
+ * @return true when a token was recognized
+ */
+static bool scan(Scanner *scanner, TSLexer *lexer, const bool *valid_symbols) {
+    // String-content symbols with the flags each one is scanned with. The
+    // order of the rows is the order in which the symbols are tried.
+    static const struct {
+        enum TokenType symbol;
+        bool after_variable;
+        bool heredoc;
+        bool execution_string;
+    } string_parts[] = {
+        {ENCAPSED_STRING_CHARS_AFTER_VARIABLE, true, false, false},
+        {ENCAPSED_STRING_CHARS, false, false, false},
+        {EXECUTION_STRING_CHARS_AFTER_VARIABLE, true, false, true},
+        {EXECUTION_STRING_CHARS, false, false, true},
+        {ENCAPSED_STRING_CHARS_AFTER_VARIABLE_HEREDOC, true, true, false},
+        {ENCAPSED_STRING_CHARS_HEREDOC, false, true, false},
+    };
+
+    // SENTINEL_ERROR being valid indicates error recovery mode; produce nothing.
+    if (valid_symbols[SENTINEL_ERROR]) {
+        return false;
+    }
+
+    scanner->has_leading_whitespace = false;
+
+    lexer->mark_end(lexer);
+
+    for (size_t k = 0; k < sizeof(string_parts) / sizeof(string_parts[0]); k++) {
+        if (!valid_symbols[string_parts[k].symbol]) {
+            continue;
+        }
+        lexer->result_symbol = string_parts[k].symbol;
+        return scan_encapsed_part_string(scanner, lexer, string_parts[k].after_variable, string_parts[k].heredoc,
+                                         string_parts[k].execution_string);
+    }
+
+    if (valid_symbols[NOWDOC_STRING]) {
+        lexer->result_symbol = NOWDOC_STRING;
+        return scan_nowdoc_string(scanner, lexer);
+    }
+
+    if (valid_symbols[HEREDOC_END]) {
+        lexer->result_symbol = HEREDOC_END;
+        if (scanner->heredocs.size == 0) {
+            return false;
+        }
+
+        Heredoc *innermost = array_back(&scanner->heredocs);
+
+        while (iswspace(lexer->lookahead)) {
+            skip(lexer);
+        }
+
+        // The scanned word is only needed for the comparison.
+        String word = scan_heredoc_word(lexer);
+        const bool closes_innermost = string_eq(&word, &innermost->word);
+        array_delete(&word);
+        if (!closes_innermost) {
+            return false;
+        }
+
+        lexer->mark_end(lexer);
+        // Pop the closed heredoc and release its word.
+        array_delete(&array_pop(&scanner->heredocs).word);
+        return true;
+    }
+
+    if (!scan_whitespace(lexer)) {
+        return false;
+    }
+
+    if (valid_symbols[EOF_TOKEN] && lexer->eof(lexer)) {
+        lexer->result_symbol = EOF_TOKEN;
+        return true;
+    }
+
+    if (valid_symbols[HEREDOC_START]) {
+        lexer->result_symbol = HEREDOC_START;
+
+        while (iswspace(lexer->lookahead)) {
+            skip(lexer);
+        }
+
+        String word = scan_heredoc_word(lexer);
+        if (word.size == 0) {
+            array_delete(&word);
+            return false;
+        }
+        lexer->mark_end(lexer);
+
+        // The stack takes ownership of the word.
+        Heredoc opened = heredoc_new();
+        opened.word = word;
+        array_push(&scanner->heredocs, opened);
+        return true;
+    }
+
+    if (valid_symbols[AUTOMATIC_SEMICOLON]) {
+        lexer->result_symbol = AUTOMATIC_SEMICOLON;
+
+        // Accepted only in front of "?>".
+        if (lexer->lookahead != '?') {
+            return false;
+        }
+        advance(lexer);
+
+        return lexer->lookahead == '>';
+    }
+
+    return false;
+}
+
+// Allocates a zero-filled Scanner through ts_calloc() and initializes its
+// heredoc stack. The result is released by external_scanner_destroy().
+static inline void *external_scanner_create() {
+    Scanner *created = ts_calloc(1, sizeof(Scanner));
+    array_init(&created->heredocs);
+    return created;
+}
+
+// Untyped wrapper around serialize(): payload is the Scanner returned by
+// external_scanner_create().
+static inline unsigned external_scanner_serialize(void *payload, char *buffer) {
+    return serialize((Scanner *)payload, buffer);
+}
+
+// Untyped wrapper around deserialize(): payload is the Scanner returned by
+// external_scanner_create().
+static inline void external_scanner_deserialize(void *payload, const char *buffer, unsigned length) {
+    deserialize((Scanner *)payload, buffer, length);
+}
+
+// Untyped wrapper around scan(): payload is the Scanner returned by
+// external_scanner_create().
+static inline bool external_scanner_scan(void *payload, TSLexer *lexer, const bool *valid_symbols) {
+    return scan((Scanner *)payload, lexer, valid_symbols);
+}
+
+// Releases everything owned by the scanner: the word of every open heredoc,
+// the heredoc stack, and finally the Scanner itself.
+static inline void external_scanner_destroy(void *payload) {
+    Scanner *scanner = (Scanner *)payload;
+
+    for (size_t i = 0; i < scanner->heredocs.size; i++) {
+        Heredoc *entry = &scanner->heredocs.contents[i];
+        array_delete(&entry->word);
+    }
+
+    array_delete(&scanner->heredocs);
+    ts_free(scanner);
+}