1#include "tree_sitter/alloc.h"
  2#include "tree_sitter/parser.h"
  3
  4#include <assert.h>
  5#include <string.h>
  6#include <wctype.h>
  7
  8enum TokenType { RAW_STRING_DELIMITER, RAW_STRING_CONTENT };
  9
/// The spec limits delimiters to 16 chars
#define MAX_DELIMITER_LENGTH 16

/// Scanner state: the delimiter of the raw string literal being scanned.
typedef struct {
    /// Number of valid entries in `delimiter`. Zero means no opening
    /// delimiter is currently recorded; the delimiter scanner uses a
    /// non-zero value as the signal to match a closing delimiter.
    uint8_t delimiter_length;
    /// Recorded delimiter characters. Not NUL-terminated: only the first
    /// `delimiter_length` entries are meaningful.
    wchar_t delimiter[MAX_DELIMITER_LENGTH];
} Scanner;
 17
 18static inline void advance(TSLexer *lexer) { lexer->advance(lexer, false); }
 19
 20static inline void reset(Scanner *scanner) {
 21    scanner->delimiter_length = 0;
 22    memset(scanner->delimiter, 0, sizeof scanner->delimiter);
 23}
 24
 25/// Scan the raw string delimiter in R"delimiter(content)delimiter"
 26static bool scan_raw_string_delimiter(Scanner *scanner, TSLexer *lexer) {
 27    if (scanner->delimiter_length > 0) {
 28        // Closing delimiter: must exactly match the opening delimiter.
 29        // We already checked this when scanning content, but this is how we
 30        // know when to stop. We can't stop at ", because R"""hello""" is valid.
 31        for (int i = 0; i < scanner->delimiter_length; ++i) {
 32            if (lexer->lookahead != scanner->delimiter[i]) {
 33                return false;
 34            }
 35            advance(lexer);
 36        }
 37        reset(scanner);
 38        return true;
 39    }
 40
 41    // Opening delimiter: record the d-char-sequence up to (.
 42    // d-char is any basic character except parens, backslashes, and spaces.
 43    for (;;) {
 44        if (scanner->delimiter_length >= MAX_DELIMITER_LENGTH || lexer->eof(lexer) || lexer->lookahead == '\\' ||
 45            iswspace(lexer->lookahead)) {
 46            return false;
 47        }
 48        if (lexer->lookahead == '(') {
 49            // Rather than create a token for an empty delimiter, we fail and
 50            // let the grammar fall back to a delimiter-less rule.
 51            return scanner->delimiter_length > 0;
 52        }
 53        scanner->delimiter[scanner->delimiter_length++] = lexer->lookahead;
 54        advance(lexer);
 55    }
 56}
 57
 58/// Scan the raw string content in R"delimiter(content)delimiter"
 59static bool scan_raw_string_content(Scanner *scanner, TSLexer *lexer) {
 60    // The progress made through the delimiter since the last ')'.
 61    // The delimiter may not contain ')' so a single counter suffices.
 62    for (int delimiter_index = -1;;) {
 63        // If we hit EOF, consider the content to terminate there.
 64        // This forms an incomplete raw_string_literal, and models the code
 65        // well.
 66        if (lexer->eof(lexer)) {
 67            lexer->mark_end(lexer);
 68            return true;
 69        }
 70
 71        if (delimiter_index >= 0) {
 72            if (delimiter_index == scanner->delimiter_length) {
 73                if (lexer->lookahead == '"') {
 74                    return true;
 75                }
 76                delimiter_index = -1;
 77            } else {
 78                if (lexer->lookahead == scanner->delimiter[delimiter_index]) {
 79                    delimiter_index += 1;
 80                } else {
 81                    delimiter_index = -1;
 82                }
 83            }
 84        }
 85
 86        if (delimiter_index == -1 && lexer->lookahead == ')') {
 87            // The content doesn't include the )delimiter" part.
 88            // We must still scan through it, but exclude it from the token.
 89            lexer->mark_end(lexer);
 90            delimiter_index = 0;
 91        }
 92
 93        advance(lexer);
 94    }
 95}
 96
 97void *tree_sitter_cpp_external_scanner_create() {
 98    Scanner *scanner = (Scanner *)ts_calloc(1, sizeof(Scanner));
 99    memset(scanner, 0, sizeof(Scanner));
100    return scanner;
101}
102
103bool tree_sitter_cpp_external_scanner_scan(void *payload, TSLexer *lexer, const bool *valid_symbols) {
104    Scanner *scanner = (Scanner *)payload;
105
106    if (valid_symbols[RAW_STRING_DELIMITER] && valid_symbols[RAW_STRING_CONTENT]) {
107        // we're in error recovery
108        return false;
109    }
110
111    // No skipping leading whitespace: raw-string grammar is space-sensitive.
112    if (valid_symbols[RAW_STRING_DELIMITER]) {
113        lexer->result_symbol = RAW_STRING_DELIMITER;
114        return scan_raw_string_delimiter(scanner, lexer);
115    }
116
117    if (valid_symbols[RAW_STRING_CONTENT]) {
118        lexer->result_symbol = RAW_STRING_CONTENT;
119        return scan_raw_string_content(scanner, lexer);
120    }
121
122    return false;
123}
124
125unsigned tree_sitter_cpp_external_scanner_serialize(void *payload, char *buffer) {
126    static_assert(MAX_DELIMITER_LENGTH * sizeof(wchar_t) < TREE_SITTER_SERIALIZATION_BUFFER_SIZE,
127                  "Serialized delimiter is too long!");
128
129    Scanner *scanner = (Scanner *)payload;
130    size_t size = scanner->delimiter_length * sizeof(wchar_t);
131    memcpy(buffer, scanner->delimiter, size);
132    return (unsigned)size;
133}
134
135void tree_sitter_cpp_external_scanner_deserialize(void *payload, const char *buffer, unsigned length) {
136    assert(length % sizeof(wchar_t) == 0 && "Can't decode serialized delimiter!");
137
138    Scanner *scanner = (Scanner *)payload;
139    scanner->delimiter_length = length / sizeof(wchar_t);
140    if (length > 0) {
141        memcpy(&scanner->delimiter[0], buffer, length);
142    }
143}
144
145void tree_sitter_cpp_external_scanner_destroy(void *payload) {
146    Scanner *scanner = (Scanner *)payload;
147    ts_free(scanner);
148}