1#include "tree_sitter/alloc.h"
2#include "tree_sitter/parser.h"
3
4#include <assert.h>
5#include <string.h>
6#include <wctype.h>
7
/// Kinds of external token this scanner can emit. The enumerators are used
/// as indices into `valid_symbols` and as values for `lexer->result_symbol`
/// in the scan entry point.
enum TokenType {
    RAW_STRING_DELIMITER,
    RAW_STRING_CONTENT,
};
9
/// Maximum number of characters kept for a raw string delimiter; the spec
/// limits delimiters to 16 chars.
#define MAX_DELIMITER_LENGTH 16

/// Scanner state: the delimiter of the raw string literal being scanned.
typedef struct {
    /// How many entries of `delimiter` are in use; 0 means no delimiter is
    /// currently recorded.
    uint8_t delimiter_length;
    /// Recorded delimiter characters; only the first `delimiter_length`
    /// entries are meaningful.
    wchar_t delimiter[MAX_DELIMITER_LENGTH];
} Scanner;
17
18static inline void advance(TSLexer *lexer) { lexer->advance(lexer, false); }
19
20static inline void reset(Scanner *scanner) {
21 scanner->delimiter_length = 0;
22 memset(scanner->delimiter, 0, sizeof scanner->delimiter);
23}
24
25/// Scan the raw string delimiter in R"delimiter(content)delimiter"
26static bool scan_raw_string_delimiter(Scanner *scanner, TSLexer *lexer) {
27 if (scanner->delimiter_length > 0) {
28 // Closing delimiter: must exactly match the opening delimiter.
29 // We already checked this when scanning content, but this is how we
30 // know when to stop. We can't stop at ", because R"""hello""" is valid.
31 for (int i = 0; i < scanner->delimiter_length; ++i) {
32 if (lexer->lookahead != scanner->delimiter[i]) {
33 return false;
34 }
35 advance(lexer);
36 }
37 reset(scanner);
38 return true;
39 }
40
41 // Opening delimiter: record the d-char-sequence up to (.
42 // d-char is any basic character except parens, backslashes, and spaces.
43 for (;;) {
44 if (scanner->delimiter_length >= MAX_DELIMITER_LENGTH || lexer->eof(lexer) || lexer->lookahead == '\\' ||
45 iswspace(lexer->lookahead)) {
46 return false;
47 }
48 if (lexer->lookahead == '(') {
49 // Rather than create a token for an empty delimiter, we fail and
50 // let the grammar fall back to a delimiter-less rule.
51 return scanner->delimiter_length > 0;
52 }
53 scanner->delimiter[scanner->delimiter_length++] = lexer->lookahead;
54 advance(lexer);
55 }
56}
57
58/// Scan the raw string content in R"delimiter(content)delimiter"
59static bool scan_raw_string_content(Scanner *scanner, TSLexer *lexer) {
60 // The progress made through the delimiter since the last ')'.
61 // The delimiter may not contain ')' so a single counter suffices.
62 for (int delimiter_index = -1;;) {
63 // If we hit EOF, consider the content to terminate there.
64 // This forms an incomplete raw_string_literal, and models the code
65 // well.
66 if (lexer->eof(lexer)) {
67 lexer->mark_end(lexer);
68 return true;
69 }
70
71 if (delimiter_index >= 0) {
72 if (delimiter_index == scanner->delimiter_length) {
73 if (lexer->lookahead == '"') {
74 return true;
75 }
76 delimiter_index = -1;
77 } else {
78 if (lexer->lookahead == scanner->delimiter[delimiter_index]) {
79 delimiter_index += 1;
80 } else {
81 delimiter_index = -1;
82 }
83 }
84 }
85
86 if (delimiter_index == -1 && lexer->lookahead == ')') {
87 // The content doesn't include the )delimiter" part.
88 // We must still scan through it, but exclude it from the token.
89 lexer->mark_end(lexer);
90 delimiter_index = 0;
91 }
92
93 advance(lexer);
94 }
95}
96
97void *tree_sitter_cpp_external_scanner_create() {
98 Scanner *scanner = (Scanner *)ts_calloc(1, sizeof(Scanner));
99 memset(scanner, 0, sizeof(Scanner));
100 return scanner;
101}
102
103bool tree_sitter_cpp_external_scanner_scan(void *payload, TSLexer *lexer, const bool *valid_symbols) {
104 Scanner *scanner = (Scanner *)payload;
105
106 if (valid_symbols[RAW_STRING_DELIMITER] && valid_symbols[RAW_STRING_CONTENT]) {
107 // we're in error recovery
108 return false;
109 }
110
111 // No skipping leading whitespace: raw-string grammar is space-sensitive.
112 if (valid_symbols[RAW_STRING_DELIMITER]) {
113 lexer->result_symbol = RAW_STRING_DELIMITER;
114 return scan_raw_string_delimiter(scanner, lexer);
115 }
116
117 if (valid_symbols[RAW_STRING_CONTENT]) {
118 lexer->result_symbol = RAW_STRING_CONTENT;
119 return scan_raw_string_content(scanner, lexer);
120 }
121
122 return false;
123}
124
125unsigned tree_sitter_cpp_external_scanner_serialize(void *payload, char *buffer) {
126 static_assert(MAX_DELIMITER_LENGTH * sizeof(wchar_t) < TREE_SITTER_SERIALIZATION_BUFFER_SIZE,
127 "Serialized delimiter is too long!");
128
129 Scanner *scanner = (Scanner *)payload;
130 size_t size = scanner->delimiter_length * sizeof(wchar_t);
131 memcpy(buffer, scanner->delimiter, size);
132 return (unsigned)size;
133}
134
135void tree_sitter_cpp_external_scanner_deserialize(void *payload, const char *buffer, unsigned length) {
136 assert(length % sizeof(wchar_t) == 0 && "Can't decode serialized delimiter!");
137
138 Scanner *scanner = (Scanner *)payload;
139 scanner->delimiter_length = length / sizeof(wchar_t);
140 if (length > 0) {
141 memcpy(&scanner->delimiter[0], buffer, length);
142 }
143}
144
145void tree_sitter_cpp_external_scanner_destroy(void *payload) {
146 Scanner *scanner = (Scanner *)payload;
147 ts_free(scanner);
148}