summaryrefslogtreecommitdiff
path: root/vendor/github.com/mitjafelicijan/go-tree-sitter/php/scanner.h
diff options
context:
space:
mode:
Diffstat (limited to 'vendor/github.com/mitjafelicijan/go-tree-sitter/php/scanner.h')
-rw-r--r--vendor/github.com/mitjafelicijan/go-tree-sitter/php/scanner.h543
1 files changed, 543 insertions, 0 deletions
diff --git a/vendor/github.com/mitjafelicijan/go-tree-sitter/php/scanner.h b/vendor/github.com/mitjafelicijan/go-tree-sitter/php/scanner.h
new file mode 100644
index 0000000..e16a21e
--- /dev/null
+++ b/vendor/github.com/mitjafelicijan/go-tree-sitter/php/scanner.h
@@ -0,0 +1,543 @@
1#include "tree_sitter/array.h"
2#include "tree_sitter/parser.h"
3
4#include <string.h>
5#include <wchar.h>
6#include <wctype.h>
7
// Kinds of external tokens this scanner can emit. scan() uses these values as
// indices into its `valid_symbols` array and stores the chosen one in
// `lexer->result_symbol`.
// NOTE(review): presumably the order has to match the `externals` list of the
// grammar - confirm against grammar.js before reordering.
enum TokenType {
    AUTOMATIC_SEMICOLON,                          // reported when `?>` follows optional whitespace / `//` comments
    ENCAPSED_STRING_CHARS,                        // literal run inside a double-quoted string
    ENCAPSED_STRING_CHARS_AFTER_VARIABLE,         // same, scanned right after an interpolated variable
    EXECUTION_STRING_CHARS,                       // literal run inside a backtick string
    EXECUTION_STRING_CHARS_AFTER_VARIABLE,        // same, scanned right after an interpolated variable
    ENCAPSED_STRING_CHARS_HEREDOC,                // literal run inside a heredoc body
    ENCAPSED_STRING_CHARS_AFTER_VARIABLE_HEREDOC, // same, scanned right after an interpolated variable
    EOF_TOKEN,                                    // reported when the lexer is at end of input
    HEREDOC_START,                                // opening identifier; pushes an entry on Scanner.heredocs
    HEREDOC_END,                                  // closing identifier; pops the innermost entry
    NOWDOC_STRING,                                // one chunk of nowdoc body text
    SENTINEL_ERROR, // Unused token used to indicate error recovery mode
};
22
// Sequence of int32_t characters (as read from `lexer->lookahead`), used to
// store heredoc / nowdoc identifiers.
typedef Array(int32_t) String;
24
25static inline bool string_eq(String *self, String *other) {
26 if (self->size != other->size) {
27 return false;
28 }
29 if (self->size == 0) {
30 return self->size == other->size;
31 }
32 return memcmp(self->contents, other->contents, self->size * sizeof(self->contents[0])) == 0;
33}
34
// State kept for one open heredoc / nowdoc.
typedef struct {
    // Serialized and restored with the rest of the state; the visible code
    // only ever assigns `false` to it outside of deserialization.
    bool end_word_indentation_allowed;
    // Identifier that opened the heredoc; the closing identifier must match it.
    String word;
} Heredoc;
39
// Brace-enclosed initializer for an empty Heredoc: indentation flag cleared
// and an empty `word`.
//
// The expansion deliberately carries no trailing semicolon, so it can be used
// both as a plain initializer (`Heredoc h = heredoc_new();`) and nested inside
// a larger aggregate initializer (`Heredoc pool[] = { heredoc_new() };`).
#define heredoc_new()                                                                                                  \
    {                                                                                                                  \
        .end_word_indentation_allowed = false,                                                                         \
        .word = array_new(),                                                                                           \
    }
45
// Persistent state of the external scanner.
typedef struct {
    // Cleared at the start of every scan() and on deserialize(); the visible
    // code never sets it to true.
    bool has_leading_whitespace;
    // Stack of currently open heredocs / nowdocs; the innermost one is last.
    Array(Heredoc) heredocs;
} Scanner;
50
// NOTE(review): not referenced anywhere in the visible part of this file -
// possibly a leftover; confirm there are no other users before removing it.
typedef enum { Error, End } ScanContentResult;
52
53static inline void reset_heredoc(Heredoc *heredoc) {
54 array_delete(&heredoc->word);
55 heredoc->end_word_indentation_allowed = false;
56}
57
58static inline void advance(TSLexer *lexer) { lexer->advance(lexer, false); }
59
60static inline void skip(TSLexer *lexer) { lexer->advance(lexer, true); }
61
62static unsigned serialize(Scanner *scanner, char *buffer) {
63 unsigned size = 0;
64
65 buffer[size++] = (char)scanner->heredocs.size;
66 for (unsigned j = 0; j < scanner->heredocs.size; j++) {
67 Heredoc *heredoc = &scanner->heredocs.contents[j];
68 unsigned word_size = heredoc->word.size * sizeof(heredoc->word.contents[0]);
69 if (size + 5 + word_size >= TREE_SITTER_SERIALIZATION_BUFFER_SIZE) {
70 return 0;
71 }
72 buffer[size++] = (char)heredoc->end_word_indentation_allowed;
73 memcpy(&buffer[size], &heredoc->word.size, sizeof(uint32_t));
74 size += sizeof(uint32_t);
75 if (heredoc->word.size > 0) {
76 memcpy(&buffer[size], heredoc->word.contents, word_size);
77 size += word_size;
78 }
79 }
80
81 return size;
82}
83
84static void deserialize(Scanner *scanner, const char *buffer, unsigned length) {
85 unsigned size = 0;
86 scanner->has_leading_whitespace = false;
87
88 for (uint32_t i = 0; i < scanner->heredocs.size; i++) {
89 reset_heredoc(array_get(&scanner->heredocs, i));
90 }
91
92 if (length == 0) {
93 return;
94 }
95
96 uint8_t open_heredoc_count = buffer[size++];
97 for (unsigned i = 0; i < open_heredoc_count; i++) {
98 Heredoc *heredoc = NULL;
99 if (i < scanner->heredocs.size) {
100 heredoc = array_get(&scanner->heredocs, i);
101 } else {
102 Heredoc new_heredoc = heredoc_new();
103 array_push(&scanner->heredocs, new_heredoc);
104 heredoc = array_back(&scanner->heredocs);
105 }
106
107 heredoc->end_word_indentation_allowed = buffer[size++];
108 memcpy(&heredoc->word.size, &buffer[size], sizeof(uint32_t));
109 size += sizeof(uint32_t);
110 unsigned word_size = heredoc->word.size * sizeof(heredoc->word.contents[0]);
111 if (word_size > 0) {
112 array_reserve(&heredoc->word, heredoc->word.size);
113 memcpy(heredoc->word.contents, &buffer[size], word_size);
114 size += word_size;
115 }
116 }
117
118 assert(size == length);
119}
120
121static inline bool scan_whitespace(TSLexer *lexer) {
122 for (;;) {
123 while (iswspace(lexer->lookahead)) {
124 advance(lexer);
125 }
126
127 if (lexer->lookahead == '/') {
128 advance(lexer);
129
130 if (lexer->lookahead == '/') {
131 advance(lexer);
132 while (lexer->lookahead != 0 && lexer->lookahead != '\n') {
133 advance(lexer);
134 }
135 } else {
136 return false;
137 }
138 } else {
139 return true;
140 }
141 }
142}
143
144static inline bool is_valid_name_char(TSLexer *lexer) {
145 return iswalnum(lexer->lookahead) || lexer->lookahead == '_' || lexer->lookahead >= 0x80;
146}
147
148static inline bool is_escapable_sequence(TSLexer *lexer) {
149 // Note: remember to also update the escape_sequence rule in the
150 // main grammar whenever changing this method
151 int32_t letter = lexer->lookahead;
152
153 if (letter == 'n' || letter == 'r' || letter == 't' || letter == 'v' || letter == 'e' || letter == 'f' ||
154 letter == '\\' || letter == '$' || letter == '"') {
155 return true;
156 }
157
158 // Hex
159 if (letter == 'x') {
160 advance(lexer);
161 return iswxdigit(lexer->lookahead);
162 }
163
164 // Unicode
165 if (letter == 'u') {
166 return true; // We handle the case where this is not really an escape
167 // sequence in grammar.js - this is needed to support the
168 // edge case "\u{$a}" in which case "\u" is to be
169 // interpreted as characters and {$a} as a variable
170 }
171
172 // Octal
173 return iswdigit(lexer->lookahead) && lexer->lookahead >= '0' && lexer->lookahead <= '7';
174}
175
176static String scan_heredoc_word(TSLexer *lexer) {
177 String result = (String)array_new();
178
179 while (is_valid_name_char(lexer)) {
180 array_push(&result, lexer->lookahead);
181 advance(lexer);
182 }
183
184 return result;
185}
186
// Scans one chunk of nowdoc body text.
//
// Returns true when some content was consumed and the lexer stopped at a
// '\n' or '\r' (the token end is marked just before that character).
// Returns false when
//   - no heredoc is open,
//   - the text at the current position is the closing identifier in a valid
//     position (so the end tag can be scanned instead),
//   - the chunk would be empty, or
//   - end of input is reached before a line break.
static inline bool scan_nowdoc_string(Scanner *scanner, TSLexer *lexer) {
    bool has_consumed_content = false;
    if (scanner->heredocs.size == 0) {
        return false;
    }

    // While PHP requires the nowdoc end tag to be the very first on a new line,
    // there may be an arbitrary amount of whitespace before the closing token
    // (this loop also consumes line breaks, since iswspace() accepts them)
    while (iswspace(lexer->lookahead)) {
        advance(lexer);
        has_consumed_content = true;
    }

    bool end_tag_matched = false;
    // Identifier of the innermost open heredoc (struct copy; contents are shared).
    String heredoc_tag = array_back(&scanner->heredocs)->word;

    // Try to match the identifier character by character. The match only
    // counts when the whole identifier was seen and is followed by
    // whitespace, ';', ',' or ')'.
    for (uint32_t i = 0; i < heredoc_tag.size; i++) {
        if (lexer->lookahead != heredoc_tag.contents[i]) {
            break;
        }
        advance(lexer);
        has_consumed_content = true;

        end_tag_matched = (i == heredoc_tag.size - 1 && (iswspace(lexer->lookahead) || lexer->lookahead == ';' ||
                                                         lexer->lookahead == ',' || lexer->lookahead == ')'));
    }

    if (end_tag_matched) {
        // There may be an arbitrary amount of white space after the end tag
        while (iswspace(lexer->lookahead) && lexer->lookahead != '\r' && lexer->lookahead != '\n') {
            advance(lexer);
            has_consumed_content = true;
        }

        // Return to allow the end tag parsing if we've encountered an end tag
        // at a valid position
        if (lexer->lookahead == ';' || lexer->lookahead == ',' || lexer->lookahead == ')' || lexer->lookahead == '\n' ||
            lexer->lookahead == '\r') {
            // , and ) is needed to support heredoc in function arguments
            return false;
        }
    }

    // Consume the rest of the line. `has_content` starts out as "did we
    // consume anything above" and becomes true after the first character
    // consumed here.
    for (bool has_content = has_consumed_content;; has_content = true) {
        lexer->mark_end(lexer);

        switch (lexer->lookahead) {
            case '\n':
            case '\r':
                return has_content;
            default:
                if (lexer->eof(lexer)) {
                    return false;
                }
                advance(lexer);
        }
    }

    // Not reached: the loop above only exits through a return.
    return false;
}
247
// Scans a run of literal characters inside an interpolated string
// (double-quoted string, backtick execution string, or heredoc body).
//
// scanner:             scanner state; only consulted when is_heredoc is set.
// lexer:               lexer to read from; the token end is marked just
//                      before the character that stopped the run.
// is_after_variable:   true when scanning directly after an interpolated
//                      variable; makes a leading `[` or `->name` stop the run.
//                      It is cleared after the first loop iteration.
// is_heredoc:          heredoc body: '"' is ordinary text, a line break ends
//                      the run, and the closing identifier is detected.
// is_execution_string: backtick string: '`' ends the run, '"' is ordinary text.
//
// Returns true when the run stopped at a delimiter and is non-empty. Returns
// false when the run is empty, when the heredoc closing identifier was found
// in a valid position, or when end of input was reached.
static bool scan_encapsed_part_string(Scanner *scanner, TSLexer *lexer, bool is_after_variable, bool is_heredoc,
                                      bool is_execution_string) {
    bool has_consumed_content = false;

    if (is_heredoc && scanner->heredocs.size > 0) {
        // While PHP requires the heredoc end tag to be the very first on a new
        // line, there may be an arbitrary amount of whitespace before the
        // closing token. However, we should not consume \r or \n
        while (iswspace(lexer->lookahead) && lexer->lookahead != '\r' && lexer->lookahead != '\n') {
            advance(lexer);
            has_consumed_content = true;
        }

        // Identifier of the innermost open heredoc (struct copy; contents are shared).
        String heredoc_tag = array_back(&scanner->heredocs)->word;

        bool end_tag_matched = false;

        // Try to match the identifier character by character. The match only
        // counts when the whole identifier was seen and is followed by
        // whitespace, ';', ',' or ')'.
        for (uint32_t i = 0; i < heredoc_tag.size; i++) {
            if (lexer->lookahead != heredoc_tag.contents[i]) {
                break;
            }
            has_consumed_content = true;
            advance(lexer);

            end_tag_matched = (i == heredoc_tag.size - 1 && (iswspace(lexer->lookahead) || lexer->lookahead == ';' ||
                                                             lexer->lookahead == ',' || lexer->lookahead == ')'));
        }

        if (end_tag_matched) {
            // There may be an arbitrary amount of white space after the end tag
            // However, we should not consume \r or \n
            while (iswspace(lexer->lookahead) && lexer->lookahead != '\r' && lexer->lookahead != '\n') {
                advance(lexer);
                has_consumed_content = true;
            }

            // Return to allow the end tag parsing if we've encountered an end
            // tag at a valid position
            if (lexer->lookahead == ';' || lexer->lookahead == ',' || lexer->lookahead == ')' ||
                lexer->lookahead == '\n' || lexer->lookahead == '\r') {
                // , and ) is needed to support heredoc in function arguments
                return false;
            }
        }
    }

    // Main loop: the token end is re-marked at the top of every iteration, so
    // whatever is consumed while probing a potential delimiter is excluded
    // from the token when the function returns from inside the switch.
    // `has_content` becomes true once one full iteration has completed.
    for (bool has_content = has_consumed_content;; has_content = true) {
        lexer->mark_end(lexer);

        switch (lexer->lookahead) {
            case '"':
                // Closing quote of a double-quoted string; plain text elsewhere.
                if (!is_heredoc && !is_execution_string) {
                    return has_content;
                }
                advance(lexer);
                break;
            case '`':
                // Closing backtick of an execution string; plain text elsewhere.
                if (is_execution_string) {
                    return has_content;
                }
                advance(lexer);
                break;
            case '\n':
            case '\r':
                // Heredoc bodies are chunked at line breaks.
                if (is_heredoc) {
                    return has_content;
                }
                advance(lexer);
                break;
            case '\\':
                advance(lexer);

                // \{ should not be interpreted as an escape sequence, but both
                // should be consumed as normal characters
                if (lexer->lookahead == '{') {
                    advance(lexer);
                    break;
                }

                // An escaped backtick ends the run inside an execution string.
                if (is_execution_string && lexer->lookahead == '`') {
                    return has_content;
                }

                // In a heredoc, a double backslash is consumed as plain text.
                if (is_heredoc && lexer->lookahead == '\\') {
                    advance(lexer);
                    break;
                }

                // A real escape sequence ends the run (the token end was
                // marked before the backslash).
                if (is_escapable_sequence(lexer)) {
                    return has_content;
                }
                break;
            case '$':
                advance(lexer);

                // `$name` (not starting with a digit) or `${` starts an
                // interpolation; any other `$` is plain text.
                if ((is_valid_name_char(lexer) && !iswdigit(lexer->lookahead)) || lexer->lookahead == '{') {
                    return has_content;
                }
                break;
            case '-':
                if (is_after_variable) {
                    advance(lexer);
                    if (lexer->lookahead == '>') {
                        advance(lexer);
                        // `->name` right after a variable is a member access.
                        if (is_valid_name_char(lexer)) {
                            return has_content;
                        }
                        break;
                    }
                    break;
                }
                // Intentional fall through: when not after a variable, the
                // '[' case below just consumes the character as plain text.
            case '[':
                // `[` right after a variable starts an array access.
                if (is_after_variable) {
                    return has_content;
                }
                advance(lexer);
                break;
            case '{':
                advance(lexer);
                // `{$` starts a complex interpolation.
                if (lexer->lookahead == '$') {
                    return has_content;
                }
                break;
            default:
                if (lexer->eof(lexer)) {
                    return false;
                }
                advance(lexer);
        }

        // Only the very first character can be "directly after a variable".
        is_after_variable = false;
    }

    // Not reached: the loop above only exits through a return.
    return false;
}
383
// Main entry point of the external scanner.
//
// scanner:       persistent scanner state (stack of open heredocs).
// lexer:         lexer to read from; `result_symbol` is set to the token
//                kind that is being attempted.
// valid_symbols: array indexed by TokenType telling which tokens are
//                acceptable at this position.
//
// The token kinds are tried in a fixed priority order and the first one that
// is valid decides the outcome. Returns true when a token was recognized,
// false otherwise. Nothing is scanned when SENTINEL_ERROR is marked valid
// (error recovery mode).
static bool scan(Scanner *scanner, TSLexer *lexer, const bool *valid_symbols) {
    const bool is_error_recovery = valid_symbols[SENTINEL_ERROR];

    if (is_error_recovery) {
        return false;
    }

    scanner->has_leading_whitespace = false;

    lexer->mark_end(lexer);

    // --- String content tokens: delegated to scan_encapsed_part_string() ---

    if (valid_symbols[ENCAPSED_STRING_CHARS_AFTER_VARIABLE]) {
        lexer->result_symbol = ENCAPSED_STRING_CHARS_AFTER_VARIABLE;
        return scan_encapsed_part_string(scanner, lexer,
                                         /* is_after_variable */ true,
                                         /* is_heredoc */ false,
                                         /* is_execution_string */ false);
    }

    if (valid_symbols[ENCAPSED_STRING_CHARS]) {
        lexer->result_symbol = ENCAPSED_STRING_CHARS;
        return scan_encapsed_part_string(scanner, lexer,
                                         /* is_after_variable */ false,
                                         /* is_heredoc */ false,
                                         /* is_execution_string */ false);
    }

    if (valid_symbols[EXECUTION_STRING_CHARS_AFTER_VARIABLE]) {
        lexer->result_symbol = EXECUTION_STRING_CHARS_AFTER_VARIABLE;
        return scan_encapsed_part_string(scanner, lexer,
                                         /* is_after_variable */ true,
                                         /* is_heredoc */ false,
                                         /* is_execution_string */ true);
    }

    if (valid_symbols[EXECUTION_STRING_CHARS]) {
        lexer->result_symbol = EXECUTION_STRING_CHARS;
        return scan_encapsed_part_string(scanner, lexer,
                                         /* is_after_variable */ false,
                                         /* is_heredoc */ false,
                                         /* is_execution_string */ true);
    }

    if (valid_symbols[ENCAPSED_STRING_CHARS_AFTER_VARIABLE_HEREDOC]) {
        lexer->result_symbol = ENCAPSED_STRING_CHARS_AFTER_VARIABLE_HEREDOC;
        return scan_encapsed_part_string(scanner, lexer,
                                         /* is_after_variable */ true,
                                         /* is_heredoc */ true,
                                         /* is_execution_string */ false);
    }

    if (valid_symbols[ENCAPSED_STRING_CHARS_HEREDOC]) {
        lexer->result_symbol = ENCAPSED_STRING_CHARS_HEREDOC;
        return scan_encapsed_part_string(scanner, lexer,
                                         /* is_after_variable */ false,
                                         /* is_heredoc */ true,
                                         /* is_execution_string */ false);
    }

    if (valid_symbols[NOWDOC_STRING]) {
        lexer->result_symbol = NOWDOC_STRING;
        return scan_nowdoc_string(scanner, lexer);
    }

    // --- Heredoc end tag: must equal the identifier of the innermost heredoc ---

    if (valid_symbols[HEREDOC_END]) {
        lexer->result_symbol = HEREDOC_END;
        if (scanner->heredocs.size == 0) {
            return false;
        }

        // Struct copy of the innermost entry; its word contents are shared
        // with the entry that is still on the stack.
        Heredoc heredoc = *array_back(&scanner->heredocs);

        while (iswspace(lexer->lookahead)) {
            skip(lexer);
        }

        String word = scan_heredoc_word(lexer);
        if (!string_eq(&word, &heredoc.word)) {
            array_delete(&word);
            return false;
        }
        array_delete(&word);

        lexer->mark_end(lexer);
        // The heredoc is closed: pop it and release its identifier.
        array_delete(&array_pop(&scanner->heredocs).word);
        return true;
    }

    // Everything below is only attempted after whitespace and `//` comments;
    // a lone '/' makes the scan fail.
    if (!scan_whitespace(lexer)) {
        return false;
    }

    if (valid_symbols[EOF_TOKEN] && lexer->eof(lexer)) {
        lexer->result_symbol = EOF_TOKEN;
        return true;
    }

    // --- Heredoc start tag: remember the identifier on the heredoc stack ---

    if (valid_symbols[HEREDOC_START]) {
        lexer->result_symbol = HEREDOC_START;
        Heredoc heredoc = heredoc_new();

        while (iswspace(lexer->lookahead)) {
            skip(lexer);
        }

        heredoc.word = scan_heredoc_word(lexer);
        if (heredoc.word.size == 0) {
            array_delete(&heredoc.word);
            return false;
        }
        lexer->mark_end(lexer);

        // Ownership of the identifier moves to the stack entry.
        array_push(&scanner->heredocs, heredoc);
        return true;
    }

    // --- Automatic semicolon: recognized only when `?>` comes next ---
    // mark_end() is not called again on this path; the last call was the one
    // at the top of this function.

    if (valid_symbols[AUTOMATIC_SEMICOLON]) {
        lexer->result_symbol = AUTOMATIC_SEMICOLON;

        if (lexer->lookahead != '?') {
            return false;
        }

        advance(lexer);

        return lexer->lookahead == '>';
    }

    return false;
}
514
515static inline void *external_scanner_create() {
516 Scanner *scanner = ts_calloc(1, sizeof(Scanner));
517 array_init(&scanner->heredocs);
518 return scanner;
519}
520
521static inline unsigned external_scanner_serialize(void *payload, char *buffer) {
522 Scanner *scanner = (Scanner *)payload;
523 return serialize(scanner, buffer);
524}
525
526static inline void external_scanner_deserialize(void *payload, const char *buffer, unsigned length) {
527 Scanner *scanner = (Scanner *)payload;
528 deserialize(scanner, buffer, length);
529}
530
531static inline bool external_scanner_scan(void *payload, TSLexer *lexer, const bool *valid_symbols) {
532 Scanner *scanner = (Scanner *)payload;
533 return scan(scanner, lexer, valid_symbols);
534}
535
536static inline void external_scanner_destroy(void *payload) {
537 Scanner *scanner = (Scanner *)payload;
538 for (size_t i = 0; i < scanner->heredocs.size; i++) {
539 array_delete(&scanner->heredocs.contents[i].word);
540 }
541 array_delete(&scanner->heredocs);
542 ts_free(scanner);
543}