queries
bash.scm c.scm cpp.scm css.scm dockerfile.scm go.scm html.scm javascript.scm lua.scm markdown.scm php.scm python.scm sql.scm tsx.scm typescript.scmsamples
format.txt lsp.c ollama.py test.c test.cpp test.css test.dockerfile test.html test.js test.lua test.md test.php test.py test.rb test.sh test.sql test.ts test.tsxvendor
github.com
mitjafelicijan
go-tree-sitter
.gitignore LICENSE Makefile README.md alloc.c alloc.h api.h array.h atomic.h bindings.c bindings.go bindings.h bits.h clock.h error_costs.h get_changed_ranges.c get_changed_ranges.h host.h iter.go language.c language.h length.h lexer.c lexer.h node.c parser.c parser.h point.h ptypes.h query.c reduce_action.h reusable_node.h stack.c stack.h subtree.c subtree.h test_grammar.go test_grammar.js test_grammar_generate.sh tree.c tree.h tree_cursor.c tree_cursor.h umachine.h unicode.h urename.h utf.h utf16.h utf8.h wasm_store.c wasm_store.hnsf
termbox-go
AUTHORS LICENSE README.md api.go api_common.go api_windows.go collect_terminfo.py escwait.go escwait_darwin.go syscalls_darwin.go syscalls_darwin_amd64.go syscalls_dragonfly.go syscalls_freebsd.go syscalls_linux.go syscalls_netbsd.go syscalls_openbsd.go syscalls_windows.go termbox.go termbox_common.go termbox_windows.go terminfo.go terminfo_builtin.go
vendor/github.com/mitjafelicijan/go-tree-sitter/php/scanner.h
raw
#include "tree_sitter/array.h"
#include "tree_sitter/parser.h"

#include <assert.h>
#include <stdint.h>
#include <string.h>
#include <wchar.h>
#include <wctype.h>
7
/**
 * External token kinds handled by this scanner.
 *
 * The enumerators are used as indices into the `valid_symbols` array passed
 * to `scan` and are stored in `lexer->result_symbol`.
 * NOTE(review): the order presumably has to match the `externals` list of the
 * grammar - confirm against grammar.js before reordering.
 */
enum TokenType {
    AUTOMATIC_SEMICOLON,
    ENCAPSED_STRING_CHARS,
    ENCAPSED_STRING_CHARS_AFTER_VARIABLE,
    EXECUTION_STRING_CHARS,
    EXECUTION_STRING_CHARS_AFTER_VARIABLE,
    ENCAPSED_STRING_CHARS_HEREDOC,
    ENCAPSED_STRING_CHARS_AFTER_VARIABLE_HEREDOC,
    EOF_TOKEN,
    HEREDOC_START,
    HEREDOC_END,
    NOWDOC_STRING,
    SENTINEL_ERROR, // Unused token used to indicate error recovery mode
};
22
23typedef Array(int32_t) String;
24
25static inline bool string_eq(String *self, String *other) {
26 if (self->size != other->size) {
27 return false;
28 }
29 if (self->size == 0) {
30 return self->size == other->size;
31 }
32 return memcmp(self->contents, other->contents, self->size * sizeof(self->contents[0])) == 0;
33}
34
35typedef struct {
36 bool end_word_indentation_allowed;
37 String word;
38} Heredoc;
39
// Initializer for an empty Heredoc. Intended for use in a declaration:
//     Heredoc h = heredoc_new();
// The expansion deliberately has no trailing semicolon; the caller supplies it.
#define heredoc_new()                          \
    {                                          \
        .end_word_indentation_allowed = false, \
        .word = array_new(),                   \
    }
45
46typedef struct {
47 bool has_leading_whitespace;
48 Array(Heredoc) heredocs;
49} Scanner;
50
// Outcome of a content scan. NOTE(review): not referenced anywhere in the
// visible part of this file - confirm whether it is still needed.
typedef enum { Error, End } ScanContentResult;
52
53static inline void reset_heredoc(Heredoc *heredoc) {
54 array_delete(&heredoc->word);
55 heredoc->end_word_indentation_allowed = false;
56}
57
58static inline void advance(TSLexer *lexer) { lexer->advance(lexer, false); }
59
60static inline void skip(TSLexer *lexer) { lexer->advance(lexer, true); }
61
62static unsigned serialize(Scanner *scanner, char *buffer) {
63 unsigned size = 0;
64
65 buffer[size++] = (char)scanner->heredocs.size;
66 for (unsigned j = 0; j < scanner->heredocs.size; j++) {
67 Heredoc *heredoc = &scanner->heredocs.contents[j];
68 unsigned word_size = heredoc->word.size * sizeof(heredoc->word.contents[0]);
69 if (size + 5 + word_size >= TREE_SITTER_SERIALIZATION_BUFFER_SIZE) {
70 return 0;
71 }
72 buffer[size++] = (char)heredoc->end_word_indentation_allowed;
73 memcpy(&buffer[size], &heredoc->word.size, sizeof(uint32_t));
74 size += sizeof(uint32_t);
75 if (heredoc->word.size > 0) {
76 memcpy(&buffer[size], heredoc->word.contents, word_size);
77 size += word_size;
78 }
79 }
80
81 return size;
82}
83
84static void deserialize(Scanner *scanner, const char *buffer, unsigned length) {
85 unsigned size = 0;
86 scanner->has_leading_whitespace = false;
87
88 for (uint32_t i = 0; i < scanner->heredocs.size; i++) {
89 reset_heredoc(array_get(&scanner->heredocs, i));
90 }
91
92 if (length == 0) {
93 return;
94 }
95
96 uint8_t open_heredoc_count = buffer[size++];
97 for (unsigned i = 0; i < open_heredoc_count; i++) {
98 Heredoc *heredoc = NULL;
99 if (i < scanner->heredocs.size) {
100 heredoc = array_get(&scanner->heredocs, i);
101 } else {
102 Heredoc new_heredoc = heredoc_new();
103 array_push(&scanner->heredocs, new_heredoc);
104 heredoc = array_back(&scanner->heredocs);
105 }
106
107 heredoc->end_word_indentation_allowed = buffer[size++];
108 memcpy(&heredoc->word.size, &buffer[size], sizeof(uint32_t));
109 size += sizeof(uint32_t);
110 unsigned word_size = heredoc->word.size * sizeof(heredoc->word.contents[0]);
111 if (word_size > 0) {
112 array_reserve(&heredoc->word, heredoc->word.size);
113 memcpy(heredoc->word.contents, &buffer[size], word_size);
114 size += word_size;
115 }
116 }
117
118 assert(size == length);
119}
120
121static inline bool scan_whitespace(TSLexer *lexer) {
122 for (;;) {
123 while (iswspace(lexer->lookahead)) {
124 advance(lexer);
125 }
126
127 if (lexer->lookahead == '/') {
128 advance(lexer);
129
130 if (lexer->lookahead == '/') {
131 advance(lexer);
132 while (lexer->lookahead != 0 && lexer->lookahead != '\n') {
133 advance(lexer);
134 }
135 } else {
136 return false;
137 }
138 } else {
139 return true;
140 }
141 }
142}
143
144static inline bool is_valid_name_char(TSLexer *lexer) {
145 return iswalnum(lexer->lookahead) || lexer->lookahead == '_' || lexer->lookahead >= 0x80;
146}
147
148static inline bool is_escapable_sequence(TSLexer *lexer) {
149 // Note: remember to also update the escape_sequence rule in the
150 // main grammar whenever changing this method
151 int32_t letter = lexer->lookahead;
152
153 if (letter == 'n' || letter == 'r' || letter == 't' || letter == 'v' || letter == 'e' || letter == 'f' ||
154 letter == '\\' || letter == '$' || letter == '"') {
155 return true;
156 }
157
158 // Hex
159 if (letter == 'x') {
160 advance(lexer);
161 return iswxdigit(lexer->lookahead);
162 }
163
164 // Unicode
165 if (letter == 'u') {
166 return true; // We handle the case where this is not really an escape
167 // sequence in grammar.js - this is needed to support the
168 // edge case "\u{$a}" in which case "\u" is to be
169 // interpreted as characters and {$a} as a variable
170 }
171
172 // Octal
173 return iswdigit(lexer->lookahead) && lexer->lookahead >= '0' && lexer->lookahead <= '7';
174}
175
176static String scan_heredoc_word(TSLexer *lexer) {
177 String result = (String)array_new();
178
179 while (is_valid_name_char(lexer)) {
180 array_push(&result, lexer->lookahead);
181 advance(lexer);
182 }
183
184 return result;
185}
186
187static inline bool scan_nowdoc_string(Scanner *scanner, TSLexer *lexer) {
188 bool has_consumed_content = false;
189 if (scanner->heredocs.size == 0) {
190 return false;
191 }
192
193 // While PHP requires the nowdoc end tag to be the very first on a new line,
194 // there may be an arbitrary amount of whitespace before the closing token
195 while (iswspace(lexer->lookahead)) {
196 advance(lexer);
197 has_consumed_content = true;
198 }
199
200 bool end_tag_matched = false;
201 String heredoc_tag = array_back(&scanner->heredocs)->word;
202
203 for (uint32_t i = 0; i < heredoc_tag.size; i++) {
204 if (lexer->lookahead != heredoc_tag.contents[i]) {
205 break;
206 }
207 advance(lexer);
208 has_consumed_content = true;
209
210 end_tag_matched = (i == heredoc_tag.size - 1 && (iswspace(lexer->lookahead) || lexer->lookahead == ';' ||
211 lexer->lookahead == ',' || lexer->lookahead == ')'));
212 }
213
214 if (end_tag_matched) {
215 // There may be an arbitrary amount of white space after the end tag
216 while (iswspace(lexer->lookahead) && lexer->lookahead != '\r' && lexer->lookahead != '\n') {
217 advance(lexer);
218 has_consumed_content = true;
219 }
220
221 // Return to allow the end tag parsing if we've encountered an end tag
222 // at a valid position
223 if (lexer->lookahead == ';' || lexer->lookahead == ',' || lexer->lookahead == ')' || lexer->lookahead == '\n' ||
224 lexer->lookahead == '\r') {
225 // , and ) is needed to support heredoc in function arguments
226 return false;
227 }
228 }
229
230 for (bool has_content = has_consumed_content;; has_content = true) {
231 lexer->mark_end(lexer);
232
233 switch (lexer->lookahead) {
234 case '\n':
235 case '\r':
236 return has_content;
237 default:
238 if (lexer->eof(lexer)) {
239 return false;
240 }
241 advance(lexer);
242 }
243 }
244
245 return false;
246}
247
248static bool scan_encapsed_part_string(Scanner *scanner, TSLexer *lexer, bool is_after_variable, bool is_heredoc,
249 bool is_execution_string) {
250 bool has_consumed_content = false;
251
252 if (is_heredoc && scanner->heredocs.size > 0) {
253 // While PHP requires the heredoc end tag to be the very first on a new
254 // line, there may be an arbitrary amount of whitespace before the
255 // closing token However, we should not consume \r or \n
256 while (iswspace(lexer->lookahead) && lexer->lookahead != '\r' && lexer->lookahead != '\n') {
257 advance(lexer);
258 has_consumed_content = true;
259 }
260
261 String heredoc_tag = array_back(&scanner->heredocs)->word;
262
263 bool end_tag_matched = false;
264
265 for (uint32_t i = 0; i < heredoc_tag.size; i++) {
266 if (lexer->lookahead != heredoc_tag.contents[i]) {
267 break;
268 }
269 has_consumed_content = true;
270 advance(lexer);
271
272 end_tag_matched = (i == heredoc_tag.size - 1 && (iswspace(lexer->lookahead) || lexer->lookahead == ';' ||
273 lexer->lookahead == ',' || lexer->lookahead == ')'));
274 }
275
276 if (end_tag_matched) {
277 // There may be an arbitrary amount of white space after the end tag
278 // However, we should not consume \r or \n
279 while (iswspace(lexer->lookahead) && lexer->lookahead != '\r' && lexer->lookahead != '\n') {
280 advance(lexer);
281 has_consumed_content = true;
282 }
283
284 // Return to allow the end tag parsing if we've encountered an end
285 // tag at a valid position
286 if (lexer->lookahead == ';' || lexer->lookahead == ',' || lexer->lookahead == ')' ||
287 lexer->lookahead == '\n' || lexer->lookahead == '\r') {
288 // , and ) is needed to support heredoc in function arguments
289 return false;
290 }
291 }
292 }
293
294 for (bool has_content = has_consumed_content;; has_content = true) {
295 lexer->mark_end(lexer);
296
297 switch (lexer->lookahead) {
298 case '"':
299 if (!is_heredoc && !is_execution_string) {
300 return has_content;
301 }
302 advance(lexer);
303 break;
304 case '`':
305 if (is_execution_string) {
306 return has_content;
307 }
308 advance(lexer);
309 break;
310 case '\n':
311 case '\r':
312 if (is_heredoc) {
313 return has_content;
314 }
315 advance(lexer);
316 break;
317 case '\\':
318 advance(lexer);
319
320 // \{ should not be interpreted as an escape sequence, but both
321 // should be consumed as normal characters
322 if (lexer->lookahead == '{') {
323 advance(lexer);
324 break;
325 }
326
327 if (is_execution_string && lexer->lookahead == '`') {
328 return has_content;
329 }
330
331 if (is_heredoc && lexer->lookahead == '\\') {
332 advance(lexer);
333 break;
334 }
335
336 if (is_escapable_sequence(lexer)) {
337 return has_content;
338 }
339 break;
340 case '$':
341 advance(lexer);
342
343 if ((is_valid_name_char(lexer) && !iswdigit(lexer->lookahead)) || lexer->lookahead == '{') {
344 return has_content;
345 }
346 break;
347 case '-':
348 if (is_after_variable) {
349 advance(lexer);
350 if (lexer->lookahead == '>') {
351 advance(lexer);
352 if (is_valid_name_char(lexer)) {
353 return has_content;
354 }
355 break;
356 }
357 break;
358 }
359 case '[':
360 if (is_after_variable) {
361 return has_content;
362 }
363 advance(lexer);
364 break;
365 case '{':
366 advance(lexer);
367 if (lexer->lookahead == '$') {
368 return has_content;
369 }
370 break;
371 default:
372 if (lexer->eof(lexer)) {
373 return false;
374 }
375 advance(lexer);
376 }
377
378 is_after_variable = false;
379 }
380
381 return false;
382}
383
384static bool scan(Scanner *scanner, TSLexer *lexer, const bool *valid_symbols) {
385 const bool is_error_recovery = valid_symbols[SENTINEL_ERROR];
386
387 if (is_error_recovery) {
388 return false;
389 }
390
391 scanner->has_leading_whitespace = false;
392
393 lexer->mark_end(lexer);
394
395 if (valid_symbols[ENCAPSED_STRING_CHARS_AFTER_VARIABLE]) {
396 lexer->result_symbol = ENCAPSED_STRING_CHARS_AFTER_VARIABLE;
397 return scan_encapsed_part_string(scanner, lexer,
398 /* is_after_variable */ true,
399 /* is_heredoc */ false,
400 /* is_execution_string */ false);
401 }
402
403 if (valid_symbols[ENCAPSED_STRING_CHARS]) {
404 lexer->result_symbol = ENCAPSED_STRING_CHARS;
405 return scan_encapsed_part_string(scanner, lexer,
406 /* is_after_variable */ false,
407 /* is_heredoc */ false,
408 /* is_execution_string */ false);
409 }
410
411 if (valid_symbols[EXECUTION_STRING_CHARS_AFTER_VARIABLE]) {
412 lexer->result_symbol = EXECUTION_STRING_CHARS_AFTER_VARIABLE;
413 return scan_encapsed_part_string(scanner, lexer,
414 /* is_after_variable */ true,
415 /* is_heredoc */ false,
416 /* is_execution_string */ true);
417 }
418
419 if (valid_symbols[EXECUTION_STRING_CHARS]) {
420 lexer->result_symbol = EXECUTION_STRING_CHARS;
421 return scan_encapsed_part_string(scanner, lexer,
422 /* is_after_variable */ false,
423 /* is_heredoc */ false,
424 /* is_execution_string */ true);
425 }
426
427 if (valid_symbols[ENCAPSED_STRING_CHARS_AFTER_VARIABLE_HEREDOC]) {
428 lexer->result_symbol = ENCAPSED_STRING_CHARS_AFTER_VARIABLE_HEREDOC;
429 return scan_encapsed_part_string(scanner, lexer,
430 /* is_after_variable */ true,
431 /* is_heredoc */ true,
432 /* is_execution_string */ false);
433 }
434
435 if (valid_symbols[ENCAPSED_STRING_CHARS_HEREDOC]) {
436 lexer->result_symbol = ENCAPSED_STRING_CHARS_HEREDOC;
437 return scan_encapsed_part_string(scanner, lexer,
438 /* is_after_variable */ false,
439 /* is_heredoc */ true,
440 /* is_execution_string */ false);
441 }
442
443 if (valid_symbols[NOWDOC_STRING]) {
444 lexer->result_symbol = NOWDOC_STRING;
445 return scan_nowdoc_string(scanner, lexer);
446 }
447
448 if (valid_symbols[HEREDOC_END]) {
449 lexer->result_symbol = HEREDOC_END;
450 if (scanner->heredocs.size == 0) {
451 return false;
452 }
453
454 Heredoc heredoc = *array_back(&scanner->heredocs);
455
456 while (iswspace(lexer->lookahead)) {
457 skip(lexer);
458 }
459
460 String word = scan_heredoc_word(lexer);
461 if (!string_eq(&word, &heredoc.word)) {
462 array_delete(&word);
463 return false;
464 }
465 array_delete(&word);
466
467 lexer->mark_end(lexer);
468 array_delete(&array_pop(&scanner->heredocs).word);
469 return true;
470 }
471
472 if (!scan_whitespace(lexer)) {
473 return false;
474 }
475
476 if (valid_symbols[EOF_TOKEN] && lexer->eof(lexer)) {
477 lexer->result_symbol = EOF_TOKEN;
478 return true;
479 }
480
481 if (valid_symbols[HEREDOC_START]) {
482 lexer->result_symbol = HEREDOC_START;
483 Heredoc heredoc = heredoc_new();
484
485 while (iswspace(lexer->lookahead)) {
486 skip(lexer);
487 }
488
489 heredoc.word = scan_heredoc_word(lexer);
490 if (heredoc.word.size == 0) {
491 array_delete(&heredoc.word);
492 return false;
493 }
494 lexer->mark_end(lexer);
495
496 array_push(&scanner->heredocs, heredoc);
497 return true;
498 }
499
500 if (valid_symbols[AUTOMATIC_SEMICOLON]) {
501 lexer->result_symbol = AUTOMATIC_SEMICOLON;
502
503 if (lexer->lookahead != '?') {
504 return false;
505 }
506
507 advance(lexer);
508
509 return lexer->lookahead == '>';
510 }
511
512 return false;
513}
514
515static inline void *external_scanner_create() {
516 Scanner *scanner = ts_calloc(1, sizeof(Scanner));
517 array_init(&scanner->heredocs);
518 return scanner;
519}
520
521static inline unsigned external_scanner_serialize(void *payload, char *buffer) {
522 Scanner *scanner = (Scanner *)payload;
523 return serialize(scanner, buffer);
524}
525
526static inline void external_scanner_deserialize(void *payload, const char *buffer, unsigned length) {
527 Scanner *scanner = (Scanner *)payload;
528 deserialize(scanner, buffer, length);
529}
530
531static inline bool external_scanner_scan(void *payload, TSLexer *lexer, const bool *valid_symbols) {
532 Scanner *scanner = (Scanner *)payload;
533 return scan(scanner, lexer, valid_symbols);
534}
535
536static inline void external_scanner_destroy(void *payload) {
537 Scanner *scanner = (Scanner *)payload;
538 for (size_t i = 0; i < scanner->heredocs.size; i++) {
539 array_delete(&scanner->heredocs.contents[i].word);
540 }
541 array_delete(&scanner->heredocs);
542 ts_free(scanner);
543}