1#include "tree_sitter/array.h"
  2#include "tree_sitter/parser.h"
  3
  4#include <string.h>
  5#include <wchar.h>
  6#include <wctype.h>
  7
  8enum TokenType {
  9    AUTOMATIC_SEMICOLON,
 10    ENCAPSED_STRING_CHARS,
 11    ENCAPSED_STRING_CHARS_AFTER_VARIABLE,
 12    EXECUTION_STRING_CHARS,
 13    EXECUTION_STRING_CHARS_AFTER_VARIABLE,
 14    ENCAPSED_STRING_CHARS_HEREDOC,
 15    ENCAPSED_STRING_CHARS_AFTER_VARIABLE_HEREDOC,
 16    EOF_TOKEN,
 17    HEREDOC_START,
 18    HEREDOC_END,
 19    NOWDOC_STRING,
 20    SENTINEL_ERROR, // Unused token used to indicate error recovery mode
 21};
 22
// Growable sequence of 32-bit code units. In this file it holds heredoc
// delimiter words collected one lexer->lookahead value at a time.
typedef Array(int32_t) String;
 24
 25static inline bool string_eq(String *self, String *other) {
 26    if (self->size != other->size) {
 27        return false;
 28    }
 29    if (self->size == 0) {
 30        return self->size == other->size;
 31    }
 32    return memcmp(self->contents, other->contents, self->size * sizeof(self->contents[0])) == 0;
 33}
 34
// State kept for one heredoc/nowdoc that has been opened but not yet closed.
typedef struct {
    // Serialized, deserialized and reset to false; no code visible in this
    // file sets it to true or branches on it.
    bool end_word_indentation_allowed;
    // Delimiter identifier captured when HEREDOC_START is scanned and
    // compared against the candidate closing identifier.
    String word;
} Heredoc;
 39
 40#define heredoc_new()                                                                                                  \
 41    {                                                                                                                  \
 42        .end_word_indentation_allowed = false,                                                                         \
 43        .word = array_new(),                                                                                           \
 44    };
 45
// Persistent state of the external scanner.
typedef struct {
    // Cleared at the start of every scan() and on deserialize(); it is not
    // read anywhere in the code visible here.
    bool has_leading_whitespace;
    // Open heredocs in the order they were started; the last element is the
    // one whose body/closing tag is currently being scanned.
    Array(Heredoc) heredocs;
} Scanner;
 50
// Outcome codes for content scanning.
// NOTE(review): not referenced by any code visible in this file -- confirm
// whether it is still needed.
typedef enum { Error, End } ScanContentResult;
 52
 53static inline void reset_heredoc(Heredoc *heredoc) {
 54    array_delete(&heredoc->word);
 55    heredoc->end_word_indentation_allowed = false;
 56}
 57
 58static inline void advance(TSLexer *lexer) { lexer->advance(lexer, false); }
 59
 60static inline void skip(TSLexer *lexer) { lexer->advance(lexer, true); }
 61
 62static unsigned serialize(Scanner *scanner, char *buffer) {
 63    unsigned size = 0;
 64
 65    buffer[size++] = (char)scanner->heredocs.size;
 66    for (unsigned j = 0; j < scanner->heredocs.size; j++) {
 67        Heredoc *heredoc = &scanner->heredocs.contents[j];
 68        unsigned word_size = heredoc->word.size * sizeof(heredoc->word.contents[0]);
 69        if (size + 5 + word_size >= TREE_SITTER_SERIALIZATION_BUFFER_SIZE) {
 70            return 0;
 71        }
 72        buffer[size++] = (char)heredoc->end_word_indentation_allowed;
 73        memcpy(&buffer[size], &heredoc->word.size, sizeof(uint32_t));
 74        size += sizeof(uint32_t);
 75        if (heredoc->word.size > 0) {
 76            memcpy(&buffer[size], heredoc->word.contents, word_size);
 77            size += word_size;
 78        }
 79    }
 80
 81    return size;
 82}
 83
 84static void deserialize(Scanner *scanner, const char *buffer, unsigned length) {
 85    unsigned size = 0;
 86    scanner->has_leading_whitespace = false;
 87
 88    for (uint32_t i = 0; i < scanner->heredocs.size; i++) {
 89        reset_heredoc(array_get(&scanner->heredocs, i));
 90    }
 91
 92    if (length == 0) {
 93        return;
 94    }
 95
 96    uint8_t open_heredoc_count = buffer[size++];
 97    for (unsigned i = 0; i < open_heredoc_count; i++) {
 98        Heredoc *heredoc = NULL;
 99        if (i < scanner->heredocs.size) {
100            heredoc = array_get(&scanner->heredocs, i);
101        } else {
102            Heredoc new_heredoc = heredoc_new();
103            array_push(&scanner->heredocs, new_heredoc);
104            heredoc = array_back(&scanner->heredocs);
105        }
106
107        heredoc->end_word_indentation_allowed = buffer[size++];
108        memcpy(&heredoc->word.size, &buffer[size], sizeof(uint32_t));
109        size += sizeof(uint32_t);
110        unsigned word_size = heredoc->word.size * sizeof(heredoc->word.contents[0]);
111        if (word_size > 0) {
112            array_reserve(&heredoc->word, heredoc->word.size);
113            memcpy(heredoc->word.contents, &buffer[size], word_size);
114            size += word_size;
115        }
116    }
117
118    assert(size == length);
119}
120
121static inline bool scan_whitespace(TSLexer *lexer) {
122    for (;;) {
123        while (iswspace(lexer->lookahead)) {
124            advance(lexer);
125        }
126
127        if (lexer->lookahead == '/') {
128            advance(lexer);
129
130            if (lexer->lookahead == '/') {
131                advance(lexer);
132                while (lexer->lookahead != 0 && lexer->lookahead != '\n') {
133                    advance(lexer);
134                }
135            } else {
136                return false;
137            }
138        } else {
139            return true;
140        }
141    }
142}
143
144static inline bool is_valid_name_char(TSLexer *lexer) {
145    return iswalnum(lexer->lookahead) || lexer->lookahead == '_' || lexer->lookahead >= 0x80;
146}
147
148static inline bool is_escapable_sequence(TSLexer *lexer) {
149    // Note: remember to also update the escape_sequence rule in the
150    // main grammar whenever changing this method
151    int32_t letter = lexer->lookahead;
152
153    if (letter == 'n' || letter == 'r' || letter == 't' || letter == 'v' || letter == 'e' || letter == 'f' ||
154        letter == '\\' || letter == '$' || letter == '"') {
155        return true;
156    }
157
158    // Hex
159    if (letter == 'x') {
160        advance(lexer);
161        return iswxdigit(lexer->lookahead);
162    }
163
164    // Unicode
165    if (letter == 'u') {
166        return true; // We handle the case where this is not really an escape
167                     // sequence in grammar.js - this is needed to support the
168                     // edge case "\u{$a}" in which case "\u" is to be
169                     // interpreted as characters and {$a} as a variable
170    }
171
172    // Octal
173    return iswdigit(lexer->lookahead) && lexer->lookahead >= '0' && lexer->lookahead <= '7';
174}
175
176static String scan_heredoc_word(TSLexer *lexer) {
177    String result = (String)array_new();
178
179    while (is_valid_name_char(lexer)) {
180        array_push(&result, lexer->lookahead);
181        advance(lexer);
182    }
183
184    return result;
185}
186
187static inline bool scan_nowdoc_string(Scanner *scanner, TSLexer *lexer) {
188    bool has_consumed_content = false;
189    if (scanner->heredocs.size == 0) {
190        return false;
191    }
192
193    // While PHP requires the nowdoc end tag to be the very first on a new line,
194    // there may be an arbitrary amount of whitespace before the closing token
195    while (iswspace(lexer->lookahead)) {
196        advance(lexer);
197        has_consumed_content = true;
198    }
199
200    bool end_tag_matched = false;
201    String heredoc_tag = array_back(&scanner->heredocs)->word;
202
203    for (uint32_t i = 0; i < heredoc_tag.size; i++) {
204        if (lexer->lookahead != heredoc_tag.contents[i]) {
205            break;
206        }
207        advance(lexer);
208        has_consumed_content = true;
209
210        end_tag_matched = (i == heredoc_tag.size - 1 && (iswspace(lexer->lookahead) || lexer->lookahead == ';' ||
211                                                         lexer->lookahead == ',' || lexer->lookahead == ')'));
212    }
213
214    if (end_tag_matched) {
215        // There may be an arbitrary amount of white space after the end tag
216        while (iswspace(lexer->lookahead) && lexer->lookahead != '\r' && lexer->lookahead != '\n') {
217            advance(lexer);
218            has_consumed_content = true;
219        }
220
221        // Return to allow the end tag parsing if we've encountered an end tag
222        // at a valid position
223        if (lexer->lookahead == ';' || lexer->lookahead == ',' || lexer->lookahead == ')' || lexer->lookahead == '\n' ||
224            lexer->lookahead == '\r') {
225            // , and ) is needed to support heredoc in function arguments
226            return false;
227        }
228    }
229
230    for (bool has_content = has_consumed_content;; has_content = true) {
231        lexer->mark_end(lexer);
232
233        switch (lexer->lookahead) {
234            case '\n':
235            case '\r':
236                return has_content;
237            default:
238                if (lexer->eof(lexer)) {
239                    return false;
240                }
241                advance(lexer);
242        }
243    }
244
245    return false;
246}
247
248static bool scan_encapsed_part_string(Scanner *scanner, TSLexer *lexer, bool is_after_variable, bool is_heredoc,
249                                      bool is_execution_string) {
250    bool has_consumed_content = false;
251
252    if (is_heredoc && scanner->heredocs.size > 0) {
253        // While PHP requires the heredoc end tag to be the very first on a new
254        // line, there may be an arbitrary amount of whitespace before the
255        // closing token However, we should not consume \r or \n
256        while (iswspace(lexer->lookahead) && lexer->lookahead != '\r' && lexer->lookahead != '\n') {
257            advance(lexer);
258            has_consumed_content = true;
259        }
260
261        String heredoc_tag = array_back(&scanner->heredocs)->word;
262
263        bool end_tag_matched = false;
264
265        for (uint32_t i = 0; i < heredoc_tag.size; i++) {
266            if (lexer->lookahead != heredoc_tag.contents[i]) {
267                break;
268            }
269            has_consumed_content = true;
270            advance(lexer);
271
272            end_tag_matched = (i == heredoc_tag.size - 1 && (iswspace(lexer->lookahead) || lexer->lookahead == ';' ||
273                                                             lexer->lookahead == ',' || lexer->lookahead == ')'));
274        }
275
276        if (end_tag_matched) {
277            // There may be an arbitrary amount of white space after the end tag
278            // However, we should not consume \r or \n
279            while (iswspace(lexer->lookahead) && lexer->lookahead != '\r' && lexer->lookahead != '\n') {
280                advance(lexer);
281                has_consumed_content = true;
282            }
283
284            // Return to allow the end tag parsing if we've encountered an end
285            // tag at a valid position
286            if (lexer->lookahead == ';' || lexer->lookahead == ',' || lexer->lookahead == ')' ||
287                lexer->lookahead == '\n' || lexer->lookahead == '\r') {
288                // , and ) is needed to support heredoc in function arguments
289                return false;
290            }
291        }
292    }
293
294    for (bool has_content = has_consumed_content;; has_content = true) {
295        lexer->mark_end(lexer);
296
297        switch (lexer->lookahead) {
298            case '"':
299                if (!is_heredoc && !is_execution_string) {
300                    return has_content;
301                }
302                advance(lexer);
303                break;
304            case '`':
305                if (is_execution_string) {
306                    return has_content;
307                }
308                advance(lexer);
309                break;
310            case '\n':
311            case '\r':
312                if (is_heredoc) {
313                    return has_content;
314                }
315                advance(lexer);
316                break;
317            case '\\':
318                advance(lexer);
319
320                // \{ should not be interpreted as an escape sequence, but both
321                // should be consumed as normal characters
322                if (lexer->lookahead == '{') {
323                    advance(lexer);
324                    break;
325                }
326
327                if (is_execution_string && lexer->lookahead == '`') {
328                    return has_content;
329                }
330
331                if (is_heredoc && lexer->lookahead == '\\') {
332                    advance(lexer);
333                    break;
334                }
335
336                if (is_escapable_sequence(lexer)) {
337                    return has_content;
338                }
339                break;
340            case '$':
341                advance(lexer);
342
343                if ((is_valid_name_char(lexer) && !iswdigit(lexer->lookahead)) || lexer->lookahead == '{') {
344                    return has_content;
345                }
346                break;
347            case '-':
348                if (is_after_variable) {
349                    advance(lexer);
350                    if (lexer->lookahead == '>') {
351                        advance(lexer);
352                        if (is_valid_name_char(lexer)) {
353                            return has_content;
354                        }
355                        break;
356                    }
357                    break;
358                }
359            case '[':
360                if (is_after_variable) {
361                    return has_content;
362                }
363                advance(lexer);
364                break;
365            case '{':
366                advance(lexer);
367                if (lexer->lookahead == '$') {
368                    return has_content;
369                }
370                break;
371            default:
372                if (lexer->eof(lexer)) {
373                    return false;
374                }
375                advance(lexer);
376        }
377
378        is_after_variable = false;
379    }
380
381    return false;
382}
383
384static bool scan(Scanner *scanner, TSLexer *lexer, const bool *valid_symbols) {
385    const bool is_error_recovery = valid_symbols[SENTINEL_ERROR];
386
387    if (is_error_recovery) {
388        return false;
389    }
390
391    scanner->has_leading_whitespace = false;
392
393    lexer->mark_end(lexer);
394
395    if (valid_symbols[ENCAPSED_STRING_CHARS_AFTER_VARIABLE]) {
396        lexer->result_symbol = ENCAPSED_STRING_CHARS_AFTER_VARIABLE;
397        return scan_encapsed_part_string(scanner, lexer,
398                                         /* is_after_variable */ true,
399                                         /* is_heredoc */ false,
400                                         /* is_execution_string */ false);
401    }
402
403    if (valid_symbols[ENCAPSED_STRING_CHARS]) {
404        lexer->result_symbol = ENCAPSED_STRING_CHARS;
405        return scan_encapsed_part_string(scanner, lexer,
406                                         /* is_after_variable */ false,
407                                         /* is_heredoc */ false,
408                                         /* is_execution_string */ false);
409    }
410
411    if (valid_symbols[EXECUTION_STRING_CHARS_AFTER_VARIABLE]) {
412        lexer->result_symbol = EXECUTION_STRING_CHARS_AFTER_VARIABLE;
413        return scan_encapsed_part_string(scanner, lexer,
414                                         /* is_after_variable */ true,
415                                         /* is_heredoc */ false,
416                                         /* is_execution_string */ true);
417    }
418
419    if (valid_symbols[EXECUTION_STRING_CHARS]) {
420        lexer->result_symbol = EXECUTION_STRING_CHARS;
421        return scan_encapsed_part_string(scanner, lexer,
422                                         /* is_after_variable */ false,
423                                         /* is_heredoc */ false,
424                                         /* is_execution_string */ true);
425    }
426
427    if (valid_symbols[ENCAPSED_STRING_CHARS_AFTER_VARIABLE_HEREDOC]) {
428        lexer->result_symbol = ENCAPSED_STRING_CHARS_AFTER_VARIABLE_HEREDOC;
429        return scan_encapsed_part_string(scanner, lexer,
430                                         /* is_after_variable */ true,
431                                         /* is_heredoc */ true,
432                                         /* is_execution_string */ false);
433    }
434
435    if (valid_symbols[ENCAPSED_STRING_CHARS_HEREDOC]) {
436        lexer->result_symbol = ENCAPSED_STRING_CHARS_HEREDOC;
437        return scan_encapsed_part_string(scanner, lexer,
438                                         /* is_after_variable */ false,
439                                         /* is_heredoc */ true,
440                                         /* is_execution_string */ false);
441    }
442
443    if (valid_symbols[NOWDOC_STRING]) {
444        lexer->result_symbol = NOWDOC_STRING;
445        return scan_nowdoc_string(scanner, lexer);
446    }
447
448    if (valid_symbols[HEREDOC_END]) {
449        lexer->result_symbol = HEREDOC_END;
450        if (scanner->heredocs.size == 0) {
451            return false;
452        }
453
454        Heredoc heredoc = *array_back(&scanner->heredocs);
455
456        while (iswspace(lexer->lookahead)) {
457            skip(lexer);
458        }
459
460        String word = scan_heredoc_word(lexer);
461        if (!string_eq(&word, &heredoc.word)) {
462            array_delete(&word);
463            return false;
464        }
465        array_delete(&word);
466
467        lexer->mark_end(lexer);
468        array_delete(&array_pop(&scanner->heredocs).word);
469        return true;
470    }
471
472    if (!scan_whitespace(lexer)) {
473        return false;
474    }
475
476    if (valid_symbols[EOF_TOKEN] && lexer->eof(lexer)) {
477        lexer->result_symbol = EOF_TOKEN;
478        return true;
479    }
480
481    if (valid_symbols[HEREDOC_START]) {
482        lexer->result_symbol = HEREDOC_START;
483        Heredoc heredoc = heredoc_new();
484
485        while (iswspace(lexer->lookahead)) {
486            skip(lexer);
487        }
488
489        heredoc.word = scan_heredoc_word(lexer);
490        if (heredoc.word.size == 0) {
491            array_delete(&heredoc.word);
492            return false;
493        }
494        lexer->mark_end(lexer);
495
496        array_push(&scanner->heredocs, heredoc);
497        return true;
498    }
499
500    if (valid_symbols[AUTOMATIC_SEMICOLON]) {
501        lexer->result_symbol = AUTOMATIC_SEMICOLON;
502
503        if (lexer->lookahead != '?') {
504            return false;
505        }
506
507        advance(lexer);
508
509        return lexer->lookahead == '>';
510    }
511
512    return false;
513}
514
515static inline void *external_scanner_create() {
516    Scanner *scanner = ts_calloc(1, sizeof(Scanner));
517    array_init(&scanner->heredocs);
518    return scanner;
519}
520
521static inline unsigned external_scanner_serialize(void *payload, char *buffer) {
522    Scanner *scanner = (Scanner *)payload;
523    return serialize(scanner, buffer);
524}
525
526static inline void external_scanner_deserialize(void *payload, const char *buffer, unsigned length) {
527    Scanner *scanner = (Scanner *)payload;
528    deserialize(scanner, buffer, length);
529}
530
531static inline bool external_scanner_scan(void *payload, TSLexer *lexer, const bool *valid_symbols) {
532    Scanner *scanner = (Scanner *)payload;
533    return scan(scanner, lexer, valid_symbols);
534}
535
536static inline void external_scanner_destroy(void *payload) {
537    Scanner *scanner = (Scanner *)payload;
538    for (size_t i = 0; i < scanner->heredocs.size; i++) {
539        array_delete(&scanner->heredocs.contents[i].word);
540    }
541    array_delete(&scanner->heredocs);
542    ts_free(scanner);
543}