1#include "parser.h"
  2
  3#include <wctype.h>
  4
  5enum TokenType {
  6    AUTOMATIC_SEMICOLON,
  7    TEMPLATE_CHARS,
  8    TERNARY_QMARK,
  9    HTML_COMMENT,
 10    LOGICAL_OR,
 11    ESCAPE_SEQUENCE,
 12    REGEX_PATTERN,
 13};
 14
 15void *tree_sitter_javascript_external_scanner_create() { return NULL; }
 16
 17void tree_sitter_javascript_external_scanner_destroy(void *p) {}
 18
 19unsigned tree_sitter_javascript_external_scanner_serialize(void *p, char *buffer) { return 0; }
 20
 21void tree_sitter_javascript_external_scanner_deserialize(void *p, const char *b, unsigned n) {}
 22
 23static void advance(TSLexer *lexer) { lexer->advance(lexer, false); }
 24
 25static void skip(TSLexer *lexer) { lexer->advance(lexer, true); }
 26
 27static bool scan_template_chars(TSLexer *lexer) {
 28    lexer->result_symbol = TEMPLATE_CHARS;
 29    for (bool has_content = false;; has_content = true) {
 30        lexer->mark_end(lexer);
 31        switch (lexer->lookahead) {
 32            case '`':
 33                return has_content;
 34            case '\0':
 35                return false;
 36            case '$':
 37                advance(lexer);
 38                if (lexer->lookahead == '{') {
 39                    return has_content;
 40                }
 41                break;
 42            case '\\':
 43                return has_content;
 44            default:
 45                advance(lexer);
 46        }
 47    }
 48}
 49
 50typedef enum {
 51    REJECT,     // Semicolon is illegal, ie a syntax error occurred
 52    NO_NEWLINE, // Unclear if semicolon will be legal, continue
 53    ACCEPT,     // Semicolon is legal, assuming a comment was encountered
 54} WhitespaceResult;
 55
 56/**
 57 * @param consume If false, only consume enough to check if comment indicates semicolon-legality
 58 */
 59static WhitespaceResult scan_whitespace_and_comments(TSLexer *lexer, bool *scanned_comment, bool consume) {
 60    bool saw_block_newline = false;
 61
 62    for (;;) {
 63        while (iswspace(lexer->lookahead)) {
 64            skip(lexer);
 65        }
 66
 67        if (lexer->lookahead == '/') {
 68            skip(lexer);
 69
 70            if (lexer->lookahead == '/') {
 71                skip(lexer);
 72                while (lexer->lookahead != 0 && lexer->lookahead != '\n' && lexer->lookahead != 0x2028 &&
 73                       lexer->lookahead != 0x2029) {
 74                    skip(lexer);
 75                }
 76                *scanned_comment = true;
 77            } else if (lexer->lookahead == '*') {
 78                skip(lexer);
 79                while (lexer->lookahead != 0) {
 80                    if (lexer->lookahead == '*') {
 81                        skip(lexer);
 82                        if (lexer->lookahead == '/') {
 83                            skip(lexer);
 84                            *scanned_comment = true;
 85
 86                            if (lexer->lookahead != '/' && !consume) {
 87                                return saw_block_newline ? ACCEPT : NO_NEWLINE;
 88                            }
 89
 90                            break;
 91                        }
 92                    } else if (lexer->lookahead == '\n' || lexer->lookahead == 0x2028 || lexer->lookahead == 0x2029) {
 93                        saw_block_newline = true;
 94                        skip(lexer);
 95                    } else {
 96                        skip(lexer);
 97                    }
 98                }
 99            } else {
100                return REJECT;
101            }
102        } else {
103            return ACCEPT;
104        }
105    }
106}
107
108static bool scan_automatic_semicolon(TSLexer *lexer, bool comment_condition, bool *scanned_comment) {
109    lexer->result_symbol = AUTOMATIC_SEMICOLON;
110    lexer->mark_end(lexer);
111
112    for (;;) {
113        if (lexer->lookahead == 0) {
114            return true;
115        }
116
117        if (lexer->lookahead == '/') {
118            WhitespaceResult result = scan_whitespace_and_comments(lexer, scanned_comment, false);
119            if (result == REJECT) {
120                return false;
121            }
122
123            if (result == ACCEPT && comment_condition && lexer->lookahead != ',' && lexer->lookahead != '=') {
124                return true;
125            }
126        }
127
128        if (lexer->lookahead == '}') {
129            return true;
130        }
131
132        if (lexer->is_at_included_range_start(lexer)) {
133            return true;
134        }
135
136        if (lexer->lookahead == '\n' || lexer->lookahead == 0x2028 || lexer->lookahead == 0x2029) {
137            break;
138        }
139
140        if (!iswspace(lexer->lookahead)) {
141            return false;
142        }
143
144        skip(lexer);
145    }
146
147    skip(lexer);
148
149    if (scan_whitespace_and_comments(lexer, scanned_comment, true) == REJECT) {
150        return false;
151    }
152
153    switch (lexer->lookahead) {
154        case ',':
155        case ':':
156        case ';':
157        case '*':
158        case '%':
159        case '>':
160        case '<':
161        case '=':
162        case '[':
163        case '(':
164        case '?':
165        case '^':
166        case '|':
167        case '&':
168        case '/':
169            return false;
170
171        // Insert a semicolon before decimals literals but not otherwise.
172        case '.':
173            skip(lexer);
174            return iswdigit(lexer->lookahead);
175
176        // Insert a semicolon before `--` and `++`, but not before binary `+` or `-`.
177        case '+':
178            skip(lexer);
179            return lexer->lookahead == '+';
180        case '-':
181            skip(lexer);
182            return lexer->lookahead == '-';
183
184        // Don't insert a semicolon before `!=`, but do insert one before a unary `!`.
185        case '!':
186            skip(lexer);
187            return lexer->lookahead != '=';
188
189        // Don't insert a semicolon before `in` or `instanceof`, but do insert one
190        // before an identifier.
191        case 'i':
192            skip(lexer);
193
194            if (lexer->lookahead != 'n') {
195                return true;
196            }
197            skip(lexer);
198
199            if (!iswalpha(lexer->lookahead)) {
200                return false;
201            }
202
203            for (unsigned i = 0; i < 8; i++) {
204                if (lexer->lookahead != "stanceof"[i]) {
205                    return true;
206                }
207                skip(lexer);
208            }
209
210            if (!iswalpha(lexer->lookahead)) {
211                return false;
212            }
213            break;
214
215        default:
216            break;
217    }
218
219    return true;
220}
221
222static bool scan_ternary_qmark(TSLexer *lexer) {
223    for (;;) {
224        if (!iswspace(lexer->lookahead)) {
225            break;
226        }
227        skip(lexer);
228    }
229
230    if (lexer->lookahead == '?') {
231        advance(lexer);
232
233        if (lexer->lookahead == '?') {
234            return false;
235        }
236
237        lexer->mark_end(lexer);
238        lexer->result_symbol = TERNARY_QMARK;
239
240        if (lexer->lookahead == '.') {
241            advance(lexer);
242            if (iswdigit(lexer->lookahead)) {
243                return true;
244            }
245            return false;
246        }
247        return true;
248    }
249    return false;
250}
251
252static bool scan_html_comment(TSLexer *lexer) {
253    while (iswspace(lexer->lookahead) || lexer->lookahead == 0x2028 || lexer->lookahead == 0x2029) {
254        skip(lexer);
255    }
256
257    const char *comment_start = "<!--";
258    const char *comment_end = "-->";
259
260    if (lexer->lookahead == '<') {
261        for (unsigned i = 0; i < 4; i++) {
262            if (lexer->lookahead != comment_start[i]) {
263                return false;
264            }
265            advance(lexer);
266        }
267    } else if (lexer->lookahead == '-') {
268        for (unsigned i = 0; i < 3; i++) {
269            if (lexer->lookahead != comment_end[i]) {
270                return false;
271            }
272            advance(lexer);
273        }
274    } else {
275        return false;
276    }
277
278    while (lexer->lookahead != 0 && lexer->lookahead != '\n' && lexer->lookahead != 0x2028 &&
279           lexer->lookahead != 0x2029) {
280        advance(lexer);
281    }
282
283    lexer->result_symbol = HTML_COMMENT;
284    lexer->mark_end(lexer);
285
286    return true;
287}
288
289bool tree_sitter_javascript_external_scanner_scan(void *payload, TSLexer *lexer, const bool *valid_symbols) {
290    if (valid_symbols[TEMPLATE_CHARS]) {
291        if (valid_symbols[AUTOMATIC_SEMICOLON]) {
292            return false;
293        }
294        return scan_template_chars(lexer);
295    }
296
297    if (valid_symbols[AUTOMATIC_SEMICOLON]) {
298        bool scanned_comment = false;
299        bool ret = scan_automatic_semicolon(lexer, !valid_symbols[LOGICAL_OR], &scanned_comment);
300        if (!ret && !scanned_comment && valid_symbols[TERNARY_QMARK] && lexer->lookahead == '?') {
301            return scan_ternary_qmark(lexer);
302        }
303        return ret;
304    }
305
306    if (valid_symbols[TERNARY_QMARK]) {
307        return scan_ternary_qmark(lexer);
308    }
309
310    if (valid_symbols[HTML_COMMENT] && !valid_symbols[LOGICAL_OR] && !valid_symbols[ESCAPE_SEQUENCE] &&
311        !valid_symbols[REGEX_PATTERN]) {
312        return scan_html_comment(lexer);
313    }
314
315    return false;
316}