1#include "tag.h"
  2#include "parser.h"
  3
  4#include <wctype.h>
  5
  6enum TokenType {
  7    START_TAG_NAME,
  8    SCRIPT_START_TAG_NAME,
  9    STYLE_START_TAG_NAME,
 10    END_TAG_NAME,
 11    ERRONEOUS_END_TAG_NAME,
 12    SELF_CLOSING_TAG_DELIMITER,
 13    IMPLICIT_END_TAG,
 14    RAW_TEXT,
 15    COMMENT,
 16};
 17
// Scanner state: the stack of tags that are currently open. Entries are
// pushed by scan_start_tag_name and removed through pop_tag; the whole stack
// is written out and restored by serialize/deserialize.
typedef struct {
    Array(Tag) tags;
} Scanner;
 21
// Larger of `a` and `b`. Each argument may be evaluated twice, so do not pass
// expressions with side effects. Not referenced in the visible part of this file.
#define MAX(a, b) ((a) > (b) ? (a) : (b))
 23
 24static inline void advance(TSLexer *lexer) { lexer->advance(lexer, false); }
 25
 26static inline void skip(TSLexer *lexer) { lexer->advance(lexer, true); }
 27
 28static unsigned serialize(Scanner *scanner, char *buffer) {
 29    uint16_t tag_count = scanner->tags.size > UINT16_MAX ? UINT16_MAX : scanner->tags.size;
 30    uint16_t serialized_tag_count = 0;
 31
 32    unsigned size = sizeof(tag_count);
 33    memcpy(&buffer[size], &tag_count, sizeof(tag_count));
 34    size += sizeof(tag_count);
 35
 36    for (; serialized_tag_count < tag_count; serialized_tag_count++) {
 37        Tag tag = scanner->tags.contents[serialized_tag_count];
 38        if (tag.type == CUSTOM) {
 39            unsigned name_length = tag.custom_tag_name.size;
 40            if (name_length > UINT8_MAX) {
 41                name_length = UINT8_MAX;
 42            }
 43            if (size + 2 + name_length >= TREE_SITTER_SERIALIZATION_BUFFER_SIZE) {
 44                break;
 45            }
 46            buffer[size++] = (char)tag.type;
 47            buffer[size++] = (char)name_length;
 48            strncpy(&buffer[size], tag.custom_tag_name.contents, name_length);
 49            size += name_length;
 50        } else {
 51            if (size + 1 >= TREE_SITTER_SERIALIZATION_BUFFER_SIZE) {
 52                break;
 53            }
 54            buffer[size++] = (char)tag.type;
 55        }
 56    }
 57
 58    memcpy(&buffer[0], &serialized_tag_count, sizeof(serialized_tag_count));
 59    return size;
 60}
 61
 62static void deserialize(Scanner *scanner, const char *buffer, unsigned length) {
 63    for (unsigned i = 0; i < scanner->tags.size; i++) {
 64        tag_free(&scanner->tags.contents[i]);
 65    }
 66    array_clear(&scanner->tags);
 67
 68    if (length > 0) {
 69        unsigned size = 0;
 70        uint16_t tag_count = 0;
 71        uint16_t serialized_tag_count = 0;
 72
 73        memcpy(&serialized_tag_count, &buffer[size], sizeof(serialized_tag_count));
 74        size += sizeof(serialized_tag_count);
 75
 76        memcpy(&tag_count, &buffer[size], sizeof(tag_count));
 77        size += sizeof(tag_count);
 78
 79        array_reserve(&scanner->tags, tag_count);
 80        if (tag_count > 0) {
 81            unsigned iter = 0;
 82            for (iter = 0; iter < serialized_tag_count; iter++) {
 83                Tag tag = tag_new();
 84                tag.type = (TagType)buffer[size++];
 85                if (tag.type == CUSTOM) {
 86                    uint16_t name_length = (uint8_t)buffer[size++];
 87                    array_reserve(&tag.custom_tag_name, name_length);
 88                    tag.custom_tag_name.size = name_length;
 89                    memcpy(tag.custom_tag_name.contents, &buffer[size], name_length);
 90                    size += name_length;
 91                }
 92                array_push(&scanner->tags, tag);
 93            }
 94            // add zero tags if we didn't read enough, this is because the
 95            // buffer had no more room but we held more tags.
 96            for (; iter < tag_count; iter++) {
 97                array_push(&scanner->tags, tag_new());
 98            }
 99        }
100    }
101}
102
103static String scan_tag_name(TSLexer *lexer) {
104    String tag_name = array_new();
105    while (iswalnum(lexer->lookahead) || lexer->lookahead == '-' || lexer->lookahead == ':') {
106        array_push(&tag_name, towupper(lexer->lookahead));
107        advance(lexer);
108    }
109    return tag_name;
110}
111
112static bool scan_comment(TSLexer *lexer) {
113    if (lexer->lookahead != '-') {
114        return false;
115    }
116    advance(lexer);
117    if (lexer->lookahead != '-') {
118        return false;
119    }
120    advance(lexer);
121
122    unsigned dashes = 0;
123    while (lexer->lookahead) {
124        switch (lexer->lookahead) {
125            case '-':
126                ++dashes;
127                break;
128            case '>':
129                if (dashes >= 2) {
130                    lexer->result_symbol = COMMENT;
131                    advance(lexer);
132                    lexer->mark_end(lexer);
133                    return true;
134                }
135            default:
136                dashes = 0;
137        }
138        advance(lexer);
139    }
140    return false;
141}
142
143static bool scan_raw_text(Scanner *scanner, TSLexer *lexer) {
144    if (scanner->tags.size == 0) {
145        return false;
146    }
147
148    lexer->mark_end(lexer);
149
150    const char *end_delimiter = array_back(&scanner->tags)->type == SCRIPT ? "</SCRIPT" : "</STYLE";
151
152    unsigned delimiter_index = 0;
153    while (lexer->lookahead) {
154        if (towupper(lexer->lookahead) == end_delimiter[delimiter_index]) {
155            delimiter_index++;
156            if (delimiter_index == strlen(end_delimiter)) {
157                break;
158            }
159            advance(lexer);
160        } else {
161            delimiter_index = 0;
162            advance(lexer);
163            lexer->mark_end(lexer);
164        }
165    }
166
167    lexer->result_symbol = RAW_TEXT;
168    return true;
169}
170
171static void pop_tag(Scanner *scanner) {
172    Tag popped_tag = array_pop(&scanner->tags);
173    tag_free(&popped_tag);
174}
175
176static bool scan_implicit_end_tag(Scanner *scanner, TSLexer *lexer) {
177    Tag *parent = scanner->tags.size == 0 ? NULL : array_back(&scanner->tags);
178
179    bool is_closing_tag = false;
180    if (lexer->lookahead == '/') {
181        is_closing_tag = true;
182        advance(lexer);
183    } else {
184        if (parent && tag_is_void(parent)) {
185            pop_tag(scanner);
186            lexer->result_symbol = IMPLICIT_END_TAG;
187            return true;
188        }
189    }
190
191    String tag_name = scan_tag_name(lexer);
192    if (tag_name.size == 0 && !lexer->eof(lexer)) {
193        array_delete(&tag_name);
194        return false;
195    }
196
197    Tag next_tag = tag_for_name(tag_name);
198
199    if (is_closing_tag) {
200        // The tag correctly closes the topmost element on the stack
201        if (scanner->tags.size > 0 && tag_eq(array_back(&scanner->tags), &next_tag)) {
202            tag_free(&next_tag);
203            return false;
204        }
205
206        // Otherwise, dig deeper and queue implicit end tags (to be nice in
207        // the case of malformed HTML)
208        for (unsigned i = scanner->tags.size; i > 0; i--) {
209            if (scanner->tags.contents[i - 1].type == next_tag.type) {
210                pop_tag(scanner);
211                lexer->result_symbol = IMPLICIT_END_TAG;
212                tag_free(&next_tag);
213                return true;
214            }
215        }
216    } else if (
217        parent &&
218        (
219            !tag_can_contain(parent, &next_tag) ||
220            ((parent->type == HTML || parent->type == HEAD || parent->type == BODY) && lexer->eof(lexer))
221        )
222    ) {
223        pop_tag(scanner);
224        lexer->result_symbol = IMPLICIT_END_TAG;
225        tag_free(&next_tag);
226        return true;
227    }
228
229    tag_free(&next_tag);
230    return false;
231}
232
233static bool scan_start_tag_name(Scanner *scanner, TSLexer *lexer) {
234    String tag_name = scan_tag_name(lexer);
235    if (tag_name.size == 0) {
236        array_delete(&tag_name);
237        return false;
238    }
239
240    Tag tag = tag_for_name(tag_name);
241    array_push(&scanner->tags, tag);
242    switch (tag.type) {
243        case SCRIPT:
244            lexer->result_symbol = SCRIPT_START_TAG_NAME;
245            break;
246        case STYLE:
247            lexer->result_symbol = STYLE_START_TAG_NAME;
248            break;
249        default:
250            lexer->result_symbol = START_TAG_NAME;
251            break;
252    }
253    return true;
254}
255
256static bool scan_end_tag_name(Scanner *scanner, TSLexer *lexer) {
257    String tag_name = scan_tag_name(lexer);
258
259    if (tag_name.size == 0) {
260        array_delete(&tag_name);
261        return false;
262    }
263
264    Tag tag = tag_for_name(tag_name);
265    if (scanner->tags.size > 0 && tag_eq(array_back(&scanner->tags), &tag)) {
266        pop_tag(scanner);
267        lexer->result_symbol = END_TAG_NAME;
268    } else {
269        lexer->result_symbol = ERRONEOUS_END_TAG_NAME;
270    }
271
272    tag_free(&tag);
273    return true;
274}
275
276static bool scan_self_closing_tag_delimiter(Scanner *scanner, TSLexer *lexer) {
277    advance(lexer);
278    if (lexer->lookahead == '>') {
279        advance(lexer);
280        if (scanner->tags.size > 0) {
281            pop_tag(scanner);
282            lexer->result_symbol = SELF_CLOSING_TAG_DELIMITER;
283        }
284        return true;
285    }
286    return false;
287}
288
289static bool scan(Scanner *scanner, TSLexer *lexer, const bool *valid_symbols) {
290    if (valid_symbols[RAW_TEXT] && !valid_symbols[START_TAG_NAME] && !valid_symbols[END_TAG_NAME]) {
291        return scan_raw_text(scanner, lexer);
292    }
293
294    while (iswspace(lexer->lookahead)) {
295        skip(lexer);
296    }
297
298    switch (lexer->lookahead) {
299        case '<':
300            lexer->mark_end(lexer);
301            advance(lexer);
302
303            if (lexer->lookahead == '!') {
304                advance(lexer);
305                return scan_comment(lexer);
306            }
307
308            if (valid_symbols[IMPLICIT_END_TAG]) {
309                return scan_implicit_end_tag(scanner, lexer);
310            }
311            break;
312
313        case '\0':
314            if (valid_symbols[IMPLICIT_END_TAG]) {
315                return scan_implicit_end_tag(scanner, lexer);
316            }
317            break;
318
319        case '/':
320            if (valid_symbols[SELF_CLOSING_TAG_DELIMITER]) {
321                return scan_self_closing_tag_delimiter(scanner, lexer);
322            }
323            break;
324
325        default:
326            if ((valid_symbols[START_TAG_NAME] || valid_symbols[END_TAG_NAME]) && !valid_symbols[RAW_TEXT]) {
327                return valid_symbols[START_TAG_NAME] ? scan_start_tag_name(scanner, lexer)
328                                                     : scan_end_tag_name(scanner, lexer);
329            }
330    }
331
332    return false;
333}
334
335void *tree_sitter_html_external_scanner_create() {
336    Scanner *scanner = (Scanner *)ts_calloc(1, sizeof(Scanner));
337    return scanner;
338}
339
340bool tree_sitter_html_external_scanner_scan(void *payload, TSLexer *lexer, const bool *valid_symbols) {
341    Scanner *scanner = (Scanner *)payload;
342    return scan(scanner, lexer, valid_symbols);
343}
344
345unsigned tree_sitter_html_external_scanner_serialize(void *payload, char *buffer) {
346    Scanner *scanner = (Scanner *)payload;
347    return serialize(scanner, buffer);
348}
349
350void tree_sitter_html_external_scanner_deserialize(void *payload, const char *buffer, unsigned length) {
351    Scanner *scanner = (Scanner *)payload;
352    deserialize(scanner, buffer, length);
353}
354
355void tree_sitter_html_external_scanner_destroy(void *payload) {
356    Scanner *scanner = (Scanner *)payload;
357    for (unsigned i = 0; i < scanner->tags.size; i++) {
358        tag_free(&scanner->tags.contents[i]);
359    }
360    array_delete(&scanner->tags);
361    ts_free(scanner);
362}