aboutsummaryrefslogtreecommitdiff
path: root/vendor/github.com/mitjafelicijan/go-tree-sitter/html/scanner.c
diff options
context:
space:
mode:
author Mitja Felicijan <mitja.felicijan@gmail.com> 2026-01-21 20:22:09 +0100
committer Mitja Felicijan <mitja.felicijan@gmail.com> 2026-01-21 20:22:09 +0100
commit 5a8dbc6347b3541e84fe669b22c17ad3b715e258 (patch)
tree b148c450939688caaaeb4adac6f2faa1eaffe649 /vendor/github.com/mitjafelicijan/go-tree-sitter/html/scanner.c
download qwe-editor-5a8dbc6347b3541e84fe669b22c17ad3b715e258.tar.gz
Engage!
Diffstat (limited to 'vendor/github.com/mitjafelicijan/go-tree-sitter/html/scanner.c')
-rw-r--r-- vendor/github.com/mitjafelicijan/go-tree-sitter/html/scanner.c 362
1 files changed, 362 insertions, 0 deletions
diff --git a/vendor/github.com/mitjafelicijan/go-tree-sitter/html/scanner.c b/vendor/github.com/mitjafelicijan/go-tree-sitter/html/scanner.c
new file mode 100644
index 0000000..d14ba9b
--- /dev/null
+++ b/vendor/github.com/mitjafelicijan/go-tree-sitter/html/scanner.c
@@ -0,0 +1,362 @@
1#include "tag.h"
2#include "parser.h"
3
4#include <wctype.h>
5
// Symbols this scanner can report through lexer->result_symbol. The same
// values index the valid_symbols array passed to scan().
// NOTE(review): presumably the order must match the externals list of the
// grammar that generated the parser — confirm before reordering.
enum TokenType {
    START_TAG_NAME = 0,         // name of an ordinary opening tag
    SCRIPT_START_TAG_NAME,      // name of an opening SCRIPT tag
    STYLE_START_TAG_NAME,       // name of an opening STYLE tag
    END_TAG_NAME,               // closing-tag name matching the top of the stack
    ERRONEOUS_END_TAG_NAME,     // closing-tag name that does not match the top
    SELF_CLOSING_TAG_DELIMITER, // the "/>" sequence
    IMPLICIT_END_TAG,           // zero-width token closing the top-of-stack tag
    RAW_TEXT,                   // contents of a SCRIPT/STYLE element
    COMMENT,                    // "<!-- ... -->"
};
17
// Scanner state: the stack of tags that have been opened and not yet closed.
// New tags are pushed by scan_start_tag_name() and removed by pop_tag(); the
// whole stack is what serialize()/deserialize() save and restore.
18typedef struct {
19 Array(Tag) tags;
20} Scanner;
21
// Larger of two values. Function-like macro: each argument is evaluated
// twice, so do not pass expressions with side effects.
// NOTE(review): not referenced anywhere in the visible part of this file.
22#define MAX(a, b) ((a) > (b) ? (a) : (b))
23
24static inline void advance(TSLexer *lexer) { lexer->advance(lexer, false); }
25
26static inline void skip(TSLexer *lexer) { lexer->advance(lexer, true); }
27
28static unsigned serialize(Scanner *scanner, char *buffer) {
29 uint16_t tag_count = scanner->tags.size > UINT16_MAX ? UINT16_MAX : scanner->tags.size;
30 uint16_t serialized_tag_count = 0;
31
32 unsigned size = sizeof(tag_count);
33 memcpy(&buffer[size], &tag_count, sizeof(tag_count));
34 size += sizeof(tag_count);
35
36 for (; serialized_tag_count < tag_count; serialized_tag_count++) {
37 Tag tag = scanner->tags.contents[serialized_tag_count];
38 if (tag.type == CUSTOM) {
39 unsigned name_length = tag.custom_tag_name.size;
40 if (name_length > UINT8_MAX) {
41 name_length = UINT8_MAX;
42 }
43 if (size + 2 + name_length >= TREE_SITTER_SERIALIZATION_BUFFER_SIZE) {
44 break;
45 }
46 buffer[size++] = (char)tag.type;
47 buffer[size++] = (char)name_length;
48 strncpy(&buffer[size], tag.custom_tag_name.contents, name_length);
49 size += name_length;
50 } else {
51 if (size + 1 >= TREE_SITTER_SERIALIZATION_BUFFER_SIZE) {
52 break;
53 }
54 buffer[size++] = (char)tag.type;
55 }
56 }
57
58 memcpy(&buffer[0], &serialized_tag_count, sizeof(serialized_tag_count));
59 return size;
60}
61
62static void deserialize(Scanner *scanner, const char *buffer, unsigned length) {
63 for (unsigned i = 0; i < scanner->tags.size; i++) {
64 tag_free(&scanner->tags.contents[i]);
65 }
66 array_clear(&scanner->tags);
67
68 if (length > 0) {
69 unsigned size = 0;
70 uint16_t tag_count = 0;
71 uint16_t serialized_tag_count = 0;
72
73 memcpy(&serialized_tag_count, &buffer[size], sizeof(serialized_tag_count));
74 size += sizeof(serialized_tag_count);
75
76 memcpy(&tag_count, &buffer[size], sizeof(tag_count));
77 size += sizeof(tag_count);
78
79 array_reserve(&scanner->tags, tag_count);
80 if (tag_count > 0) {
81 unsigned iter = 0;
82 for (iter = 0; iter < serialized_tag_count; iter++) {
83 Tag tag = tag_new();
84 tag.type = (TagType)buffer[size++];
85 if (tag.type == CUSTOM) {
86 uint16_t name_length = (uint8_t)buffer[size++];
87 array_reserve(&tag.custom_tag_name, name_length);
88 tag.custom_tag_name.size = name_length;
89 memcpy(tag.custom_tag_name.contents, &buffer[size], name_length);
90 size += name_length;
91 }
92 array_push(&scanner->tags, tag);
93 }
94 // add zero tags if we didn't read enough, this is because the
95 // buffer had no more room but we held more tags.
96 for (; iter < tag_count; iter++) {
97 array_push(&scanner->tags, tag_new());
98 }
99 }
100 }
101}
102
103static String scan_tag_name(TSLexer *lexer) {
104 String tag_name = array_new();
105 while (iswalnum(lexer->lookahead) || lexer->lookahead == '-' || lexer->lookahead == ':') {
106 array_push(&tag_name, towupper(lexer->lookahead));
107 advance(lexer);
108 }
109 return tag_name;
110}
111
112static bool scan_comment(TSLexer *lexer) {
113 if (lexer->lookahead != '-') {
114 return false;
115 }
116 advance(lexer);
117 if (lexer->lookahead != '-') {
118 return false;
119 }
120 advance(lexer);
121
122 unsigned dashes = 0;
123 while (lexer->lookahead) {
124 switch (lexer->lookahead) {
125 case '-':
126 ++dashes;
127 break;
128 case '>':
129 if (dashes >= 2) {
130 lexer->result_symbol = COMMENT;
131 advance(lexer);
132 lexer->mark_end(lexer);
133 return true;
134 }
135 default:
136 dashes = 0;
137 }
138 advance(lexer);
139 }
140 return false;
141}
142
143static bool scan_raw_text(Scanner *scanner, TSLexer *lexer) {
144 if (scanner->tags.size == 0) {
145 return false;
146 }
147
148 lexer->mark_end(lexer);
149
150 const char *end_delimiter = array_back(&scanner->tags)->type == SCRIPT ? "</SCRIPT" : "</STYLE";
151
152 unsigned delimiter_index = 0;
153 while (lexer->lookahead) {
154 if (towupper(lexer->lookahead) == end_delimiter[delimiter_index]) {
155 delimiter_index++;
156 if (delimiter_index == strlen(end_delimiter)) {
157 break;
158 }
159 advance(lexer);
160 } else {
161 delimiter_index = 0;
162 advance(lexer);
163 lexer->mark_end(lexer);
164 }
165 }
166
167 lexer->result_symbol = RAW_TEXT;
168 return true;
169}
170
171static void pop_tag(Scanner *scanner) {
172 Tag popped_tag = array_pop(&scanner->tags);
173 tag_free(&popped_tag);
174}
175
176static bool scan_implicit_end_tag(Scanner *scanner, TSLexer *lexer) {
177 Tag *parent = scanner->tags.size == 0 ? NULL : array_back(&scanner->tags);
178
179 bool is_closing_tag = false;
180 if (lexer->lookahead == '/') {
181 is_closing_tag = true;
182 advance(lexer);
183 } else {
184 if (parent && tag_is_void(parent)) {
185 pop_tag(scanner);
186 lexer->result_symbol = IMPLICIT_END_TAG;
187 return true;
188 }
189 }
190
191 String tag_name = scan_tag_name(lexer);
192 if (tag_name.size == 0 && !lexer->eof(lexer)) {
193 array_delete(&tag_name);
194 return false;
195 }
196
197 Tag next_tag = tag_for_name(tag_name);
198
199 if (is_closing_tag) {
200 // The tag correctly closes the topmost element on the stack
201 if (scanner->tags.size > 0 && tag_eq(array_back(&scanner->tags), &next_tag)) {
202 tag_free(&next_tag);
203 return false;
204 }
205
206 // Otherwise, dig deeper and queue implicit end tags (to be nice in
207 // the case of malformed HTML)
208 for (unsigned i = scanner->tags.size; i > 0; i--) {
209 if (scanner->tags.contents[i - 1].type == next_tag.type) {
210 pop_tag(scanner);
211 lexer->result_symbol = IMPLICIT_END_TAG;
212 tag_free(&next_tag);
213 return true;
214 }
215 }
216 } else if (
217 parent &&
218 (
219 !tag_can_contain(parent, &next_tag) ||
220 ((parent->type == HTML || parent->type == HEAD || parent->type == BODY) && lexer->eof(lexer))
221 )
222 ) {
223 pop_tag(scanner);
224 lexer->result_symbol = IMPLICIT_END_TAG;
225 tag_free(&next_tag);
226 return true;
227 }
228
229 tag_free(&next_tag);
230 return false;
231}
232
233static bool scan_start_tag_name(Scanner *scanner, TSLexer *lexer) {
234 String tag_name = scan_tag_name(lexer);
235 if (tag_name.size == 0) {
236 array_delete(&tag_name);
237 return false;
238 }
239
240 Tag tag = tag_for_name(tag_name);
241 array_push(&scanner->tags, tag);
242 switch (tag.type) {
243 case SCRIPT:
244 lexer->result_symbol = SCRIPT_START_TAG_NAME;
245 break;
246 case STYLE:
247 lexer->result_symbol = STYLE_START_TAG_NAME;
248 break;
249 default:
250 lexer->result_symbol = START_TAG_NAME;
251 break;
252 }
253 return true;
254}
255
256static bool scan_end_tag_name(Scanner *scanner, TSLexer *lexer) {
257 String tag_name = scan_tag_name(lexer);
258
259 if (tag_name.size == 0) {
260 array_delete(&tag_name);
261 return false;
262 }
263
264 Tag tag = tag_for_name(tag_name);
265 if (scanner->tags.size > 0 && tag_eq(array_back(&scanner->tags), &tag)) {
266 pop_tag(scanner);
267 lexer->result_symbol = END_TAG_NAME;
268 } else {
269 lexer->result_symbol = ERRONEOUS_END_TAG_NAME;
270 }
271
272 tag_free(&tag);
273 return true;
274}
275
276static bool scan_self_closing_tag_delimiter(Scanner *scanner, TSLexer *lexer) {
277 advance(lexer);
278 if (lexer->lookahead == '>') {
279 advance(lexer);
280 if (scanner->tags.size > 0) {
281 pop_tag(scanner);
282 lexer->result_symbol = SELF_CLOSING_TAG_DELIMITER;
283 }
284 return true;
285 }
286 return false;
287}
288
289static bool scan(Scanner *scanner, TSLexer *lexer, const bool *valid_symbols) {
290 if (valid_symbols[RAW_TEXT] && !valid_symbols[START_TAG_NAME] && !valid_symbols[END_TAG_NAME]) {
291 return scan_raw_text(scanner, lexer);
292 }
293
294 while (iswspace(lexer->lookahead)) {
295 skip(lexer);
296 }
297
298 switch (lexer->lookahead) {
299 case '<':
300 lexer->mark_end(lexer);
301 advance(lexer);
302
303 if (lexer->lookahead == '!') {
304 advance(lexer);
305 return scan_comment(lexer);
306 }
307
308 if (valid_symbols[IMPLICIT_END_TAG]) {
309 return scan_implicit_end_tag(scanner, lexer);
310 }
311 break;
312
313 case '\0':
314 if (valid_symbols[IMPLICIT_END_TAG]) {
315 return scan_implicit_end_tag(scanner, lexer);
316 }
317 break;
318
319 case '/':
320 if (valid_symbols[SELF_CLOSING_TAG_DELIMITER]) {
321 return scan_self_closing_tag_delimiter(scanner, lexer);
322 }
323 break;
324
325 default:
326 if ((valid_symbols[START_TAG_NAME] || valid_symbols[END_TAG_NAME]) && !valid_symbols[RAW_TEXT]) {
327 return valid_symbols[START_TAG_NAME] ? scan_start_tag_name(scanner, lexer)
328 : scan_end_tag_name(scanner, lexer);
329 }
330 }
331
332 return false;
333}
334
335void *tree_sitter_html_external_scanner_create() {
336 Scanner *scanner = (Scanner *)ts_calloc(1, sizeof(Scanner));
337 return scanner;
338}
339
340bool tree_sitter_html_external_scanner_scan(void *payload, TSLexer *lexer, const bool *valid_symbols) {
341 Scanner *scanner = (Scanner *)payload;
342 return scan(scanner, lexer, valid_symbols);
343}
344
345unsigned tree_sitter_html_external_scanner_serialize(void *payload, char *buffer) {
346 Scanner *scanner = (Scanner *)payload;
347 return serialize(scanner, buffer);
348}
349
350void tree_sitter_html_external_scanner_deserialize(void *payload, const char *buffer, unsigned length) {
351 Scanner *scanner = (Scanner *)payload;
352 deserialize(scanner, buffer, length);
353}
354
355void tree_sitter_html_external_scanner_destroy(void *payload) {
356 Scanner *scanner = (Scanner *)payload;
357 for (unsigned i = 0; i < scanner->tags.size; i++) {
358 tag_free(&scanner->tags.contents[i]);
359 }
360 array_delete(&scanner->tags);
361 ts_free(scanner);
362}