aboutsummaryrefslogtreecommitdiff
path: root/vendor/github.com/mitjafelicijan/go-tree-sitter/lexer.c
diff options
context:
space:
mode:
Diffstat (limited to 'vendor/github.com/mitjafelicijan/go-tree-sitter/lexer.c')
-rw-r--r--vendor/github.com/mitjafelicijan/go-tree-sitter/lexer.c419
1 files changed, 419 insertions, 0 deletions
diff --git a/vendor/github.com/mitjafelicijan/go-tree-sitter/lexer.c b/vendor/github.com/mitjafelicijan/go-tree-sitter/lexer.c
new file mode 100644
index 0000000..d108c04
--- /dev/null
+++ b/vendor/github.com/mitjafelicijan/go-tree-sitter/lexer.c
@@ -0,0 +1,419 @@
1#include <stdio.h>
2#include "./lexer.h"
3#include "./subtree.h"
4#include "./length.h"
5#include "./unicode.h"
6
7#define LOG(message, character) \
8 if (self->logger.log) { \
9 snprintf( \
10 self->debug_buffer, \
11 TREE_SITTER_SERIALIZATION_BUFFER_SIZE, \
12 32 <= character && character < 127 ? \
13 message " character:'%c'" : \
14 message " character:%d", \
15 character \
16 ); \
17 self->logger.log( \
18 self->logger.payload, \
19 TSLogTypeLex, \
20 self->debug_buffer \
21 ); \
22 }
23
24static const int32_t BYTE_ORDER_MARK = 0xFEFF;
25
26static const TSRange DEFAULT_RANGE = {
27 .start_point = {
28 .row = 0,
29 .column = 0,
30 },
31 .end_point = {
32 .row = UINT32_MAX,
33 .column = UINT32_MAX,
34 },
35 .start_byte = 0,
36 .end_byte = UINT32_MAX
37};
38
39// Check if the lexer has reached EOF. This state is stored
40// by setting the lexer's `current_included_range_index` such that
41// it has consumed all of its available ranges.
42static bool ts_lexer__eof(const TSLexer *_self) {
43 Lexer *self = (Lexer *)_self;
44 return self->current_included_range_index == self->included_range_count;
45}
46
47// Clear the currently stored chunk of source code, because the lexer's
48// position has changed.
49static void ts_lexer__clear_chunk(Lexer *self) {
50 self->chunk = NULL;
51 self->chunk_size = 0;
52 self->chunk_start = 0;
53}
54
55// Call the lexer's input callback to obtain a new chunk of source code
56// for the current position.
57static void ts_lexer__get_chunk(Lexer *self) {
58 self->chunk_start = self->current_position.bytes;
59 self->chunk = self->input.read(
60 self->input.payload,
61 self->current_position.bytes,
62 self->current_position.extent,
63 &self->chunk_size
64 );
65 if (!self->chunk_size) {
66 self->current_included_range_index = self->included_range_count;
67 self->chunk = NULL;
68 }
69}
70
71// Decode the next unicode character in the current chunk of source code.
72// This assumes that the lexer has already retrieved a chunk of source
73// code that spans the current position.
74static void ts_lexer__get_lookahead(Lexer *self) {
75 uint32_t position_in_chunk = self->current_position.bytes - self->chunk_start;
76 uint32_t size = self->chunk_size - position_in_chunk;
77
78 if (size == 0) {
79 self->lookahead_size = 1;
80 self->data.lookahead = '\0';
81 return;
82 }
83
84 const uint8_t *chunk = (const uint8_t *)self->chunk + position_in_chunk;
85 UnicodeDecodeFunction decode = self->input.encoding == TSInputEncodingUTF8
86 ? ts_decode_utf8
87 : ts_decode_utf16;
88
89 self->lookahead_size = decode(chunk, size, &self->data.lookahead);
90
91 // If this chunk ended in the middle of a multi-byte character,
92 // try again with a fresh chunk.
93 if (self->data.lookahead == TS_DECODE_ERROR && size < 4) {
94 ts_lexer__get_chunk(self);
95 chunk = (const uint8_t *)self->chunk;
96 size = self->chunk_size;
97 self->lookahead_size = decode(chunk, size, &self->data.lookahead);
98 }
99
100 if (self->data.lookahead == TS_DECODE_ERROR) {
101 self->lookahead_size = 1;
102 }
103}
104
105static void ts_lexer_goto(Lexer *self, Length position) {
106 self->current_position = position;
107
108 // Move to the first valid position at or after the given position.
109 bool found_included_range = false;
110 for (unsigned i = 0; i < self->included_range_count; i++) {
111 TSRange *included_range = &self->included_ranges[i];
112 if (
113 included_range->end_byte > self->current_position.bytes &&
114 included_range->end_byte > included_range->start_byte
115 ) {
116 if (included_range->start_byte >= self->current_position.bytes) {
117 self->current_position = (Length) {
118 .bytes = included_range->start_byte,
119 .extent = included_range->start_point,
120 };
121 }
122
123 self->current_included_range_index = i;
124 found_included_range = true;
125 break;
126 }
127 }
128
129 if (found_included_range) {
130 // If the current position is outside of the current chunk of text,
131 // then clear out the current chunk of text.
132 if (self->chunk && (
133 self->current_position.bytes < self->chunk_start ||
134 self->current_position.bytes >= self->chunk_start + self->chunk_size
135 )) {
136 ts_lexer__clear_chunk(self);
137 }
138
139 self->lookahead_size = 0;
140 self->data.lookahead = '\0';
141 }
142
143 // If the given position is beyond any of included ranges, move to the EOF
144 // state - past the end of the included ranges.
145 else {
146 self->current_included_range_index = self->included_range_count;
147 TSRange *last_included_range = &self->included_ranges[self->included_range_count - 1];
148 self->current_position = (Length) {
149 .bytes = last_included_range->end_byte,
150 .extent = last_included_range->end_point,
151 };
152 ts_lexer__clear_chunk(self);
153 self->lookahead_size = 1;
154 self->data.lookahead = '\0';
155 }
156}
157
158// Intended to be called only from functions that control logging.
159static void ts_lexer__do_advance(Lexer *self, bool skip) {
160 if (self->lookahead_size) {
161 self->current_position.bytes += self->lookahead_size;
162 if (self->data.lookahead == '\n') {
163 self->current_position.extent.row++;
164 self->current_position.extent.column = 0;
165 } else {
166 self->current_position.extent.column += self->lookahead_size;
167 }
168 }
169
170 const TSRange *current_range = &self->included_ranges[self->current_included_range_index];
171 while (
172 self->current_position.bytes >= current_range->end_byte ||
173 current_range->end_byte == current_range->start_byte
174 ) {
175 if (self->current_included_range_index < self->included_range_count) {
176 self->current_included_range_index++;
177 }
178 if (self->current_included_range_index < self->included_range_count) {
179 current_range++;
180 self->current_position = (Length) {
181 current_range->start_byte,
182 current_range->start_point,
183 };
184 } else {
185 current_range = NULL;
186 break;
187 }
188 }
189
190 if (skip) self->token_start_position = self->current_position;
191
192 if (current_range) {
193 if (
194 self->current_position.bytes < self->chunk_start ||
195 self->current_position.bytes >= self->chunk_start + self->chunk_size
196 ) {
197 ts_lexer__get_chunk(self);
198 }
199 ts_lexer__get_lookahead(self);
200 } else {
201 ts_lexer__clear_chunk(self);
202 self->data.lookahead = '\0';
203 self->lookahead_size = 1;
204 }
205}
206
207// Advance to the next character in the source code, retrieving a new
208// chunk of source code if needed.
209static void ts_lexer__advance(TSLexer *_self, bool skip) {
210 Lexer *self = (Lexer *)_self;
211 if (!self->chunk) return;
212
213 if (skip) {
214 LOG("skip", self->data.lookahead)
215 } else {
216 LOG("consume", self->data.lookahead)
217 }
218
219 ts_lexer__do_advance(self, skip);
220}
221
222// Mark that a token match has completed. This can be called multiple
223// times if a longer match is found later.
224static void ts_lexer__mark_end(TSLexer *_self) {
225 Lexer *self = (Lexer *)_self;
226 if (!ts_lexer__eof(&self->data)) {
227 // If the lexer is right at the beginning of included range,
228 // then the token should be considered to end at the *end* of the
229 // previous included range, rather than here.
230 TSRange *current_included_range = &self->included_ranges[
231 self->current_included_range_index
232 ];
233 if (
234 self->current_included_range_index > 0 &&
235 self->current_position.bytes == current_included_range->start_byte
236 ) {
237 TSRange *previous_included_range = current_included_range - 1;
238 self->token_end_position = (Length) {
239 previous_included_range->end_byte,
240 previous_included_range->end_point,
241 };
242 return;
243 }
244 }
245 self->token_end_position = self->current_position;
246}
247
248static uint32_t ts_lexer__get_column(TSLexer *_self) {
249 Lexer *self = (Lexer *)_self;
250
251 uint32_t goal_byte = self->current_position.bytes;
252
253 self->did_get_column = true;
254 self->current_position.bytes -= self->current_position.extent.column;
255 self->current_position.extent.column = 0;
256
257 if (self->current_position.bytes < self->chunk_start) {
258 ts_lexer__get_chunk(self);
259 }
260
261 uint32_t result = 0;
262 if (!ts_lexer__eof(_self)) {
263 ts_lexer__get_lookahead(self);
264 while (self->current_position.bytes < goal_byte && self->chunk) {
265 result++;
266 ts_lexer__do_advance(self, false);
267 if (ts_lexer__eof(_self)) break;
268 }
269 }
270
271 return result;
272}
273
274// Is the lexer at a boundary between two disjoint included ranges of
275// source code? This is exposed as an API because some languages' external
276// scanners need to perform custom actions at these boundaries.
277static bool ts_lexer__is_at_included_range_start(const TSLexer *_self) {
278 const Lexer *self = (const Lexer *)_self;
279 if (self->current_included_range_index < self->included_range_count) {
280 TSRange *current_range = &self->included_ranges[self->current_included_range_index];
281 return self->current_position.bytes == current_range->start_byte;
282 } else {
283 return false;
284 }
285}
286
287void ts_lexer_init(Lexer *self) {
288 *self = (Lexer) {
289 .data = {
290 // The lexer's methods are stored as struct fields so that generated
291 // parsers can call them without needing to be linked against this
292 // library.
293 .advance = ts_lexer__advance,
294 .mark_end = ts_lexer__mark_end,
295 .get_column = ts_lexer__get_column,
296 .is_at_included_range_start = ts_lexer__is_at_included_range_start,
297 .eof = ts_lexer__eof,
298 .lookahead = 0,
299 .result_symbol = 0,
300 },
301 .chunk = NULL,
302 .chunk_size = 0,
303 .chunk_start = 0,
304 .current_position = {0, {0, 0}},
305 .logger = {
306 .payload = NULL,
307 .log = NULL
308 },
309 .included_ranges = NULL,
310 .included_range_count = 0,
311 .current_included_range_index = 0,
312 };
313 ts_lexer_set_included_ranges(self, NULL, 0);
314}
315
316void ts_lexer_delete(Lexer *self) {
317 ts_free(self->included_ranges);
318}
319
320void ts_lexer_set_input(Lexer *self, TSInput input) {
321 self->input = input;
322 ts_lexer__clear_chunk(self);
323 ts_lexer_goto(self, self->current_position);
324}
325
326// Move the lexer to the given position. This doesn't do any work
327// if the parser is already at the given position.
328void ts_lexer_reset(Lexer *self, Length position) {
329 if (position.bytes != self->current_position.bytes) {
330 ts_lexer_goto(self, position);
331 }
332}
333
334void ts_lexer_start(Lexer *self) {
335 self->token_start_position = self->current_position;
336 self->token_end_position = LENGTH_UNDEFINED;
337 self->data.result_symbol = 0;
338 self->did_get_column = false;
339 if (!ts_lexer__eof(&self->data)) {
340 if (!self->chunk_size) ts_lexer__get_chunk(self);
341 if (!self->lookahead_size) ts_lexer__get_lookahead(self);
342 if (
343 self->current_position.bytes == 0 &&
344 self->data.lookahead == BYTE_ORDER_MARK
345 ) ts_lexer__advance(&self->data, true);
346 }
347}
348
349void ts_lexer_finish(Lexer *self, uint32_t *lookahead_end_byte) {
350 if (length_is_undefined(self->token_end_position)) {
351 ts_lexer__mark_end(&self->data);
352 }
353
354 // If the token ended at an included range boundary, then its end position
355 // will have been reset to the end of the preceding range. Reset the start
356 // position to match.
357 if (self->token_end_position.bytes < self->token_start_position.bytes) {
358 self->token_start_position = self->token_end_position;
359 }
360
361 uint32_t current_lookahead_end_byte = self->current_position.bytes + 1;
362
363 // In order to determine that a byte sequence is invalid UTF8 or UTF16,
364 // the character decoding algorithm may have looked at the following byte.
365 // Therefore, the next byte *after* the current (invalid) character
366 // affects the interpretation of the current character.
367 if (self->data.lookahead == TS_DECODE_ERROR) {
368 current_lookahead_end_byte++;
369 }
370
371 if (current_lookahead_end_byte > *lookahead_end_byte) {
372 *lookahead_end_byte = current_lookahead_end_byte;
373 }
374}
375
376void ts_lexer_advance_to_end(Lexer *self) {
377 while (self->chunk) {
378 ts_lexer__advance(&self->data, false);
379 }
380}
381
382void ts_lexer_mark_end(Lexer *self) {
383 ts_lexer__mark_end(&self->data);
384}
385
386bool ts_lexer_set_included_ranges(
387 Lexer *self,
388 const TSRange *ranges,
389 uint32_t count
390) {
391 if (count == 0 || !ranges) {
392 ranges = &DEFAULT_RANGE;
393 count = 1;
394 } else {
395 uint32_t previous_byte = 0;
396 for (unsigned i = 0; i < count; i++) {
397 const TSRange *range = &ranges[i];
398 if (
399 range->start_byte < previous_byte ||
400 range->end_byte < range->start_byte
401 ) return false;
402 previous_byte = range->end_byte;
403 }
404 }
405
406 size_t size = count * sizeof(TSRange);
407 self->included_ranges = ts_realloc(self->included_ranges, size);
408 memcpy(self->included_ranges, ranges, size);
409 self->included_range_count = count;
410 ts_lexer_goto(self, self->current_position);
411 return true;
412}
413
414TSRange *ts_lexer_included_ranges(const Lexer *self, uint32_t *count) {
415 *count = self->included_range_count;
416 return self->included_ranges;
417}
418
419#undef LOG