diff options
Diffstat (limited to 'vendor/github.com/mitjafelicijan/go-tree-sitter/lexer.c')
| -rw-r--r-- | vendor/github.com/mitjafelicijan/go-tree-sitter/lexer.c | 419 |
1 files changed, 419 insertions, 0 deletions
diff --git a/vendor/github.com/mitjafelicijan/go-tree-sitter/lexer.c b/vendor/github.com/mitjafelicijan/go-tree-sitter/lexer.c new file mode 100644 index 0000000..d108c04 --- /dev/null +++ b/vendor/github.com/mitjafelicijan/go-tree-sitter/lexer.c | |||
| @@ -0,0 +1,419 @@ | |||
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include "./lexer.h"
#include "./subtree.h"
#include "./length.h"
#include "./unicode.h"
| 6 | |||
// Log a lexing event through the lexer's optional logger callback.
// Printable ASCII lookahead characters (32..126) are rendered literally;
// any other code point is rendered as a decimal number.
#define LOG(message, character) \
  if (self->logger.log) { \
    snprintf( \
      self->debug_buffer, \
      TREE_SITTER_SERIALIZATION_BUFFER_SIZE, \
      32 <= character && character < 127 ? \
        message " character:'%c'" : \
        message " character:%d", \
      character \
    ); \
    self->logger.log( \
      self->logger.payload, \
      TSLogTypeLex, \
      self->debug_buffer \
    ); \
  }
| 23 | |||
// Unicode byte-order mark; skipped when it appears at byte 0 of the input
// (see ts_lexer_start).
static const int32_t BYTE_ORDER_MARK = 0xFEFF;

// The included range used when the caller sets no explicit ranges:
// the entire document.
static const TSRange DEFAULT_RANGE = {
  .start_point = {
    .row = 0,
    .column = 0,
  },
  .end_point = {
    .row = UINT32_MAX,
    .column = UINT32_MAX,
  },
  .start_byte = 0,
  .end_byte = UINT32_MAX
};
| 38 | |||
| 39 | // Check if the lexer has reached EOF. This state is stored | ||
| 40 | // by setting the lexer's `current_included_range_index` such that | ||
| 41 | // it has consumed all of its available ranges. | ||
| 42 | static bool ts_lexer__eof(const TSLexer *_self) { | ||
| 43 | Lexer *self = (Lexer *)_self; | ||
| 44 | return self->current_included_range_index == self->included_range_count; | ||
| 45 | } | ||
| 46 | |||
| 47 | // Clear the currently stored chunk of source code, because the lexer's | ||
| 48 | // position has changed. | ||
| 49 | static void ts_lexer__clear_chunk(Lexer *self) { | ||
| 50 | self->chunk = NULL; | ||
| 51 | self->chunk_size = 0; | ||
| 52 | self->chunk_start = 0; | ||
| 53 | } | ||
| 54 | |||
| 55 | // Call the lexer's input callback to obtain a new chunk of source code | ||
| 56 | // for the current position. | ||
| 57 | static void ts_lexer__get_chunk(Lexer *self) { | ||
| 58 | self->chunk_start = self->current_position.bytes; | ||
| 59 | self->chunk = self->input.read( | ||
| 60 | self->input.payload, | ||
| 61 | self->current_position.bytes, | ||
| 62 | self->current_position.extent, | ||
| 63 | &self->chunk_size | ||
| 64 | ); | ||
| 65 | if (!self->chunk_size) { | ||
| 66 | self->current_included_range_index = self->included_range_count; | ||
| 67 | self->chunk = NULL; | ||
| 68 | } | ||
| 69 | } | ||
| 70 | |||
// Decode the next unicode character in the current chunk of source code.
// This assumes that the lexer has already retrieved a chunk of source
// code that spans the current position.
static void ts_lexer__get_lookahead(Lexer *self) {
  uint32_t position_in_chunk = self->current_position.bytes - self->chunk_start;
  uint32_t size = self->chunk_size - position_in_chunk;

  // At the very end of the chunk: report a NUL lookahead of width one so
  // that callers still make progress.
  if (size == 0) {
    self->lookahead_size = 1;
    self->data.lookahead = '\0';
    return;
  }

  const uint8_t *chunk = (const uint8_t *)self->chunk + position_in_chunk;
  UnicodeDecodeFunction decode = self->input.encoding == TSInputEncodingUTF8
    ? ts_decode_utf8
    : ts_decode_utf16;

  self->lookahead_size = decode(chunk, size, &self->data.lookahead);

  // If this chunk ended in the middle of a multi-byte character,
  // try again with a fresh chunk. (4 is the longest UTF-8 sequence.)
  // NOTE(review): if the fresh chunk turns out to be empty, `chunk` is NULL
  // and `size` is 0 here before `decode` is called — verify the decode
  // functions tolerate a zero-length input.
  if (self->data.lookahead == TS_DECODE_ERROR && size < 4) {
    ts_lexer__get_chunk(self);
    chunk = (const uint8_t *)self->chunk;
    size = self->chunk_size;
    self->lookahead_size = decode(chunk, size, &self->data.lookahead);
  }

  // Still undecodable: consume a single byte so lexing can continue.
  if (self->data.lookahead == TS_DECODE_ERROR) {
    self->lookahead_size = 1;
  }
}
| 104 | |||
// Move the lexer to `position`, clamping it into the included ranges.
// If some included range ends after `position`, the lexer lands either at
// `position` itself (when it already lies inside that range) or at the
// start of that range. Otherwise the lexer is placed at the end of the
// last included range, which is the EOF state.
static void ts_lexer_goto(Lexer *self, Length position) {
  self->current_position = position;

  // Move to the first valid position at or after the given position.
  bool found_included_range = false;
  for (unsigned i = 0; i < self->included_range_count; i++) {
    TSRange *included_range = &self->included_ranges[i];
    if (
      included_range->end_byte > self->current_position.bytes &&
      included_range->end_byte > included_range->start_byte  // skip empty ranges
    ) {
      // If the range starts after the requested position, snap forward
      // to the range's start; otherwise keep the requested position.
      if (included_range->start_byte >= self->current_position.bytes) {
        self->current_position = (Length) {
          .bytes = included_range->start_byte,
          .extent = included_range->start_point,
        };
      }

      self->current_included_range_index = i;
      found_included_range = true;
      break;
    }
  }

  if (found_included_range) {
    // If the current position is outside of the current chunk of text,
    // then clear out the current chunk of text.
    if (self->chunk && (
      self->current_position.bytes < self->chunk_start ||
      self->current_position.bytes >= self->chunk_start + self->chunk_size
    )) {
      ts_lexer__clear_chunk(self);
    }

    // Force the next read to re-decode the lookahead character.
    self->lookahead_size = 0;
    self->data.lookahead = '\0';
  }

  // If the given position is beyond any of included ranges, move to the EOF
  // state - past the end of the included ranges.
  else {
    self->current_included_range_index = self->included_range_count;
    TSRange *last_included_range = &self->included_ranges[self->included_range_count - 1];
    self->current_position = (Length) {
      .bytes = last_included_range->end_byte,
      .extent = last_included_range->end_point,
    };
    ts_lexer__clear_chunk(self);
    self->lookahead_size = 1;
    self->data.lookahead = '\0';
  }
}
| 157 | |||
// Advance the lexer by one lookahead character, updating row/column
// bookkeeping and hopping across included-range boundaries.
// Intended to be called only from functions that control logging.
static void ts_lexer__do_advance(Lexer *self, bool skip) {
  if (self->lookahead_size) {
    self->current_position.bytes += self->lookahead_size;
    if (self->data.lookahead == '\n') {
      self->current_position.extent.row++;
      self->current_position.extent.column = 0;
    } else {
      // Columns are counted in bytes here, not decoded characters.
      self->current_position.extent.column += self->lookahead_size;
    }
  }

  // If we stepped past the end of the current included range (or the
  // range is empty), walk forward through the ranges until one contains
  // the position, or until the ranges run out (EOF).
  const TSRange *current_range = &self->included_ranges[self->current_included_range_index];
  while (
    self->current_position.bytes >= current_range->end_byte ||
    current_range->end_byte == current_range->start_byte
  ) {
    if (self->current_included_range_index < self->included_range_count) {
      self->current_included_range_index++;
    }
    if (self->current_included_range_index < self->included_range_count) {
      // Jump to the start of the next included range.
      current_range++;
      self->current_position = (Length) {
        current_range->start_byte,
        current_range->start_point,
      };
    } else {
      current_range = NULL;
      break;
    }
  }

  // While skipping (e.g. whitespace), the token hasn't started yet, so the
  // token's start position moves along with the lexer.
  if (skip) self->token_start_position = self->current_position;

  if (current_range) {
    // Refill the chunk if the new position lies outside the buffered text.
    if (
      self->current_position.bytes < self->chunk_start ||
      self->current_position.bytes >= self->chunk_start + self->chunk_size
    ) {
      ts_lexer__get_chunk(self);
    }
    ts_lexer__get_lookahead(self);
  } else {
    // Reached EOF: present a NUL lookahead of width one.
    ts_lexer__clear_chunk(self);
    self->data.lookahead = '\0';
    self->lookahead_size = 1;
  }
}
| 206 | |||
// Advance to the next character in the source code, retrieving a new
// chunk of source code if needed. This is part of the TSLexer interface
// handed to generated parsers.
static void ts_lexer__advance(TSLexer *_self, bool skip) {
  Lexer *self = (Lexer *)_self;
  // No buffered chunk means EOF (or no input installed): nothing to consume.
  if (!self->chunk) return;

  // LOG's first argument must be a string literal (it is concatenated with
  // a format suffix), hence the if/else instead of a ternary.
  if (skip) {
    LOG("skip", self->data.lookahead)
  } else {
    LOG("consume", self->data.lookahead)
  }

  ts_lexer__do_advance(self, skip);
}
| 221 | |||
| 222 | // Mark that a token match has completed. This can be called multiple | ||
| 223 | // times if a longer match is found later. | ||
| 224 | static void ts_lexer__mark_end(TSLexer *_self) { | ||
| 225 | Lexer *self = (Lexer *)_self; | ||
| 226 | if (!ts_lexer__eof(&self->data)) { | ||
| 227 | // If the lexer is right at the beginning of included range, | ||
| 228 | // then the token should be considered to end at the *end* of the | ||
| 229 | // previous included range, rather than here. | ||
| 230 | TSRange *current_included_range = &self->included_ranges[ | ||
| 231 | self->current_included_range_index | ||
| 232 | ]; | ||
| 233 | if ( | ||
| 234 | self->current_included_range_index > 0 && | ||
| 235 | self->current_position.bytes == current_included_range->start_byte | ||
| 236 | ) { | ||
| 237 | TSRange *previous_included_range = current_included_range - 1; | ||
| 238 | self->token_end_position = (Length) { | ||
| 239 | previous_included_range->end_byte, | ||
| 240 | previous_included_range->end_point, | ||
| 241 | }; | ||
| 242 | return; | ||
| 243 | } | ||
| 244 | } | ||
| 245 | self->token_end_position = self->current_position; | ||
| 246 | } | ||
| 247 | |||
// Compute the zero-based column of the current position, measured in
// decoded characters. Works by rewinding to the start of the current line
// and re-advancing, counting one per character.
static uint32_t ts_lexer__get_column(TSLexer *_self) {
  Lexer *self = (Lexer *)_self;

  uint32_t goal_byte = self->current_position.bytes;

  // Record that column info was requested. NOTE(review): presumably
  // consulted elsewhere in the parser — confirm against callers.
  self->did_get_column = true;
  // Jump back to the first byte of the line (extent.column is byte-based).
  self->current_position.bytes -= self->current_position.extent.column;
  self->current_position.extent.column = 0;

  // The rewound position may fall before the buffered chunk.
  if (self->current_position.bytes < self->chunk_start) {
    ts_lexer__get_chunk(self);
  }

  uint32_t result = 0;
  if (!ts_lexer__eof(_self)) {
    ts_lexer__get_lookahead(self);
    // Re-advance to the goal byte, counting one per decoded character.
    while (self->current_position.bytes < goal_byte && self->chunk) {
      result++;
      ts_lexer__do_advance(self, false);
      if (ts_lexer__eof(_self)) break;
    }
  }

  return result;
}
| 273 | |||
| 274 | // Is the lexer at a boundary between two disjoint included ranges of | ||
| 275 | // source code? This is exposed as an API because some languages' external | ||
| 276 | // scanners need to perform custom actions at these boundaries. | ||
| 277 | static bool ts_lexer__is_at_included_range_start(const TSLexer *_self) { | ||
| 278 | const Lexer *self = (const Lexer *)_self; | ||
| 279 | if (self->current_included_range_index < self->included_range_count) { | ||
| 280 | TSRange *current_range = &self->included_ranges[self->current_included_range_index]; | ||
| 281 | return self->current_position.bytes == current_range->start_byte; | ||
| 282 | } else { | ||
| 283 | return false; | ||
| 284 | } | ||
| 285 | } | ||
| 286 | |||
// Initialize a lexer in place: wire up the TSLexer function-pointer
// interface, zero all internal state, and install the default included
// range (the entire document).
void ts_lexer_init(Lexer *self) {
  *self = (Lexer) {
    .data = {
      // The lexer's methods are stored as struct fields so that generated
      // parsers can call them without needing to be linked against this
      // library.
      .advance = ts_lexer__advance,
      .mark_end = ts_lexer__mark_end,
      .get_column = ts_lexer__get_column,
      .is_at_included_range_start = ts_lexer__is_at_included_range_start,
      .eof = ts_lexer__eof,
      .lookahead = 0,
      .result_symbol = 0,
    },
    .chunk = NULL,
    .chunk_size = 0,
    .chunk_start = 0,
    .current_position = {0, {0, 0}},
    .logger = {
      .payload = NULL,
      .log = NULL
    },
    .included_ranges = NULL,
    .included_range_count = 0,
    .current_included_range_index = 0,
  };
  // Passing NULL/0 installs DEFAULT_RANGE (the whole document).
  ts_lexer_set_included_ranges(self, NULL, 0);
}
| 315 | |||
| 316 | void ts_lexer_delete(Lexer *self) { | ||
| 317 | ts_free(self->included_ranges); | ||
| 318 | } | ||
| 319 | |||
// Install a new input callback. The buffered chunk belongs to the previous
// input, so it is discarded, and the current position is re-resolved
// against the included ranges.
void ts_lexer_set_input(Lexer *self, TSInput input) {
  self->input = input;
  ts_lexer__clear_chunk(self);
  ts_lexer_goto(self, self->current_position);
}
| 325 | |||
| 326 | // Move the lexer to the given position. This doesn't do any work | ||
| 327 | // if the parser is already at the given position. | ||
| 328 | void ts_lexer_reset(Lexer *self, Length position) { | ||
| 329 | if (position.bytes != self->current_position.bytes) { | ||
| 330 | ts_lexer_goto(self, position); | ||
| 331 | } | ||
| 332 | } | ||
| 333 | |||
// Begin lexing a new token at the current position: reset the per-token
// bookkeeping, ensure a chunk and a lookahead character are available, and
// skip a byte-order mark at the very start of the document.
void ts_lexer_start(Lexer *self) {
  self->token_start_position = self->current_position;
  self->token_end_position = LENGTH_UNDEFINED;  // filled in by mark_end later
  self->data.result_symbol = 0;
  self->did_get_column = false;
  if (!ts_lexer__eof(&self->data)) {
    if (!self->chunk_size) ts_lexer__get_chunk(self);
    if (!self->lookahead_size) ts_lexer__get_lookahead(self);
    // A BOM at byte 0 is consumed as if it were skipped whitespace.
    if (
      self->current_position.bytes == 0 &&
      self->data.lookahead == BYTE_ORDER_MARK
    ) ts_lexer__advance(&self->data, true);
  }
}
| 348 | |||
// Finish lexing the current token: resolve its end position and report,
// via `lookahead_end_byte`, how far ahead of the token the lexer looked.
void ts_lexer_finish(Lexer *self, uint32_t *lookahead_end_byte) {
  // If mark_end was never called during lexing, the token ends here.
  if (length_is_undefined(self->token_end_position)) {
    ts_lexer__mark_end(&self->data);
  }

  // If the token ended at an included range boundary, then its end position
  // will have been reset to the end of the preceding range. Reset the start
  // position to match.
  if (self->token_end_position.bytes < self->token_start_position.bytes) {
    self->token_start_position = self->token_end_position;
  }

  // The lexer has examined at least one byte past the current position.
  uint32_t current_lookahead_end_byte = self->current_position.bytes + 1;

  // In order to determine that a byte sequence is invalid UTF8 or UTF16,
  // the character decoding algorithm may have looked at the following byte.
  // Therefore, the next byte *after* the current (invalid) character
  // affects the interpretation of the current character.
  if (self->data.lookahead == TS_DECODE_ERROR) {
    current_lookahead_end_byte++;
  }

  // Only ever grow the caller's lookahead bound.
  if (current_lookahead_end_byte > *lookahead_end_byte) {
    *lookahead_end_byte = current_lookahead_end_byte;
  }
}
| 375 | |||
| 376 | void ts_lexer_advance_to_end(Lexer *self) { | ||
| 377 | while (self->chunk) { | ||
| 378 | ts_lexer__advance(&self->data, false); | ||
| 379 | } | ||
| 380 | } | ||
| 381 | |||
// Public entry point for marking the end of the current token; delegates
// to the internal implementation used by generated parsers.
void ts_lexer_mark_end(Lexer *self) {
  ts_lexer__mark_end(&self->data);
}
| 385 | |||
| 386 | bool ts_lexer_set_included_ranges( | ||
| 387 | Lexer *self, | ||
| 388 | const TSRange *ranges, | ||
| 389 | uint32_t count | ||
| 390 | ) { | ||
| 391 | if (count == 0 || !ranges) { | ||
| 392 | ranges = &DEFAULT_RANGE; | ||
| 393 | count = 1; | ||
| 394 | } else { | ||
| 395 | uint32_t previous_byte = 0; | ||
| 396 | for (unsigned i = 0; i < count; i++) { | ||
| 397 | const TSRange *range = &ranges[i]; | ||
| 398 | if ( | ||
| 399 | range->start_byte < previous_byte || | ||
| 400 | range->end_byte < range->start_byte | ||
| 401 | ) return false; | ||
| 402 | previous_byte = range->end_byte; | ||
| 403 | } | ||
| 404 | } | ||
| 405 | |||
| 406 | size_t size = count * sizeof(TSRange); | ||
| 407 | self->included_ranges = ts_realloc(self->included_ranges, size); | ||
| 408 | memcpy(self->included_ranges, ranges, size); | ||
| 409 | self->included_range_count = count; | ||
| 410 | ts_lexer_goto(self, self->current_position); | ||
| 411 | return true; | ||
| 412 | } | ||
| 413 | |||
// Return the lexer's included ranges, storing their number in `*count`.
// The returned array is owned by the lexer (allocated in
// ts_lexer_set_included_ranges); callers must not free it.
TSRange *ts_lexer_included_ranges(const Lexer *self, uint32_t *count) {
  *count = self->included_range_count;
  return self->included_ranges;
}

#undef LOG
