1#include <stdio.h>
  2#include "./lexer.h"
  3#include "./subtree.h"
  4#include "./length.h"
  5#include "./unicode.h"
  6
// Log a message about the character currently being processed, via the
// lexer's optional logger callback. Printable ASCII (codes 32..126) is
// formatted as a character literal; anything else as a decimal code point.
// NOTE(review): this is a multi-statement macro written as a bare `if`.
// An unbraced `if (x) LOG(...) else ...` at a call site would mis-bind the
// `else`; call sites in this file are all braced and omit the trailing
// semicolon, which is also why the usual `do { ... } while (0)` wrapper
// cannot be used here without changing them.
#define LOG(message, character)              \
  if (self->logger.log) {                    \
    snprintf(                                \
      self->debug_buffer,                    \
      TREE_SITTER_SERIALIZATION_BUFFER_SIZE, \
      32 <= character && character < 127 ?   \
        message " character:'%c'" :          \
        message " character:%d",             \
      character                              \
    );                                       \
    self->logger.log(                        \
      self->logger.payload,                  \
      TSLogTypeLex,                          \
      self->debug_buffer                     \
    );                                       \
  }
 23
// The Unicode byte-order-mark code point; skipped at byte offset 0 by
// `ts_lexer_start` so it never becomes part of a token.
static const int32_t BYTE_ORDER_MARK = 0xFEFF;

// The range used when no explicit included ranges are set: a single range
// covering the entire document.
static const TSRange DEFAULT_RANGE = {
  .start_point = {
    .row = 0,
    .column = 0,
  },
  .end_point = {
    .row = UINT32_MAX,
    .column = UINT32_MAX,
  },
  .start_byte = 0,
  .end_byte = UINT32_MAX
};
 38
 39// Check if the lexer has reached EOF. This state is stored
 40// by setting the lexer's `current_included_range_index` such that
 41// it has consumed all of its available ranges.
 42static bool ts_lexer__eof(const TSLexer *_self) {
 43  Lexer *self = (Lexer *)_self;
 44  return self->current_included_range_index == self->included_range_count;
 45}
 46
 47// Clear the currently stored chunk of source code, because the lexer's
 48// position has changed.
 49static void ts_lexer__clear_chunk(Lexer *self) {
 50  self->chunk = NULL;
 51  self->chunk_size = 0;
 52  self->chunk_start = 0;
 53}
 54
 55// Call the lexer's input callback to obtain a new chunk of source code
 56// for the current position.
 57static void ts_lexer__get_chunk(Lexer *self) {
 58  self->chunk_start = self->current_position.bytes;
 59  self->chunk = self->input.read(
 60    self->input.payload,
 61    self->current_position.bytes,
 62    self->current_position.extent,
 63    &self->chunk_size
 64  );
 65  if (!self->chunk_size) {
 66    self->current_included_range_index = self->included_range_count;
 67    self->chunk = NULL;
 68  }
 69}
 70
// Decode the next unicode character in the current chunk of source code.
// This assumes that the lexer has already retrieved a chunk of source
// code that spans the current position.
static void ts_lexer__get_lookahead(Lexer *self) {
  uint32_t position_in_chunk = self->current_position.bytes - self->chunk_start;
  uint32_t size = self->chunk_size - position_in_chunk;

  // No bytes left in this chunk: report a synthetic NUL of width 1.
  if (size == 0) {
    self->lookahead_size = 1;
    self->data.lookahead = '\0';
    return;
  }

  const uint8_t *chunk = (const uint8_t *)self->chunk + position_in_chunk;
  UnicodeDecodeFunction decode = self->input.encoding == TSInputEncodingUTF8
    ? ts_decode_utf8
    : ts_decode_utf16;

  self->lookahead_size = decode(chunk, size, &self->data.lookahead);

  // If this chunk ended in the middle of a multi-byte character,
  // try again with a fresh chunk. The `size < 4` test distinguishes
  // truncation from genuinely invalid input, since no UTF-8/UTF-16
  // sequence is longer than four bytes.
  // NOTE(review): this assumes the re-read chunk is non-NULL; if the read
  // returns nothing, `decode(NULL, 0, ...)` is invoked — confirm the
  // decoders tolerate an empty buffer.
  if (self->data.lookahead == TS_DECODE_ERROR && size < 4) {
    ts_lexer__get_chunk(self);
    chunk = (const uint8_t *)self->chunk;
    size = self->chunk_size;
    self->lookahead_size = decode(chunk, size, &self->data.lookahead);
  }

  // Invalid byte sequence: consume one byte so lexing can make progress.
  if (self->data.lookahead == TS_DECODE_ERROR) {
    self->lookahead_size = 1;
  }
}
104
// Move the lexer to the given position, snapping it into the set of
// included ranges: a position in the gap between ranges lands at the start
// of the next non-empty range; a position beyond all ranges puts the lexer
// in its EOF state.
static void ts_lexer_goto(Lexer *self, Length position) {
  self->current_position = position;

  // Move to the first valid position at or after the given position.
  bool found_included_range = false;
  for (unsigned i = 0; i < self->included_range_count; i++) {
    TSRange *included_range = &self->included_ranges[i];
    // Skip ranges that end at or before the position, and empty ranges.
    if (
      included_range->end_byte > self->current_position.bytes &&
      included_range->end_byte > included_range->start_byte
    ) {
      // The position falls in the gap before this range: snap forward to
      // the range's start.
      if (included_range->start_byte >= self->current_position.bytes) {
        self->current_position = (Length) {
          .bytes = included_range->start_byte,
          .extent = included_range->start_point,
        };
      }

      self->current_included_range_index = i;
      found_included_range = true;
      break;
    }
  }

  if (found_included_range) {
    // If the current position is outside of the current chunk of text,
    // then clear out the current chunk of text.
    if (self->chunk && (
      self->current_position.bytes < self->chunk_start ||
      self->current_position.bytes >= self->chunk_start + self->chunk_size
    )) {
      ts_lexer__clear_chunk(self);
    }

    // Force the next read to re-decode the lookahead character.
    self->lookahead_size = 0;
    self->data.lookahead = '\0';
  }

  // If the given position is beyond any of included ranges, move to the EOF
  // state - past the end of the included ranges.
  else {
    self->current_included_range_index = self->included_range_count;
    TSRange *last_included_range = &self->included_ranges[self->included_range_count - 1];
    self->current_position = (Length) {
      .bytes = last_included_range->end_byte,
      .extent = last_included_range->end_point,
    };
    ts_lexer__clear_chunk(self);
    // At EOF the lookahead is a synthetic NUL of width 1.
    self->lookahead_size = 1;
    self->data.lookahead = '\0';
  }
}
157
// Intended to be called only from functions that control logging.
// Consume the current lookahead character: advance the byte/point position,
// hop across included-range boundaries as needed, then decode the next
// lookahead character (or enter the EOF state).
static void ts_lexer__do_advance(Lexer *self, bool skip) {
  if (self->lookahead_size) {
    self->current_position.bytes += self->lookahead_size;
    if (self->data.lookahead == '\n') {
      self->current_position.extent.row++;
      self->current_position.extent.column = 0;
    } else {
      // Columns are counted in bytes: a multi-byte character advances the
      // column by its encoded length.
      self->current_position.extent.column += self->lookahead_size;
    }
  }

  // If the position has moved past the end of the current included range
  // (or the range is empty), step forward to the start of the next
  // non-empty range. `current_range` ends up NULL once the ranges are
  // exhausted.
  const TSRange *current_range = &self->included_ranges[self->current_included_range_index];
  while (
    self->current_position.bytes >= current_range->end_byte ||
    current_range->end_byte == current_range->start_byte
  ) {
    if (self->current_included_range_index < self->included_range_count) {
      self->current_included_range_index++;
    }
    if (self->current_included_range_index < self->included_range_count) {
      current_range++;
      self->current_position = (Length) {
        current_range->start_byte,
        current_range->start_point,
      };
    } else {
      current_range = NULL;
      break;
    }
  }

  // When skipping (e.g. whitespace), the token has not started yet, so the
  // token start tracks the current position.
  if (skip) self->token_start_position = self->current_position;

  if (current_range) {
    // Refresh the chunk if the new position is outside of it, then decode
    // the next lookahead character.
    if (
      self->current_position.bytes < self->chunk_start ||
      self->current_position.bytes >= self->chunk_start + self->chunk_size
    ) {
      ts_lexer__get_chunk(self);
    }
    ts_lexer__get_lookahead(self);
  } else {
    // EOF: synthesize a NUL lookahead of width 1.
    ts_lexer__clear_chunk(self);
    self->data.lookahead = '\0';
    self->lookahead_size = 1;
  }
}
206
// Advance to the next character in the source code, retrieving a new
// chunk of source code if needed. No-op once the lexer has run out of
// source text (chunk is NULL).
static void ts_lexer__advance(TSLexer *_self, bool skip) {
  Lexer *self = (Lexer *)_self;
  if (!self->chunk) return;

  // `LOG` expands to a braced `if`, hence no trailing semicolons here.
  if (skip) {
    LOG("skip", self->data.lookahead)
  } else {
    LOG("consume", self->data.lookahead)
  }

  ts_lexer__do_advance(self, skip);
}
221
// Mark that a token match has completed. This can be called multiple
// times if a longer match is found later.
static void ts_lexer__mark_end(TSLexer *_self) {
  Lexer *self = (Lexer *)_self;
  if (!ts_lexer__eof(&self->data)) {
    // If the lexer is right at the beginning of included range,
    // then the token should be considered to end at the *end* of the
    // previous included range, rather than here.
    TSRange *current_included_range = &self->included_ranges[
      self->current_included_range_index
    ];
    if (
      self->current_included_range_index > 0 &&
      self->current_position.bytes == current_included_range->start_byte
    ) {
      TSRange *previous_included_range = current_included_range - 1;
      self->token_end_position = (Length) {
        previous_included_range->end_byte,
        previous_included_range->end_point,
      };
      return;
    }
  }
  // Normal case: the token ends at the lexer's current position.
  self->token_end_position = self->current_position;
}
247
// Return the number of characters (not bytes) between the start of the
// current line and the lexer's current position, by rewinding to column 0
// and re-advancing while counting decoded characters.
static uint32_t ts_lexer__get_column(TSLexer *_self) {
  Lexer *self = (Lexer *)_self;

  uint32_t goal_byte = self->current_position.bytes;

  // Record that a column was queried; presumably consulted elsewhere for
  // re-lexing decisions — confirm against callers.
  self->did_get_column = true;
  // Jump back to the start of the line. The column field is byte-based
  // (see ts_lexer__do_advance), so this subtraction lands exactly on
  // column 0 of the same row.
  self->current_position.bytes -= self->current_position.extent.column;
  self->current_position.extent.column = 0;

  if (self->current_position.bytes < self->chunk_start) {
    ts_lexer__get_chunk(self);
  }
  // NOTE(review): if `self->chunk` is NULL with chunk_start == 0, the guard
  // above does not re-read a chunk — confirm this is only called while a
  // chunk is loaded.

  uint32_t result = 0;
  if (!ts_lexer__eof(_self)) {
    ts_lexer__get_lookahead(self);
    // Re-advance one character at a time until the original byte offset is
    // regained, counting characters along the way.
    while (self->current_position.bytes < goal_byte && self->chunk) {
      result++;
      ts_lexer__do_advance(self, false);
      if (ts_lexer__eof(_self)) break;
    }
  }

  return result;
}
273
274// Is the lexer at a boundary between two disjoint included ranges of
275// source code? This is exposed as an API because some languages' external
276// scanners need to perform custom actions at these boundaries.
277static bool ts_lexer__is_at_included_range_start(const TSLexer *_self) {
278  const Lexer *self = (const Lexer *)_self;
279  if (self->current_included_range_index < self->included_range_count) {
280    TSRange *current_range = &self->included_ranges[self->current_included_range_index];
281    return self->current_position.bytes == current_range->start_byte;
282  } else {
283    return false;
284  }
285}
286
287void ts_lexer_init(Lexer *self) {
288  *self = (Lexer) {
289    .data = {
290      // The lexer's methods are stored as struct fields so that generated
291      // parsers can call them without needing to be linked against this
292      // library.
293      .advance = ts_lexer__advance,
294      .mark_end = ts_lexer__mark_end,
295      .get_column = ts_lexer__get_column,
296      .is_at_included_range_start = ts_lexer__is_at_included_range_start,
297      .eof = ts_lexer__eof,
298      .lookahead = 0,
299      .result_symbol = 0,
300    },
301    .chunk = NULL,
302    .chunk_size = 0,
303    .chunk_start = 0,
304    .current_position = {0, {0, 0}},
305    .logger = {
306      .payload = NULL,
307      .log = NULL
308    },
309    .included_ranges = NULL,
310    .included_range_count = 0,
311    .current_included_range_index = 0,
312  };
313  ts_lexer_set_included_ranges(self, NULL, 0);
314}
315
// Release the lexer's heap-allocated state (its copy of the included
// ranges, allocated by ts_lexer_set_included_ranges).
void ts_lexer_delete(Lexer *self) {
  ts_free(self->included_ranges);
}
319
// Install a new input callback, invalidating any cached source text and
// re-resolving the current position against the included ranges.
void ts_lexer_set_input(Lexer *self, TSInput input) {
  self->input = input;
  ts_lexer__clear_chunk(self);
  ts_lexer_goto(self, self->current_position);
}
325
326// Move the lexer to the given position. This doesn't do any work
327// if the parser is already at the given position.
328void ts_lexer_reset(Lexer *self, Length position) {
329  if (position.bytes != self->current_position.bytes) {
330    ts_lexer_goto(self, position);
331  }
332}
333
// Prepare to lex a token at the current position: reset the token
// boundaries and result symbol, ensure a chunk and lookahead character are
// loaded, and skip a byte-order mark at the very start of the document.
void ts_lexer_start(Lexer *self) {
  self->token_start_position = self->current_position;
  self->token_end_position = LENGTH_UNDEFINED;
  self->data.result_symbol = 0;
  self->did_get_column = false;
  if (!ts_lexer__eof(&self->data)) {
    if (!self->chunk_size) ts_lexer__get_chunk(self);
    if (!self->lookahead_size) ts_lexer__get_lookahead(self);
    // A leading BOM is skipped (advanced past with skip=true) so it never
    // becomes part of a token.
    if (
      self->current_position.bytes == 0 &&
      self->data.lookahead == BYTE_ORDER_MARK
    ) ts_lexer__advance(&self->data, true);
  }
}
348
// Finish lexing a token: ensure the end position is set, pull the start
// position back across an included-range boundary if needed, and report
// (via `lookahead_end_byte`) the furthest byte the lexer examined.
void ts_lexer_finish(Lexer *self, uint32_t *lookahead_end_byte) {
  if (length_is_undefined(self->token_end_position)) {
    ts_lexer__mark_end(&self->data);
  }

  // If the token ended at an included range boundary, then its end position
  // will have been reset to the end of the preceding range. Reset the start
  // position to match.
  if (self->token_end_position.bytes < self->token_start_position.bytes) {
    self->token_start_position = self->token_end_position;
  }

  // The lexer observed at least one byte beyond its current position.
  uint32_t current_lookahead_end_byte = self->current_position.bytes + 1;

  // In order to determine that a byte sequence is invalid UTF8 or UTF16,
  // the character decoding algorithm may have looked at the following byte.
  // Therefore, the next byte *after* the current (invalid) character
  // affects the interpretation of the current character.
  if (self->data.lookahead == TS_DECODE_ERROR) {
    current_lookahead_end_byte++;
  }

  // Only ever widen the caller's recorded lookahead extent.
  if (current_lookahead_end_byte > *lookahead_end_byte) {
    *lookahead_end_byte = current_lookahead_end_byte;
  }
}
375
376void ts_lexer_advance_to_end(Lexer *self) {
377  while (self->chunk) {
378    ts_lexer__advance(&self->data, false);
379  }
380}
381
// Public wrapper over ts_lexer__mark_end: record the current position as
// the (provisional) end of the token being lexed.
void ts_lexer_mark_end(Lexer *self) {
  ts_lexer__mark_end(&self->data);
}
385
// Replace the lexer's included ranges with a copy of `ranges`. Passing
// NULL or a zero count installs DEFAULT_RANGE (the whole document).
// Returns false — leaving the previous ranges in place — if the ranges are
// not sorted and non-overlapping by byte offset.
bool ts_lexer_set_included_ranges(
  Lexer *self,
  const TSRange *ranges,
  uint32_t count
) {
  if (count == 0 || !ranges) {
    ranges = &DEFAULT_RANGE;
    count = 1;
  } else {
    // Validate: each range starts at or after the previous range's end and
    // does not end before it starts. Only byte offsets are checked; the
    // row/column points are trusted as given.
    uint32_t previous_byte = 0;
    for (unsigned i = 0; i < count; i++) {
      const TSRange *range = &ranges[i];
      if (
        range->start_byte < previous_byte ||
        range->end_byte < range->start_byte
      ) return false;
      previous_byte = range->end_byte;
    }
  }

  // Keep a private copy of the ranges, then re-resolve the current
  // position against the new set.
  size_t size = count * sizeof(TSRange);
  self->included_ranges = ts_realloc(self->included_ranges, size);
  memcpy(self->included_ranges, ranges, size);
  self->included_range_count = count;
  ts_lexer_goto(self, self->current_position);
  return true;
}
413
// Expose the lexer's included ranges: writes the count to `*count` and
// returns the internal array (not a copy — the caller must not free it).
TSRange *ts_lexer_included_ranges(const Lexer *self, uint32_t *count) {
  *count = self->included_range_count;
  return self->included_ranges;
}
418
419#undef LOG