Add Rust, Go and rename examples to tests

author: Mitja Felicijan <mitja.felicijan@gmail.com> 2026-01-22 00:35:39 +0100
committer: Mitja Felicijan <mitja.felicijan@gmail.com> 2026-01-22 00:35:39 +0100
commit: 52040cc19cbdca48f91d4eb91e9b7a782bb5fbd0 (patch)
tree: 8c6a61f5a6db99c4c7a663e1e2c0f069c3794c4b /vendor/tree-sitter-rust/src/scanner.c
parent: 8ab1da7853f6dd309f2d3677ca109737f929ab4a (diff)
download: crep-52040cc19cbdca48f91d4eb91e9b7a782bb5fbd0.tar.gz
1 files changed, 393 insertions, 0 deletions
diff --git a/vendor/tree-sitter-rust/src/scanner.c b/vendor/tree-sitter-rust/src/scanner.c
new file mode 100644
index 0000000..269f6b2
--- /dev/null
+++ b/vendor/tree-sitter-rust/src/scanner.c
@@ -0,0 +1,393 @@
+#include "tree_sitter/alloc.h"
+#include "tree_sitter/parser.h"
+
+#include <wctype.h>
+
+// External token kinds handled by this scanner. Each enumerator is used both
+// as an index into the `valid_symbols` array passed to the scan function and
+// as the value stored in `lexer->result_symbol`, so the order matters.
+// NOTE(review): the order presumably has to match the grammar's `externals`
+// list -- confirm against grammar.js.
+enum TokenType {
+    STRING_CONTENT,              // produced by process_string
+    RAW_STRING_LITERAL_START,    // produced by scan_raw_string_start
+    RAW_STRING_LITERAL_CONTENT,  // produced by scan_raw_string_content
+    RAW_STRING_LITERAL_END,      // produced by scan_raw_string_end
+    FLOAT_LITERAL,               // produced by process_float_literal
+    BLOCK_OUTER_DOC_MARKER,      // produced by process_block_comment
+    BLOCK_INNER_DOC_MARKER,      // produced by process_block_comment
+    BLOCK_COMMENT_CONTENT,       // produced by process_block_comment
+    LINE_DOC_CONTENT,            // produced by process_line_doc_content
+    ERROR_SENTINEL               // never produced; only tested to detect error recovery
+};
+
+// Scanner state that persists between scan calls. It is the payload written
+// by the serialize function and restored by the deserialize function.
+typedef struct {
+    // Number of '#' characters counted after the `r` by scan_raw_string_start;
+    // the raw string content and end scanners read it to find the terminator.
+    uint8_t opening_hash_count;
+} Scanner;
+
+// Allocates the scanner payload: one zero-initialized Scanner obtained from
+// ts_calloc, returned as an opaque pointer. The parameter list is spelled
+// `(void)` so the definition is a real prototype; in C11 an empty `()` leaves
+// the arguments unspecified.
+void *tree_sitter_rust_external_scanner_create(void) { return ts_calloc(1, sizeof(Scanner)); }
+
+// Frees the scanner payload with ts_free.
+void tree_sitter_rust_external_scanner_destroy(void *payload) {
+    Scanner *state = (Scanner *)payload;
+    ts_free(state);
+}
+
+// Stores the scanner state in `buffer` as a single byte (the opening hash
+// count) and returns the number of bytes written, which is always 1.
+unsigned tree_sitter_rust_external_scanner_serialize(void *payload, char *buffer) {
+    const Scanner *state = (const Scanner *)payload;
+    const unsigned bytes_written = 1;
+
+    *buffer = (char)state->opening_hash_count;
+    return bytes_written;
+}
+
+// Restores scanner state from `buffer`.
+//
+// Only a buffer of exactly one byte -- the size the serialize function
+// writes -- is accepted; for any other `length` the hash count is reset to 0.
+void tree_sitter_rust_external_scanner_deserialize(void *payload, const char *buffer, unsigned length) {
+    Scanner *scanner = (Scanner *)payload;
+
+    // Plain char may be signed, so convert through unsigned char explicitly
+    // rather than relying on an implicit char -> uint8_t conversion.
+    scanner->opening_hash_count = (length == 1) ? (uint8_t)(unsigned char)buffer[0] : 0;
+}
+
+// Reports whether `c` may appear in the digit run of a numeric literal:
+// either an underscore or a digit as classified by iswdigit.
+static inline bool is_num_char(int32_t c) {
+    if (c == '_') {
+        return true;
+    }
+    return iswdigit(c) != 0;
+}
+
+// Moves the lexer forward by one character with the skip flag cleared.
+static inline void advance(TSLexer *lexer) {
+    const bool skip_char = false;
+    lexer->advance(lexer, skip_char);
+}
+
+// Moves the lexer forward by one character with the skip flag set; the scan
+// entry point uses this to step over whitespace.
+static inline void skip(TSLexer *lexer) {
+    const bool skip_char = true;
+    lexer->advance(lexer, skip_char);
+}
+
+// Scans ordinary string content up to, but not including, the next '"' or
+// backslash. Returns true only when at least one character was consumed.
+// Returns false if the input ends before either delimiter is seen.
+static inline bool process_string(TSLexer *lexer) {
+    bool consumed_any = false;
+
+    while (lexer->lookahead != '\"' && lexer->lookahead != '\\') {
+        if (lexer->eof(lexer)) {
+            return false;
+        }
+        advance(lexer);
+        consumed_any = true;
+    }
+
+    lexer->result_symbol = STRING_CONTENT;
+    lexer->mark_end(lexer);
+    return consumed_any;
+}
+
+// Scans the opening delimiter of a raw string literal: an optional `b` or `c`
+// prefix, `r`, zero or more '#', and the opening '"'.
+//
+// On success the number of '#' is stored in `scanner->opening_hash_count`,
+// the result symbol is set to RAW_STRING_LITERAL_START and true is returned.
+// Returns false if the input does not have that shape, or if more than
+// UINT8_MAX '#' are present: the count is kept in a uint8_t (and serialized
+// as one byte), so a longer run would otherwise wrap around and record a
+// wrong delimiter length.
+static inline bool scan_raw_string_start(Scanner *scanner, TSLexer *lexer) {
+    if (lexer->lookahead == 'b' || lexer->lookahead == 'c') {
+        advance(lexer);
+    }
+    if (lexer->lookahead != 'r') {
+        return false;
+    }
+    advance(lexer);
+
+    uint8_t opening_hash_count = 0;
+    while (lexer->lookahead == '#') {
+        if (opening_hash_count == UINT8_MAX) {
+            // One more '#' would overflow the counter; refuse the token.
+            return false;
+        }
+        advance(lexer);
+        opening_hash_count++;
+    }
+
+    if (lexer->lookahead != '"') {
+        return false;
+    }
+    advance(lexer);
+
+    // Only commit the count once the whole delimiter has been recognized.
+    scanner->opening_hash_count = opening_hash_count;
+
+    lexer->result_symbol = RAW_STRING_LITERAL_START;
+    return true;
+}
+
+// Scans the body of a raw string literal. The body ends at the first '"'
+// that is followed by `scanner->opening_hash_count` '#' characters. The
+// token end is marked just before that quote, so the terminator itself is
+// not part of the content. Returns false if the input ends first.
+static inline bool scan_raw_string_content(Scanner *scanner, TSLexer *lexer) {
+    while (!lexer->eof(lexer)) {
+        if (lexer->lookahead != '"') {
+            advance(lexer);
+            continue;
+        }
+
+        // Possible terminator: remember where the content would end, then
+        // look ahead for the required run of '#'.
+        lexer->mark_end(lexer);
+        advance(lexer);
+
+        unsigned matched = 0;
+        for (; matched < scanner->opening_hash_count && lexer->lookahead == '#'; matched++) {
+            advance(lexer);
+        }
+
+        if (matched == scanner->opening_hash_count) {
+            lexer->result_symbol = RAW_STRING_LITERAL_CONTENT;
+            return true;
+        }
+        // Too few '#': the quote was part of the content, keep scanning.
+    }
+    return false;
+}
+
+// Consumes the closing delimiter of a raw string literal: one character for
+// the '"' the caller has already seen in the lookahead, then one per '#'
+// recorded in `scanner->opening_hash_count`. Always returns true.
+static inline bool scan_raw_string_end(Scanner *scanner, TSLexer *lexer) {
+    unsigned hashes_left = scanner->opening_hash_count;
+
+    advance(lexer);
+    while (hashes_left > 0) {
+        advance(lexer);
+        hashes_left--;
+    }
+
+    lexer->result_symbol = RAW_STRING_LITERAL_END;
+    return true;
+}
+
+// Tries to scan a float literal starting at the current lookahead (the scan
+// entry point only calls this when the lookahead is a digit).
+//
+// Accepted shape: a run of digits/underscores, an optional '.' with an
+// optional digit run, an optional exponent (`e`/`E`, optional sign, digits),
+// and an optional suffix made of `u`, `i` or `f` followed by digits.
+//
+// Returns false when the text has neither a '.' nor an exponent, or when the
+// '.' is followed by a letter or by another '.'. Otherwise returns true with
+// result_symbol set to FLOAT_LITERAL. mark_end is called at every position
+// where the literal may legitimately stop; per the tree-sitter lexer API the
+// token ends at the most recent mark, so characters consumed after the last
+// mark are not part of the returned token.
+static inline bool process_float_literal(TSLexer *lexer) {
+    lexer->result_symbol = FLOAT_LITERAL;
+
+    // Integer part: the first character is consumed unconditionally.
+    advance(lexer);
+    while (is_num_char(lexer->lookahead)) {
+        advance(lexer);
+    }
+
+    bool has_fraction = false, has_exponent = false;
+
+    if (lexer->lookahead == '.') {
+        // A '.' counts as a fraction even if no digits follow it.
+        has_fraction = true;
+        advance(lexer);
+        if (iswalpha(lexer->lookahead)) {
+            // The dot is followed by a letter: 1.max(2) => not a float but an integer
+            return false;
+        }
+
+        // A second '.' (as in `1..2`) means the first dot was not a
+        // fraction point.
+        if (lexer->lookahead == '.') {
+            return false;
+        }
+        while (is_num_char(lexer->lookahead)) {
+            advance(lexer);
+        }
+    }
+
+    // The literal may end here, before any exponent or suffix.
+    lexer->mark_end(lexer);
+
+    if (lexer->lookahead == 'e' || lexer->lookahead == 'E') {
+        has_exponent = true;
+        advance(lexer);
+        if (lexer->lookahead == '+' || lexer->lookahead == '-') {
+            advance(lexer);
+        }
+        // NOTE(review): an `e` with no digit after it still returns true,
+        // with the token ending at the mark taken before the `e` -- even
+        // when there was no fraction. Confirm this is intended.
+        if (!is_num_char(lexer->lookahead)) {
+            return true;
+        }
+        advance(lexer);
+        while (is_num_char(lexer->lookahead)) {
+            advance(lexer);
+        }
+
+        // Extend the token to include the complete exponent.
+        lexer->mark_end(lexer);
+    }
+
+    // Digits alone are not a float.
+    if (!has_exponent && !has_fraction) {
+        return false;
+    }
+
+    // Optional suffix: one of `u`, `i`, `f` followed by at least one digit.
+    if (lexer->lookahead != 'u' && lexer->lookahead != 'i' && lexer->lookahead != 'f') {
+        return true;
+    }
+    advance(lexer);
+    // A suffix letter without digits is left out of the token (no new mark).
+    if (!iswdigit(lexer->lookahead)) {
+        return true;
+    }
+
+    while (iswdigit(lexer->lookahead)) {
+        advance(lexer);
+    }
+
+    lexer->mark_end(lexer);
+    return true;
+}
+
+// Consumes the remainder of a line doc comment and reports it as
+// LINE_DOC_CONTENT. Scanning stops after the first '\n' (which is consumed)
+// or at end of input. Always returns true.
+static inline bool process_line_doc_content(TSLexer *lexer) {
+    lexer->result_symbol = LINE_DOC_CONTENT;
+
+    while (!lexer->eof(lexer)) {
+        const bool at_newline = lexer->lookahead == '\n';
+        // The newline is kept inside the node: line endings are useful for
+        // markdown injection.
+        advance(lexer);
+        if (at_newline) {
+            break;
+        }
+    }
+    return true;
+}
+
+// State of the block comment scanner, determined by the character that was
+// processed immediately before the current one.
+typedef enum {
+    LeftForwardSlash,  // previous character was '/'
+    LeftAsterisk,      // previous character was '*'
+    Continuing,        // previous character starts no delimiter
+} BlockCommentState;
+
+// Working data for scanning the content of a (possibly nested) block comment.
+typedef struct {
+    BlockCommentState state;
+    // Incremented when '*' follows '/', decremented when '/' follows '*';
+    // process_block_comment stops scanning once it reaches 0.
+    unsigned nestingDepth;
+} BlockCommentProcessing;
+
+// Handles `current` when the previous character was '/': a '*' here opens a
+// nested block comment, so the nesting depth grows by one. Whatever
+// `current` is, the state goes back to Continuing.
+static inline void process_left_forward_slash(BlockCommentProcessing *processing, char current) {
+    if (current == '*') {
+        processing->nestingDepth += 1;
+    }
+    processing->state = Continuing;
+}
+
+// Handles `current` when the previous character was '*'.
+//   '*'  -> still a potential closer: mark the token end here and stay in
+//           LeftAsterisk.
+//   '/'  -> a "*/" closer: decrease the nesting depth, go to Continuing.
+//   else -> go to Continuing.
+static inline void process_left_asterisk(BlockCommentProcessing *processing, char current, TSLexer *lexer) {
+    switch (current) {
+        case '*':
+            lexer->mark_end(lexer);
+            processing->state = LeftAsterisk;
+            return;
+        case '/':
+            processing->nestingDepth -= 1;
+            break;
+        default:
+            break;
+    }
+
+    processing->state = Continuing;
+}
+
+// Handles `current` when the previous character started no delimiter:
+// '/' and '*' each move to their own state, anything else leaves the state
+// untouched.
+static inline void process_continuing(BlockCommentProcessing *processing, char current) {
+    if (current == '/') {
+        processing->state = LeftForwardSlash;
+    } else if (current == '*') {
+        processing->state = LeftAsterisk;
+    }
+}
+
+// Scans one token inside a block comment, chosen from the current lookahead
+// and from which block comment symbols are valid:
+//   - BLOCK_INNER_DOC_MARKER: valid and the lookahead is '!'.
+//   - BLOCK_OUTER_DOC_MARKER: valid, the lookahead is '*', and the character
+//     after it is neither '/' nor '*'.
+//   - BLOCK_COMMENT_CONTENT: valid; characters are consumed until the nesting
+//     depth (starting at 1) drops to 0 or the input ends.
+// Returns false when none of these applies, or when the comment turns out to
+// be empty.
+// NOTE(review): this presumably runs with the lexer positioned just after the
+// opening "/*" -- confirm against the grammar.
+static inline bool process_block_comment(TSLexer *lexer, const bool *valid_symbols) {
+    char first = (char)lexer->lookahead;
+    // The first character is stored so we can safely advance inside
+    // these if blocks. However, because we only store one, we can only
+    // safely advance 1 time. Since there's a chance that an advance could
+    // happen in one state, we must advance in all states to ensure that
+    // the program ends up in a sane state prior to processing the block
+    // comment if need be.
+    if (valid_symbols[BLOCK_INNER_DOC_MARKER] && first == '!') {
+        lexer->result_symbol = BLOCK_INNER_DOC_MARKER;
+        advance(lexer);
+        return true;
+    }
+    if (valid_symbols[BLOCK_OUTER_DOC_MARKER] && first == '*') {
+        advance(lexer);
+        lexer->mark_end(lexer);
+        // If the next token is a / that means that it's an empty block comment.
+        if (lexer->lookahead == '/') {
+            return false;
+        }
+        // If the next token is a * that means that this isn't a BLOCK_OUTER_DOC_MARKER
+        // as BLOCK_OUTER_DOC_MARKER's only have 2 * not 3 or more.
+        if (lexer->lookahead != '*') {
+            lexer->result_symbol = BLOCK_OUTER_DOC_MARKER;
+            return true;
+        }
+    } else {
+        advance(lexer);
+    }
+
+    if (valid_symbols[BLOCK_COMMENT_CONTENT]) {
+        // Depth starts at 1: scanning stops when it reaches 0.
+        BlockCommentProcessing processing = {Continuing, 1};
+        // Manually set the current state based on the first character
+        switch (first) {
+            case '*':
+                processing.state = LeftAsterisk;
+                if (lexer->lookahead == '/') {
+                    // This case can happen in an empty doc block comment
+                    // like /*!*/. The comment has no contents, so bail.
+                    return false;
+                }
+                break;
+            case '/':
+                processing.state = LeftForwardSlash;
+                break;
+            default:
+                processing.state = Continuing;
+                break;
+        }
+
+        // For the purposes of actually parsing rust code, this
+        // is incorrect as it considers an unterminated block comment
+        // to be an error. However, for the purposes of syntax highlighting
+        // this should be considered successful as otherwise you are not able
+        // to syntax highlight a block of code prior to closing the
+        // block comment
+        while (!lexer->eof(lexer) && processing.nestingDepth != 0) {
+            // Set first to the current lookahead as that is the second character
+            // as we force an advance in the above code when we are checking if we
+            // need to handle a block comment inner or outer doc comment signifier
+            // node
+            first = (char)lexer->lookahead;
+            switch (processing.state) {
+                case LeftForwardSlash:
+                    process_left_forward_slash(&processing, first);
+                    break;
+                case LeftAsterisk:
+                    process_left_asterisk(&processing, first, lexer);
+                    break;
+                case Continuing:
+                    // Mark before handling the character, so a '*' that
+                    // begins the final "*/" is left outside the token.
+                    lexer->mark_end(lexer);
+                    process_continuing(&processing, first);
+                    break;
+                default:
+                    break;
+            }
+            advance(lexer);
+            // A '/' that did not close the outermost comment belongs to the
+            // content, so move the token end past it.
+            if (first == '/' && processing.nestingDepth != 0) {
+                lexer->mark_end(lexer);
+            }
+        }
+        lexer->result_symbol = BLOCK_COMMENT_CONTENT;
+        return true;
+    }
+
+    return false;
+}
+
+// Entry point of the external scanner: picks the sub-scanner to run from the
+// set of currently valid symbols and the lookahead character. Returns true
+// when a token was recognized (the sub-scanner sets `lexer->result_symbol`).
+bool tree_sitter_rust_external_scanner_scan(void *payload, TSLexer *lexer, const bool *valid_symbols) {
+    // During error recovery Tree-sitter calls the scanner with every symbol
+    // marked valid. ERROR_SENTINEL is an otherwise unused symbol, so seeing
+    // it valid identifies that situation. There is nothing useful this
+    // scanner can do to help recovery, so it simply declines.
+    // See https://tree-sitter.github.io/tree-sitter/creating-parsers#external-scanners
+    if (valid_symbols[ERROR_SENTINEL]) {
+        return false;
+    }
+
+    const bool block_comment_expected = valid_symbols[BLOCK_COMMENT_CONTENT] ||
+                                        valid_symbols[BLOCK_INNER_DOC_MARKER] ||
+                                        valid_symbols[BLOCK_OUTER_DOC_MARKER];
+    if (block_comment_expected) {
+        return process_block_comment(lexer, valid_symbols);
+    }
+
+    if (!valid_symbols[FLOAT_LITERAL] && valid_symbols[STRING_CONTENT]) {
+        return process_string(lexer);
+    }
+
+    if (valid_symbols[LINE_DOC_CONTENT]) {
+        return process_line_doc_content(lexer);
+    }
+
+    // The remaining tokens may be preceded by whitespace; step over it.
+    while (iswspace(lexer->lookahead)) {
+        skip(lexer);
+    }
+
+    Scanner *scanner = (Scanner *)payload;
+    const int32_t next = lexer->lookahead;
+
+    if (valid_symbols[RAW_STRING_LITERAL_START] && (next == 'r' || next == 'b' || next == 'c')) {
+        return scan_raw_string_start(scanner, lexer);
+    }
+
+    if (valid_symbols[RAW_STRING_LITERAL_CONTENT]) {
+        return scan_raw_string_content(scanner, lexer);
+    }
+
+    if (valid_symbols[RAW_STRING_LITERAL_END] && next == '"') {
+        return scan_raw_string_end(scanner, lexer);
+    }
+
+    if (valid_symbols[FLOAT_LITERAL] && iswdigit(next)) {
+        return process_float_literal(lexer);
+    }
+
+    return false;
+}
author	Mitja Felicijan <mitja.felicijan@gmail.com>	2026-01-22 00:35:39 +0100
committer	Mitja Felicijan <mitja.felicijan@gmail.com>	2026-01-22 00:35:39 +0100
commit	52040cc19cbdca48f91d4eb91e9b7a782bb5fbd0 (patch)
tree	8c6a61f5a6db99c4c7a663e1e2c0f069c3794c4b /vendor/tree-sitter-rust/src/scanner.c
parent	8ab1da7853f6dd309f2d3677ca109737f929ab4a (diff)
download	crep-52040cc19cbdca48f91d4eb91e9b7a782bb5fbd0.tar.gz