1#include <stdbool.h>
  2#include <stdlib.h>
  3#include <string.h>
  4#include <wctype.h>
  5
  6#include "parser.h"
  7
  8#define MAX_HEREDOCS 10
  9#define DEL_SPACE 512
 10
 11typedef struct {
 12    bool in_heredoc;
 13    bool stripping_heredoc;
 14    unsigned heredoc_count;
 15    char *heredocs[MAX_HEREDOCS];
 16} scanner_state;
 17
 18enum TokenType {
 19    HEREDOC_MARKER,
 20    HEREDOC_LINE,
 21    HEREDOC_END,
 22    HEREDOC_NL,
 23    ERROR_SENTINEL,
 24};
 25
 26void *tree_sitter_dockerfile_external_scanner_create() {
 27    scanner_state *state = malloc(sizeof(scanner_state));
 28    memset(state, 0, sizeof(scanner_state));
 29    return state;
 30}
 31
 32void tree_sitter_dockerfile_external_scanner_destroy(void *payload) {
 33    if (!payload)
 34        return;
 35
 36    scanner_state *state = payload;
 37    for (unsigned i = 0; i < MAX_HEREDOCS; i++) {
 38        if (state->heredocs[i]) {
 39            free(state->heredocs[i]);
 40        }
 41    }
 42
 43    free(state);
 44}
 45
 46unsigned tree_sitter_dockerfile_external_scanner_serialize(void *payload,
 47                                                           char *buffer) {
 48    scanner_state *state = payload;
 49
 50    unsigned pos = 0;
 51    buffer[pos++] = state->in_heredoc;
 52    buffer[pos++] = state->stripping_heredoc;
 53
 54    for (unsigned i = 0; i < state->heredoc_count; i++) {
 55        // Add the ending null byte to the length since we'll have to copy it as
 56        // well.
 57        unsigned len = strlen(state->heredocs[i]) + 1;
 58
 59        // If we run out of space, just drop the heredocs that don't fit.
 60        // We need at least len + 1 bytes space since we'll copy len bytes below
 61        // and later add a null byte at the end.
 62        if (pos + len + 1 > TREE_SITTER_SERIALIZATION_BUFFER_SIZE) {
 63            break;
 64        }
 65
 66        memcpy(&buffer[pos], state->heredocs[i], len);
 67        pos += len;
 68    }
 69
 70    // Add a null byte at the end to make it easy to detect.
 71    buffer[pos++] = 0;
 72    return pos;
 73}
 74
 75void tree_sitter_dockerfile_external_scanner_deserialize(void *payload,
 76                                                         const char *buffer,
 77                                                         unsigned length) {
 78    scanner_state *state = payload;
 79    // Free all current heredocs to avoid leaking memory when we overwrite the
 80    // array later.
 81    for (unsigned i = 0; i < state->heredoc_count; i++) {
 82        free(state->heredocs[i]);
 83        state->heredocs[i] = NULL;
 84    }
 85
 86    if (length == 0) {
 87        state->in_heredoc = false;
 88        state->stripping_heredoc = false;
 89        state->heredoc_count = 0;
 90    } else {
 91        unsigned pos = 0;
 92        state->in_heredoc = buffer[pos++];
 93        state->stripping_heredoc = buffer[pos++];
 94
 95        unsigned heredoc_count = 0;
 96        for (unsigned i = 0; i < MAX_HEREDOCS; i++) {
 97            unsigned len = strlen(&buffer[pos]);
 98
 99            // We found the ending null byte which means that we're done.
100            if (len == 0)
101                break;
102
103            // Account for the ending null byte in strings (again).
104            len++;
105            char *heredoc = malloc(len);
106            memcpy(heredoc, &buffer[pos], len);
107            state->heredocs[i] = heredoc;
108            heredoc_count++;
109
110            pos += len;
111        }
112
113        state->heredoc_count = heredoc_count;
114    }
115}
116
117static void skip_whitespace(TSLexer *lexer) {
118    while (lexer->lookahead != '\0' && lexer->lookahead != '\n' &&
119           iswspace(lexer->lookahead))
120        lexer->advance(lexer, true);
121}
122
123static bool scan_marker(scanner_state *state, TSLexer *lexer) {
124    skip_whitespace(lexer);
125
126    if (lexer->lookahead != '<')
127        return false;
128    lexer->advance(lexer, false);
129
130    if (lexer->lookahead != '<')
131        return false;
132    lexer->advance(lexer, false);
133
134    bool stripping = false;
135    if (lexer->lookahead == '-') {
136        stripping = true;
137        lexer->advance(lexer, false);
138    }
139
140    int32_t quote = 0;
141    if (lexer->lookahead == '"' || lexer->lookahead == '\'') {
142        quote = lexer->lookahead;
143        lexer->advance(lexer, false);
144    }
145
146    // Reserve a reasonable amount of space for the heredoc delimiter string.
147    // Most heredocs (like EOF, EOT, EOS, FILE, etc.) are pretty short so we'll
148    // usually only need a few bytes. We're also limited to less than 1024 bytes
149    // by tree-sitter since our state has to fit in
150    // TREE_SITTER_SERIALIZATION_BUFFER_SIZE.
151    char delimiter[DEL_SPACE];
152
153    // We start recording the actual string at position 1 since we store whether
154    // it's a stripping heredoc in the first position (with either a dash or a
155    // space).
156    unsigned del_idx = 1;
157
158    while (lexer->lookahead != '\0' &&
159           (quote ? lexer->lookahead != quote : !iswspace(lexer->lookahead))) {
160        if (lexer->lookahead == '\\') {
161            lexer->advance(lexer, false);
162
163            if (lexer->lookahead == '\0') {
164                return false;
165            }
166        }
167
168        if (del_idx > 0) {
169            delimiter[del_idx++] = lexer->lookahead;
170        }
171        lexer->advance(lexer, false);
172
173        // If we run out of space, stop recording the delimiter but keep
174        // advancing the lexer to ensure that we at least parse the marker
175        // correctly. Reserve two bytes: one for the strip indicator and
176        // one for the terminating null byte.
177        if (del_idx >= DEL_SPACE - 2) {
178            del_idx = 0;
179        }
180    }
181
182    if (quote) {
183        if (lexer->lookahead != quote) {
184            return false;
185        }
186        lexer->advance(lexer, false);
187    }
188
189    if (del_idx == 0) {
190        lexer->result_symbol = HEREDOC_MARKER;
191        return true;
192    }
193
194    delimiter[0] = stripping ? '-' : ' ';
195    delimiter[del_idx] = '\0';
196
197    // We copy the delimiter string to the heap here since we can't store our
198    // stack-allocated string in our state (which is stored on the heap).
199    char *del_copy = malloc(del_idx + 1);
200    memcpy(del_copy, delimiter, del_idx + 1);
201
202    if (state->heredoc_count == 0) {
203        state->heredoc_count = 1;
204        state->heredocs[0] = del_copy;
205        state->stripping_heredoc = stripping;
206    } else if (state->heredoc_count >= MAX_HEREDOCS) {
207        free(del_copy);
208    } else {
209        state->heredocs[state->heredoc_count++] = del_copy;
210    }
211
212    lexer->result_symbol = HEREDOC_MARKER;
213    return true;
214}
215
216static bool scan_content(scanner_state *state, TSLexer *lexer,
217                         const bool *valid_symbols) {
218    if (state->heredoc_count == 0) {
219        state->in_heredoc = false;
220        return false;
221    }
222
223    state->in_heredoc = true;
224
225    if (state->stripping_heredoc) {
226        skip_whitespace(lexer);
227    }
228
229    if (valid_symbols[HEREDOC_END]) {
230        unsigned delim_idx = 1;
231        // Look for the current heredoc delimiter.
232        while (state->heredocs[0][delim_idx] != '\0' &&
233               lexer->lookahead != '\0' &&
234               lexer->lookahead == state->heredocs[0][delim_idx]) {
235            lexer->advance(lexer, false);
236            delim_idx++;
237        }
238
239        // Check if the entire string matched.
240        if (state->heredocs[0][delim_idx] == '\0') {
241            lexer->result_symbol = HEREDOC_END;
242
243            // Shift the first heredoc off the list.
244            free(state->heredocs[0]);
245
246            for (unsigned i = 1; i < state->heredoc_count; i++) {
247                state->heredocs[i - 1] = state->heredocs[i];
248            }
249            state->heredocs[state->heredoc_count - 1] = NULL;
250            state->heredoc_count--;
251
252            if (state->heredoc_count > 0) {
253                state->stripping_heredoc = state->heredocs[0][0] == '-';
254            } else {
255                state->in_heredoc = false;
256            }
257
258            return true;
259        }
260    }
261
262    if (!valid_symbols[HEREDOC_LINE])
263        return false;
264
265    lexer->result_symbol = HEREDOC_LINE;
266
267    for (;;) {
268        switch (lexer->lookahead) {
269        case '\0':
270            if (lexer->eof(lexer)) {
271                state->in_heredoc = false;
272                return true;
273            }
274            lexer->advance(lexer, false);
275            break;
276
277        case '\n':
278            return true;
279
280        default:
281            lexer->advance(lexer, false);
282        }
283    }
284}
285
286bool tree_sitter_dockerfile_external_scanner_scan(void *payload, TSLexer *lexer,
287                                                  const bool *valid_symbols) {
288    scanner_state *state = payload;
289
290    if (valid_symbols[ERROR_SENTINEL]) {
291        if (state->in_heredoc) {
292            return scan_content(state, lexer, valid_symbols);
293        } else {
294            return scan_marker(state, lexer);
295        }
296    }
297
298    // HEREDOC_NL only matches a linebreak if there are open heredocs. This is
299    // necessary to avoid a conflict in the grammar since a normal line break
300    // could either be the start of a heredoc or the end of an instruction.
301    if (valid_symbols[HEREDOC_NL]) {
302        if (state->heredoc_count > 0 && lexer->lookahead == '\n') {
303            lexer->result_symbol = HEREDOC_NL;
304            lexer->advance(lexer, false);
305            return true;
306        }
307    }
308
309    if (valid_symbols[HEREDOC_MARKER]) {
310        return scan_marker(state, lexer);
311    }
312
313    if (valid_symbols[HEREDOC_LINE] || valid_symbols[HEREDOC_END]) {
314        return scan_content(state, lexer, valid_symbols);
315    }
316
317    return false;
318}