1#include "common.h"
2#include "peg-parser.h"
3#include "json-schema-to-grammar.h"
4#include "unicode.h"
5
6#include <nlohmann/json.hpp>
7
8#include <algorithm>
9#include <initializer_list>
10#include <map>
11#include <memory>
12#include <regex>
13#include <stdexcept>
14#include <unordered_set>
15
16// Trick to catch missing branches
17template <typename T>
18inline constexpr bool is_always_false_v = false;
19
20const char * common_peg_parse_result_type_name(common_peg_parse_result_type type) {
21 switch (type) {
22 case COMMON_PEG_PARSE_RESULT_FAIL: return "fail";
23 case COMMON_PEG_PARSE_RESULT_SUCCESS: return "success";
24 case COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT: return "need_more_input";
25 default: return "unknown";
26 }
27}
28
29static bool is_hex_digit(const char c) {
30 return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F');
31}
32
33// Trie for matching multiple literals.
34// This is used in common_peg_until_parser and to build a GBNF exclusion grammar
35struct trie {
36 struct node {
37 size_t depth = 0;
38 std::map<unsigned char, size_t> children;
39 bool is_word;
40 };
41
42 std::vector<node> nodes;
43
44 trie(const std::vector<std::string> & words) {
45 create_node(); // root node
46 for (const auto & w : words) {
47 insert(w);
48 }
49 }
50
51 enum match_result { NO_MATCH, PARTIAL_MATCH, COMPLETE_MATCH };
52
53 // Check if a delimiter starts at the given position
54 match_result check_at(std::string_view sv, size_t start_pos) const {
55 size_t current = 0; // Start at root
56 size_t pos = start_pos;
57
58 while (pos < sv.size()) {
59 auto it = nodes[current].children.find(sv[pos]);
60 if (it == nodes[current].children.end()) {
61 // Can't continue matching
62 return match_result{match_result::NO_MATCH};
63 }
64
65 current = it->second;
66 pos++;
67
68 // Check if we've matched a complete word
69 if (nodes[current].is_word) {
70 return match_result{match_result::COMPLETE_MATCH};
71 }
72 }
73
74 // Reached end of input while still in the trie (not at root)
75 if (current != 0) {
76 // We're in the middle of a potential match
77 return match_result{match_result::PARTIAL_MATCH};
78 }
79
80 // Reached end at root (no match)
81 return match_result{match_result::NO_MATCH};
82 }
83
84 struct prefix_and_next {
85 std::string prefix;
86 std::string next_chars;
87 };
88
89 std::vector<prefix_and_next> collect_prefix_and_next() {
90 std::string prefix;
91 std::vector<prefix_and_next> result;
92 collect_prefix_and_next(0, prefix, result);
93 return result;
94 }
95
96 private:
97 void collect_prefix_and_next(size_t index, std::string & prefix, std::vector<prefix_and_next> & out) {
98 if (!nodes[index].is_word) {
99 if (!nodes[index].children.empty()) {
100 std::string chars;
101 chars.reserve(nodes[index].children.size());
102 for (const auto & p : nodes[index].children) {
103 chars.push_back(p.first);
104 }
105 out.emplace_back(prefix_and_next{prefix, chars});
106 }
107 }
108
109 for (const auto & p : nodes[index].children) {
110 unsigned char ch = p.first;
111 auto child = p.second;
112 prefix.push_back(ch);
113 collect_prefix_and_next(child, prefix, out);
114 prefix.pop_back();
115 }
116 }
117
118 size_t create_node() {
119 size_t index = nodes.size();
120 nodes.emplace_back();
121 return index;
122 }
123
124 void insert(const std::string & word) {
125 size_t current = 0;
126 for (unsigned char ch : word) {
127 auto it = nodes[current].children.find(ch);
128 if (it == nodes[current].children.end()) {
129 size_t child = create_node();
130 nodes[child].depth = nodes[current].depth + 1;
131 nodes[current].children[ch] = child;
132 current = child;
133 } else {
134 current = it->second;
135 }
136 }
137 nodes[current].is_word = true;
138 }
139};
140
141static std::pair<uint32_t, size_t> parse_hex_escape(const std::string & str, size_t pos, int hex_count) {
142 if (pos + hex_count > str.length()) {
143 return {0, 0};
144 }
145
146 uint32_t value = 0;
147 for (int i = 0; i < hex_count; i++) {
148 char c = str[pos + i];
149 if (!is_hex_digit(c)) {
150 return {0, 0};
151 }
152 value <<= 4;
153 if ('a' <= c && c <= 'f') {
154 value += c - 'a' + 10;
155 } else if ('A' <= c && c <= 'F') {
156 value += c - 'A' + 10;
157 } else if ('0' <= c && c <= '9') {
158 value += c - '0';
159 } else {
160 break;
161 }
162 }
163 return {value, static_cast<size_t>(hex_count)};
164}
165
166static std::pair<uint32_t, size_t> parse_char_class_char(const std::string & content, size_t pos) {
167 if (content[pos] == '\\' && pos + 1 < content.length()) {
168 switch (content[pos + 1]) {
169 case 'x': {
170 auto result = parse_hex_escape(content, pos + 2, 2);
171 if (result.second > 0) {
172 return {result.first, 2 + result.second};
173 }
174 // Invalid escape, treat as literal 'x'
175 return {static_cast<uint32_t>('x'), 2};
176 }
177 case 'u': {
178 auto result = parse_hex_escape(content, pos + 2, 4);
179 if (result.second > 0) {
180 return {result.first, 2 + result.second};
181 }
182 // Invalid escape, treat as literal 'u'
183 return {static_cast<uint32_t>('u'), 2};
184 }
185 case 'U': {
186 auto result = parse_hex_escape(content, pos + 2, 8);
187 if (result.second > 0) {
188 return {result.first, 2 + result.second};
189 }
190 // Invalid escape, treat as literal 'U'
191 return {static_cast<uint32_t>('U'), 2};
192 }
193 case 'n': return {'\n', 2};
194 case 't': return {'\t', 2};
195 case 'r': return {'\r', 2};
196 case '\\': return {'\\', 2};
197 case ']': return {']', 2};
198 case '[': return {'[', 2};
199 default: return {static_cast<uint32_t>(content[pos + 1]), 2};
200 }
201 }
202
203 // Regular character - return as codepoint
204 return {static_cast<uint32_t>(static_cast<unsigned char>(content[pos])), 1};
205}
206
207static std::pair<std::vector<common_peg_chars_parser::char_range>, bool> parse_char_classes(const std::string & classes) {
208 std::vector<common_peg_chars_parser::char_range> ranges;
209 bool negated = false;
210
211 std::string content = classes;
212 if (content.front() == '[') {
213 content = content.substr(1);
214 }
215
216 if (content.back() == ']') {
217 content.pop_back();
218 }
219
220 // Check for negation
221 if (!content.empty() && content.front() == '^') {
222 negated = true;
223 content = content.substr(1);
224 }
225
226 size_t i = 0;
227 while (i < content.length()) {
228 auto [start, start_len] = parse_char_class_char(content, i);
229 i += start_len;
230
231 if (i + 1 < content.length() && content[i] == '-') {
232 // Range detected
233 auto [end, end_len] = parse_char_class_char(content, i + 1);
234 ranges.push_back(common_peg_chars_parser::char_range{start, end});
235 i += 1 + end_len;
236 } else {
237 ranges.push_back(common_peg_chars_parser::char_range{start, start});
238 }
239 }
240
241 return {ranges, negated};
242}
243
244void common_peg_ast_arena::visit(common_peg_ast_id id, const common_peg_ast_visitor & visitor) const {
245 if (id == COMMON_PEG_INVALID_AST_ID) {
246 return;
247 }
248 const auto & node = get(id);
249 visitor(node);
250 for (const auto & child : node.children) {
251 visit(child, visitor);
252 }
253}
254
255void common_peg_ast_arena::visit(const common_peg_parse_result & result, const common_peg_ast_visitor & visitor) const {
256 for (const auto & node : result.nodes) {
257 visit(node, visitor);
258 }
259}
260
261struct parser_executor;
262
263common_peg_parser_id common_peg_arena::add_parser(common_peg_parser_variant parser) {
264 common_peg_parser_id id = parsers_.size();
265 parsers_.push_back(std::move(parser));
266 return id;
267}
268
269void common_peg_arena::add_rule(const std::string & name, common_peg_parser_id id) {
270 rules_[name] = id;
271}
272
273common_peg_parser_id common_peg_arena::get_rule(const std::string & name) const {
274 auto it = rules_.find(name);
275 if (it == rules_.end()) {
276 throw std::runtime_error("Rule not found: " + name);
277 }
278 return it->second;
279}
280
281struct parser_executor {
282 const common_peg_arena & arena;
283 common_peg_parse_context & ctx;
284 size_t start_pos;
285
286 parser_executor(const common_peg_arena & arena, common_peg_parse_context & ctx, size_t start)
287 : arena(arena), ctx(ctx), start_pos(start) {}
288
289 common_peg_parse_result operator()(const common_peg_epsilon_parser & /* p */) const {
290 return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos);
291 }
292
293 common_peg_parse_result operator()(const common_peg_start_parser & /* p */) const {
294 return common_peg_parse_result(
295 start_pos == 0 ? COMMON_PEG_PARSE_RESULT_SUCCESS : COMMON_PEG_PARSE_RESULT_FAIL,
296 start_pos
297 );
298 }
299
300 common_peg_parse_result operator()(const common_peg_end_parser & /* p */) const {
301 return common_peg_parse_result(
302 start_pos >= ctx.input.size() ? COMMON_PEG_PARSE_RESULT_SUCCESS : COMMON_PEG_PARSE_RESULT_FAIL,
303 start_pos
304 );
305 }
306
307 common_peg_parse_result operator()(const common_peg_literal_parser & p) {
308 auto pos = start_pos;
309 for (auto i = 0u; i < p.literal.size(); ++i) {
310 if (pos >= ctx.input.size()) {
311 if (!ctx.is_partial) {
312 return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos);
313 }
314 return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, pos);
315 }
316 if (ctx.input[pos] != p.literal[i]) {
317 return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos);
318 }
319 ++pos;
320 }
321
322 return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos, pos);
323 }
324
325 common_peg_parse_result operator()(const common_peg_sequence_parser & p) {
326 auto pos = start_pos;
327 std::vector<common_peg_ast_id> nodes;
328
329 for (const auto & child_id : p.children) {
330 auto result = arena.parse(child_id, ctx, pos);
331 if (result.fail()) {
332 return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos, result.end);
333 }
334
335 if (!result.nodes.empty()) {
336 nodes.insert(nodes.end(), result.nodes.begin(), result.nodes.end());
337 }
338
339 if (result.need_more_input()) {
340 return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, result.end, std::move(nodes));
341 }
342
343 pos = result.end;
344 }
345
346 return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos, pos, std::move(nodes));
347 }
348
349 common_peg_parse_result operator()(const common_peg_choice_parser & p) {
350 auto pos = start_pos;
351 for (const auto & child_id : p.children) {
352 auto result = arena.parse(child_id, ctx, pos);
353 if (!result.fail()) {
354 return result;
355 }
356 }
357
358 return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos);
359 }
360
361 common_peg_parse_result operator()(const common_peg_repetition_parser & p) {
362 auto pos = start_pos;
363 int match_count = 0;
364 std::vector<common_peg_ast_id> nodes;
365
366 // Try to match up to max_count times (or unlimited if max_count is -1)
367 while (p.max_count == -1 || match_count < p.max_count) {
368 if (pos >= ctx.input.size()) {
369 break;
370 }
371
372 auto result = arena.parse(p.child, ctx, pos);
373
374 if (result.success()) {
375 // Prevent infinite loop on empty matches
376 if (result.end == pos) {
377 break;
378 }
379
380 if (!result.nodes.empty()) {
381 nodes.insert(nodes.end(), result.nodes.begin(), result.nodes.end());
382 }
383
384 pos = result.end;
385 match_count++;
386 continue;
387 }
388
389 if (result.need_more_input()) {
390 if (!result.nodes.empty()) {
391 nodes.insert(nodes.end(), result.nodes.begin(), result.nodes.end());
392 }
393
394 return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, result.end, std::move(nodes));
395 }
396
397 // Child failed - stop trying
398 break;
399 }
400
401 // Check if we got enough matches
402 if (p.min_count > 0 && match_count < p.min_count) {
403 if (pos >= ctx.input.size() && ctx.is_partial) {
404 return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, pos, std::move(nodes));
405 }
406 return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos, pos);
407 }
408
409 return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos, pos, std::move(nodes));
410 }
411
412 common_peg_parse_result operator()(const common_peg_and_parser & p) {
413 auto result = arena.parse(p.child, ctx, start_pos);
414 // Pass result but don't consume input
415 return common_peg_parse_result(result.type, start_pos);
416 }
417
418 common_peg_parse_result operator()(const common_peg_not_parser & p) {
419 auto result = arena.parse(p.child, ctx, start_pos);
420
421 if (result.success()) {
422 // Fail if the underlying parser matches
423 return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos);
424 }
425
426 if (result.need_more_input()) {
427 // Propagate - need to know what child would match before negating
428 return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos);
429 }
430
431 // Child failed, so negation succeeds
432 return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos);
433 }
434
435 common_peg_parse_result operator()(const common_peg_any_parser & /* p */) const {
436 // Parse a single UTF-8 codepoint (not just a single byte)
437 auto result = parse_utf8_codepoint(ctx.input, start_pos);
438
439 if (result.status == utf8_parse_result::INCOMPLETE) {
440 if (!ctx.is_partial) {
441 return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos);
442 }
443 return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos);
444 }
445 if (result.status == utf8_parse_result::INVALID) {
446 return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos);
447 }
448 return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos, start_pos + result.bytes_consumed);
449 }
450
451 common_peg_parse_result operator()(const common_peg_space_parser & /* p */) {
452 auto pos = start_pos;
453 while (pos < ctx.input.size()) {
454 auto c = static_cast<unsigned char>(ctx.input[pos]);
455 if (std::isspace(c)) {
456 ++pos;
457 } else {
458 break;
459 }
460 }
461
462 return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos, pos);
463 }
464
465 common_peg_parse_result operator()(const common_peg_chars_parser & p) const {
466 auto pos = start_pos;
467 int match_count = 0;
468
469 // Try to match up to max_count times (or unlimited if max_count is -1)
470 while (p.max_count == -1 || match_count < p.max_count) {
471 auto result = parse_utf8_codepoint(ctx.input, pos);
472
473 if (result.status == utf8_parse_result::INCOMPLETE) {
474 if (match_count >= p.min_count) {
475 // We have enough matches, succeed with what we have
476 return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos, pos);
477 }
478 // Not enough matches yet
479 if (!ctx.is_partial) {
480 return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos);
481 }
482 return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, pos);
483 }
484
485 if (result.status == utf8_parse_result::INVALID) {
486 // Malformed UTF-8 in input
487 if (match_count >= p.min_count) {
488 // We have enough matches, succeed up to here
489 return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos, pos);
490 }
491 // Not enough matches, fail
492 return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos);
493 }
494
495 // Check if this codepoint matches our character class
496 bool matches = false;
497 for (const auto & range : p.ranges) {
498 if (range.contains(result.codepoint)) {
499 matches = true;
500 break;
501 }
502 }
503
504 // If negated, invert the match result
505 if (p.negated) {
506 matches = !matches;
507 }
508
509 if (matches) {
510 pos += result.bytes_consumed;
511 ++match_count;
512 } else {
513 // Character doesn't match, stop matching
514 break;
515 }
516 }
517
518 // Check if we got enough matches
519 if (match_count < p.min_count) {
520 if (pos >= ctx.input.size() && ctx.is_partial) {
521 return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, pos);
522 }
523 return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos, pos);
524 }
525
526 return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos, pos);
527 }
528
529 static common_peg_parse_result handle_escape_sequence(common_peg_parse_context & ctx, size_t start, size_t & pos) {
530 ++pos; // consume '\'
531 if (pos >= ctx.input.size()) {
532 if (!ctx.is_partial) {
533 return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start);
534 }
535 return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start, pos);
536 }
537
538 switch (ctx.input[pos]) {
539 case '"':
540 case '\\':
541 case '/':
542 case 'b':
543 case 'f':
544 case 'n':
545 case 'r':
546 case 't':
547 ++pos;
548 return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start, pos);
549 case 'u':
550 return handle_unicode_escape(ctx, start, pos);
551 default:
552 // Invalid escape sequence
553 return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start);
554 }
555 }
556
557 static common_peg_parse_result handle_unicode_escape(common_peg_parse_context & ctx, size_t start, size_t & pos) {
558 ++pos; // consume 'u'
559 for (int i = 0; i < 4; ++i) {
560 if (pos >= ctx.input.size()) {
561 if (!ctx.is_partial) {
562 return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start);
563 }
564 return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start, pos);
565 }
566 if (!is_hex_digit(ctx.input[pos])) {
567 return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start);
568 }
569 ++pos;
570 }
571 return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start, pos);
572 }
573
574 common_peg_parse_result operator()(const common_peg_json_string_parser & /* p */) {
575 auto pos = start_pos;
576
577 // Parse string content (without quotes)
578 while (pos < ctx.input.size()) {
579 char c = ctx.input[pos];
580
581 if (c == '"') {
582 // Found closing quote - success (don't consume it)
583 return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos, pos);
584 }
585
586 if (c == '\\') {
587 auto result = handle_escape_sequence(ctx, start_pos, pos);
588 if (!result.success()) {
589 return result;
590 }
591 } else {
592 auto utf8_result = parse_utf8_codepoint(ctx.input, pos);
593
594 if (utf8_result.status == utf8_parse_result::INCOMPLETE) {
595 if (!ctx.is_partial) {
596 return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos);
597 }
598 return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, pos);
599 }
600
601 if (utf8_result.status == utf8_parse_result::INVALID) {
602 return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos);
603 }
604
605 pos += utf8_result.bytes_consumed;
606 }
607 }
608
609 // Reached end without finding closing quote
610 if (!ctx.is_partial) {
611 return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos, pos);
612 }
613 return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, pos);
614 }
615
616 common_peg_parse_result operator()(const common_peg_until_parser & p) const {
617 trie matcher(p.delimiters);
618
619 // Scan input and check for delimiters
620 size_t pos = start_pos;
621 size_t last_valid_pos = start_pos;
622
623 while (pos < ctx.input.size()) {
624 auto utf8_result = parse_utf8_codepoint(ctx.input, pos);
625
626 if (utf8_result.status == utf8_parse_result::INCOMPLETE) {
627 // Incomplete UTF-8 sequence
628 if (!ctx.is_partial) {
629 // Input is complete but UTF-8 is incomplete = malformed
630 return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos);
631 }
632 // Return what we have so far (before incomplete sequence)
633 return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, last_valid_pos);
634 }
635
636 if (utf8_result.status == utf8_parse_result::INVALID) {
637 // Malformed UTF-8
638 return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos);
639 }
640
641 // Check if a delimiter starts at this position
642 auto match = matcher.check_at(ctx.input, pos);
643
644 if (match == trie::COMPLETE_MATCH) {
645 // Found a complete delimiter, return everything before it
646 return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos, pos);
647 }
648
649 if (match == trie::PARTIAL_MATCH) {
650 // Found a partial match extending to end of input, return everything before it
651 return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos, pos);
652 }
653
654 pos += utf8_result.bytes_consumed;
655 last_valid_pos = pos;
656 }
657
658 if (last_valid_pos == ctx.input.size() && ctx.is_partial) {
659 // Reached the end of a partial stream, there might still be more input that we need to consume.
660 return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, last_valid_pos);
661 }
662 return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos, last_valid_pos);
663 }
664
665 common_peg_parse_result operator()(const common_peg_schema_parser & p) {
666 return arena.parse(p.child, ctx, start_pos);
667 }
668
669 common_peg_parse_result operator()(const common_peg_rule_parser & p) {
670 // Parse the child
671 auto result = arena.parse(p.child, ctx, start_pos);
672
673 if (!result.fail()) {
674 std::string_view text;
675 if (result.start < ctx.input.size()) {
676 text = std::string_view(ctx.input).substr(result.start, result.end - result.start);
677 }
678
679 auto node_id = ctx.ast.add_node(
680 p.name,
681 "",
682 result.start,
683 result.end,
684 text,
685 std::move(result.nodes),
686 result.need_more_input()
687 );
688
689 return common_peg_parse_result(result.type, result.start, result.end, { node_id });
690 }
691
692 return result;
693 }
694
695 common_peg_parse_result operator()(const common_peg_tag_parser & p) {
696 // Parse the child
697 auto result = arena.parse(p.child, ctx, start_pos);
698
699 if (!result.fail()) {
700 std::string_view text;
701 if (result.start < ctx.input.size()) {
702 text = std::string_view(ctx.input).substr(result.start, result.end - result.start);
703 }
704
705 auto node_id = ctx.ast.add_node(
706 "",
707 p.tag,
708 result.start,
709 result.end,
710 text,
711 std::move(result.nodes),
712 result.need_more_input()
713 );
714
715 return common_peg_parse_result(result.type, result.start, result.end, { node_id });
716 }
717
718 return result;
719 }
720
721 common_peg_parse_result operator()(const common_peg_ref_parser & p) {
722 auto rule_id = arena.get_rule(p.name);
723 return arena.parse(rule_id, ctx, start_pos);
724 }
725
726 common_peg_parse_result operator()(const common_peg_atomic_parser & p) {
727 auto result = arena.parse(p.child, ctx, start_pos);
728 if (result.need_more_input()) {
729 // Clear nodes so they don't propagate up.
730 result.nodes.clear();
731 }
732 return result;
733 }
734};
735
736common_peg_parse_result common_peg_arena::parse(common_peg_parse_context & ctx, size_t start) const {
737 if (root_ == COMMON_PEG_INVALID_PARSER_ID) {
738 throw std::runtime_error("No root parser set");
739 }
740 return parse(root_, ctx, start);
741}
742
743common_peg_parse_result common_peg_arena::parse(common_peg_parser_id id, common_peg_parse_context & ctx, size_t start) const {
744 // Execute parser
745 const auto & parser = parsers_.at(id);
746 parser_executor exec(*this, ctx, start);
747 return std::visit(exec, parser);
748}
749
750common_peg_parser_id common_peg_arena::resolve_ref(common_peg_parser_id id) {
751 const auto & parser = parsers_.at(id);
752 if (auto ref = std::get_if<common_peg_ref_parser>(&parser)) {
753 return get_rule(ref->name);
754 }
755 return id;
756}
757
758void common_peg_arena::resolve_refs() {
759 // Walk through all parsers and replace refs with their corresponding rule IDs
760 for (auto & parser : parsers_) {
761 std::visit([this](auto & p) {
762 using T = std::decay_t<decltype(p)>;
763
764 if constexpr (std::is_same_v<T, common_peg_sequence_parser>) {
765 for (auto & child : p.children) {
766 child = resolve_ref(child);
767 }
768 } else if constexpr (std::is_same_v<T, common_peg_choice_parser>) {
769 for (auto & child : p.children) {
770 child = resolve_ref(child);
771 }
772 } else if constexpr (std::is_same_v<T, common_peg_repetition_parser> ||
773 std::is_same_v<T, common_peg_and_parser> ||
774 std::is_same_v<T, common_peg_not_parser> ||
775 std::is_same_v<T, common_peg_tag_parser> ||
776 std::is_same_v<T, common_peg_atomic_parser>) {
777 p.child = resolve_ref(p.child);
778 } else if constexpr (std::is_same_v<T, common_peg_rule_parser>) {
779 p.child = resolve_ref(p.child);
780 } else if constexpr (std::is_same_v<T, common_peg_schema_parser>) {
781 p.child = resolve_ref(p.child);
782 } else if constexpr (std::is_same_v<T, common_peg_epsilon_parser> ||
783 std::is_same_v<T, common_peg_start_parser> ||
784 std::is_same_v<T, common_peg_end_parser> ||
785 std::is_same_v<T, common_peg_ref_parser> ||
786 std::is_same_v<T, common_peg_until_parser> ||
787 std::is_same_v<T, common_peg_literal_parser> ||
788 std::is_same_v<T, common_peg_json_string_parser> ||
789 std::is_same_v<T, common_peg_chars_parser> ||
790 std::is_same_v<T, common_peg_any_parser> ||
791 std::is_same_v<T, common_peg_space_parser>) {
792 // These rules do not have children
793 } else {
794 static_assert(is_always_false_v<T>);
795 }
796 }, parser);
797 }
798
799 // Also flatten root if it's a ref
800 if (root_ != COMMON_PEG_INVALID_PARSER_ID) {
801 root_ = resolve_ref(root_);
802 }
803}
804
805std::string common_peg_arena::dump(common_peg_parser_id id) const {
806 const auto & parser = parsers_.at(id);
807
808 return std::visit([this](const auto & p) -> std::string {
809 using T = std::decay_t<decltype(p)>;
810
811 if constexpr (std::is_same_v<T, common_peg_epsilon_parser>) {
812 return "Epsilon";
813 } else if constexpr (std::is_same_v<T, common_peg_start_parser>) {
814 return "Start";
815 } else if constexpr (std::is_same_v<T, common_peg_end_parser>) {
816 return "End";
817 } else if constexpr (std::is_same_v<T, common_peg_literal_parser>) {
818 return "Literal(" + p.literal + ")";
819 } else if constexpr (std::is_same_v<T, common_peg_sequence_parser>) {
820 std::vector<std::string> parts;
821 for (const auto & child : p.children) {
822 parts.push_back(dump(child));
823 }
824 return "Sequence(" + string_join(parts, ", ") + ")";
825 } else if constexpr (std::is_same_v<T, common_peg_choice_parser>) {
826 std::vector<std::string> parts;
827 for (const auto & child : p.children) {
828 parts.push_back(dump(child));
829 }
830 return "Choice(" + string_join(parts, ", ") + ")";
831 } else if constexpr (std::is_same_v<T, common_peg_repetition_parser>) {
832 if (p.max_count == -1) {
833 return "Repetition(" + dump(p.child) + ", " + std::to_string(p.min_count) + ", unbounded)";
834 }
835 return "Repetition(" + dump(p.child) + ", " + std::to_string(p.min_count) + ", " + std::to_string(p.max_count) + ")";
836 } else if constexpr (std::is_same_v<T, common_peg_and_parser>) {
837 return "And(" + dump(p.child) + ")";
838 } else if constexpr (std::is_same_v<T, common_peg_not_parser>) {
839 return "Not(" + dump(p.child) + ")";
840 } else if constexpr (std::is_same_v<T, common_peg_any_parser>) {
841 return "Any";
842 } else if constexpr (std::is_same_v<T, common_peg_space_parser>) {
843 return "Space";
844 } else if constexpr (std::is_same_v<T, common_peg_chars_parser>) {
845 if (p.max_count == -1) {
846 return "CharRepeat(" + p.pattern + ", " + std::to_string(p.min_count) + ", unbounded)";
847 }
848 return "CharRepeat(" + p.pattern + ", " + std::to_string(p.min_count) + ", " + std::to_string(p.max_count) + ")";
849 } else if constexpr (std::is_same_v<T, common_peg_json_string_parser>) {
850 return "JsonString()";
851 } else if constexpr (std::is_same_v<T, common_peg_until_parser>) {
852 return "Until(" + string_join(p.delimiters, " | ") + ")";
853 } else if constexpr (std::is_same_v<T, common_peg_schema_parser>) {
854 return "Schema(" + dump(p.child) + ", " + (p.schema ? p.schema->dump() : "null") + ")";
855 } else if constexpr (std::is_same_v<T, common_peg_rule_parser>) {
856 return "Rule(" + p.name + ", " + dump(p.child) + ")";
857 } else if constexpr (std::is_same_v<T, common_peg_ref_parser>) {
858 return "Ref(" + p.name + ")";
859 } else {
860 return "Unknown";
861 }
862 }, parser);
863}
864
865common_peg_parser & common_peg_parser::operator=(const common_peg_parser & other) {
866 id_ = other.id_;
867 return *this;
868}
869
870common_peg_parser & common_peg_parser::operator+=(const common_peg_parser & other) {
871 id_ = builder_.sequence({id_, other.id_});
872 return *this;
873}
874
875common_peg_parser & common_peg_parser::operator|=(const common_peg_parser & other) {
876 id_ = builder_.choice({id_, other.id_});
877 return *this;
878}
879
880common_peg_parser common_peg_parser::operator+(const common_peg_parser & other) const {
881 return builder_.sequence({id_, other.id_});
882}
883
884common_peg_parser common_peg_parser::operator|(const common_peg_parser & other) const {
885 return builder_.choice({id_, other.id_});
886}
887
888common_peg_parser common_peg_parser::operator<<(const common_peg_parser & other) const {
889 return builder_.sequence({id_, builder_.space(), other.id_});
890}
891
892common_peg_parser common_peg_parser::operator+(const char * str) const {
893 return *this + builder_.literal(str);
894}
895
896common_peg_parser common_peg_parser::operator+(const std::string & str) const {
897 return *this + builder_.literal(str);
898}
899
900common_peg_parser common_peg_parser::operator<<(const char * str) const {
901 return *this << builder_.literal(str);
902}
903
904common_peg_parser common_peg_parser::operator<<(const std::string & str) const {
905 return *this << builder_.literal(str);
906}
907
908common_peg_parser common_peg_parser::operator|(const char * str) const {
909 return *this | builder_.literal(str);
910}
911
912common_peg_parser common_peg_parser::operator|(const std::string & str) const {
913 return *this | builder_.literal(str);
914}
915
916common_peg_parser operator+(const char * str, const common_peg_parser & p) {
917 return p.builder().literal(str) + p;
918}
919
920common_peg_parser operator+(const std::string & str, const common_peg_parser & p) {
921 return operator+(str.c_str(), p);
922}
923
924common_peg_parser operator<<(const char * str, const common_peg_parser & p) {
925 return p.builder().literal(str) << p;
926}
927
928common_peg_parser operator<<(const std::string & str, const common_peg_parser & p) {
929 return operator<<(str.c_str(), p);
930}
931
932common_peg_parser operator|(const char * str, const common_peg_parser & p) {
933 return p.builder().literal(str) | p;
934}
935
936common_peg_parser operator|(const std::string & str, const common_peg_parser & p) {
937 return operator|(str.c_str(), p);
938}
939
940static std::string rule_name(const std::string & name) {
941 static const std::regex invalid_rule_chars_re("[^a-zA-Z0-9-]+");
942 return std::regex_replace(name, invalid_rule_chars_re, "-");
943}
944
945common_peg_parser_builder::common_peg_parser_builder() {}
946
947common_peg_parser common_peg_parser_builder::sequence(const std::vector<common_peg_parser_id> & parsers) {
948 // Flatten nested sequences
949 std::vector<common_peg_parser_id> flattened;
950 for (const auto & p : parsers) {
951 const auto & parser = arena_.get(p);
952 if (auto seq = std::get_if<common_peg_sequence_parser>(&parser)) {
953 flattened.insert(flattened.end(), seq->children.begin(), seq->children.end());
954 } else {
955 flattened.push_back(p);
956 }
957 }
958 return wrap(arena_.add_parser(common_peg_sequence_parser{flattened}));
959}
960
961common_peg_parser common_peg_parser_builder::sequence(const std::vector<common_peg_parser> & parsers) {
962 std::vector<common_peg_parser_id> ids;
963 ids.reserve(parsers.size());
964 for (const auto & p : parsers) {
965 ids.push_back(p.id());
966 }
967 return sequence(ids);
968}
969
970common_peg_parser common_peg_parser_builder::sequence(std::initializer_list<common_peg_parser> parsers) {
971 std::vector<common_peg_parser_id> ids;
972 ids.reserve(parsers.size());
973 for (const auto & p : parsers) {
974 ids.push_back(p.id());
975 }
976 return sequence(ids);
977}
978
979common_peg_parser common_peg_parser_builder::choice(const std::vector<common_peg_parser_id> & parsers) {
980 // Flatten nested choices
981 std::vector<common_peg_parser_id> flattened;
982 for (const auto & p : parsers) {
983 const auto & parser = arena_.get(p);
984 if (auto choice = std::get_if<common_peg_choice_parser>(&parser)) {
985 flattened.insert(flattened.end(), choice->children.begin(), choice->children.end());
986 } else {
987 flattened.push_back(p);
988 }
989 }
990 return wrap(arena_.add_parser(common_peg_choice_parser{flattened}));
991}
992
993common_peg_parser common_peg_parser_builder::choice(const std::vector<common_peg_parser> & parsers) {
994 std::vector<common_peg_parser_id> ids;
995 ids.reserve(parsers.size());
996 for (const auto & p : parsers) {
997 ids.push_back(p.id());
998 }
999 return choice(ids);
1000}
1001
1002common_peg_parser common_peg_parser_builder::choice(std::initializer_list<common_peg_parser> parsers) {
1003 std::vector<common_peg_parser_id> ids;
1004 ids.reserve(parsers.size());
1005 for (const auto & p : parsers) {
1006 ids.push_back(p.id());
1007 }
1008 return choice(ids);
1009}
1010
1011common_peg_parser common_peg_parser_builder::chars(const std::string & classes, int min, int max) {
1012 auto [ranges, negated] = parse_char_classes(classes);
1013 return wrap(arena_.add_parser(common_peg_chars_parser{classes, ranges, negated, min, max}));
1014}
1015
1016common_peg_parser common_peg_parser_builder::schema(const common_peg_parser & p, const std::string & name, const nlohmann::ordered_json & schema, bool raw) {
1017 return wrap(arena_.add_parser(common_peg_schema_parser{p.id(), name, std::make_shared<nlohmann::ordered_json>(schema), raw}));
1018}
1019
1020common_peg_parser common_peg_parser_builder::rule(const std::string & name, const common_peg_parser & p, bool trigger) {
1021 auto clean_name = rule_name(name);
1022 auto rule_id = arena_.add_parser(common_peg_rule_parser{clean_name, p.id(), trigger});
1023 arena_.add_rule(clean_name, rule_id);
1024 return ref(clean_name);
1025}
1026
1027common_peg_parser common_peg_parser_builder::rule(const std::string & name, const std::function<common_peg_parser()> & builder_fn, bool trigger) {
1028 auto clean_name = rule_name(name);
1029 if (arena_.has_rule(clean_name)) {
1030 return ref(clean_name);
1031 }
1032
1033 // Create placeholder rule to allow recursive references
1034 auto placeholder = any(); // Temporary placeholder
1035 auto placeholder_rule_id = arena_.add_parser(common_peg_rule_parser{clean_name, placeholder.id(), trigger});
1036 arena_.add_rule(clean_name, placeholder_rule_id);
1037
1038 // Build the actual parser
1039 auto parser = builder_fn();
1040
1041 // Replace placeholder with actual rule
1042 auto rule_id = arena_.add_parser(common_peg_rule_parser{clean_name, parser.id(), trigger});
1043 arena_.rules_[clean_name] = rule_id;
1044
1045 return ref(clean_name);
1046}
1047
1048void common_peg_parser_builder::set_root(const common_peg_parser & p) {
1049 arena_.set_root(p.id());
1050}
1051
1052common_peg_arena common_peg_parser_builder::build() {
1053 arena_.resolve_refs();
1054 return std::move(arena_);
1055}
1056
1057// JSON parsers
1058common_peg_parser common_peg_parser_builder::json_number() {
1059 return rule("json-number", [this]() {
1060 auto digit1_9 = chars("[1-9]", 1, 1);
1061 auto digits = chars("[0-9]");
1062 auto int_part = choice({literal("0"), sequence({digit1_9, chars("[0-9]", 0, -1)})});
1063 auto frac = sequence({literal("."), digits});
1064 auto exp = sequence({choice({literal("e"), literal("E")}), optional(chars("[+-]", 1, 1)), digits});
1065 return sequence({optional(literal("-")), int_part, optional(frac), optional(exp), space()});
1066 });
1067}
1068
1069common_peg_parser common_peg_parser_builder::json_string() {
1070 return rule("json-string", [this]() {
1071 return sequence({literal("\""), json_string_content(), literal("\""), space()});
1072 });
1073}
1074
1075common_peg_parser common_peg_parser_builder::json_bool() {
1076 return rule("json-bool", [this]() {
1077 return sequence({choice({literal("true"), literal("false")}), space()});
1078 });
1079}
1080
1081common_peg_parser common_peg_parser_builder::json_null() {
1082 return rule("json-null", [this]() {
1083 return sequence({literal("null"), space()});
1084 });
1085}
1086
1087common_peg_parser common_peg_parser_builder::json_object() {
1088 return rule("json-object", [this]() {
1089 auto ws = space();
1090 auto member = sequence({json_string(), ws, literal(":"), ws, json()});
1091 auto members = sequence({member, zero_or_more(sequence({ws, literal(","), ws, member}))});
1092 return sequence({
1093 literal("{"),
1094 ws,
1095 choice({
1096 literal("}"),
1097 sequence({members, ws, literal("}")})
1098 }),
1099 ws
1100 });
1101 });
1102}
1103
1104common_peg_parser common_peg_parser_builder::json_array() {
1105 return rule("json-array", [this]() {
1106 auto ws = space();
1107 auto elements = sequence({json(), zero_or_more(sequence({literal(","), ws, json()}))});
1108 return sequence({
1109 literal("["),
1110 ws,
1111 choice({
1112 literal("]"),
1113 sequence({elements, ws, literal("]")})
1114 }),
1115 ws
1116 });
1117 });
1118}
1119
1120common_peg_parser common_peg_parser_builder::json() {
1121 return rule("json-value", [this]() {
1122 return choice({
1123 json_object(),
1124 json_array(),
1125 json_string(),
1126 json_number(),
1127 json_bool(),
1128 json_null()
1129 });
1130 });
1131}
1132
1133common_peg_parser common_peg_parser_builder::json_string_content() {
1134 return wrap(arena_.add_parser(common_peg_json_string_parser{}));
1135}
1136
1137common_peg_parser common_peg_parser_builder::json_member(const std::string & key, const common_peg_parser & p) {
1138 auto ws = space();
1139 return sequence({
1140 literal("\"" + key + "\""),
1141 ws,
1142 literal(":"),
1143 ws,
1144 p,
1145 });
1146}
1147
1148
1149static std::string gbnf_escape_char_class(char c) {
1150 switch (c) {
1151 case '\n': return "\\n";
1152 case '\t': return "\\t";
1153 case '\r': return "\\r";
1154 case '\\': return "\\\\";
1155 case ']': return "\\]";
1156 case '[': return "\\[";
1157 default: return std::string(1, c);
1158 }
1159}
1160
1161static std::string gbnf_excluding_pattern(const std::vector<std::string> & strings) {
1162 trie matcher(strings);
1163 auto pieces = matcher.collect_prefix_and_next();
1164
1165 std::string pattern;
1166 for (size_t i = 0; i < pieces.size(); ++i) {
1167 if (i > 0) {
1168 pattern += " | ";
1169 }
1170
1171 const auto & pre = pieces[i].prefix;
1172 const auto & chars = pieces[i].next_chars;
1173
1174 std::string cls;
1175 cls.reserve(chars.size());
1176 for (const auto & ch : chars) {
1177 cls += gbnf_escape_char_class(ch);
1178 }
1179
1180 if (!pre.empty()) {
1181 pattern += gbnf_format_literal(pre) + " [^" + cls + "]";
1182 } else {
1183 pattern += "[^" + cls + "]";
1184 }
1185 }
1186
1187 return "(" + pattern + ")*";
1188}
1189
1190static std::unordered_set<std::string> collect_reachable_rules(
1191 const common_peg_arena & arena,
1192 const common_peg_parser_id & rule
1193) {
1194 std::unordered_set<std::string> reachable;
1195 std::unordered_set<std::string> visited;
1196
1197 std::function<void(common_peg_parser_id)> visit = [&](common_peg_parser_id id) {
1198 const auto & parser = arena.get(id);
1199
1200 std::visit([&](const auto & p) {
1201 using T = std::decay_t<decltype(p)>;
1202
1203 if constexpr (std::is_same_v<T, common_peg_epsilon_parser> ||
1204 std::is_same_v<T, common_peg_start_parser> ||
1205 std::is_same_v<T, common_peg_end_parser> ||
1206 std::is_same_v<T, common_peg_until_parser> ||
1207 std::is_same_v<T, common_peg_literal_parser> ||
1208 std::is_same_v<T, common_peg_chars_parser> ||
1209 std::is_same_v<T, common_peg_space_parser> ||
1210 std::is_same_v<T, common_peg_any_parser> ||
1211 std::is_same_v<T, common_peg_json_string_parser>) {
1212 // These parsers do not have any children
1213 } else if constexpr (std::is_same_v<T, common_peg_sequence_parser>) {
1214 for (auto child : p.children) {
1215 visit(child);
1216 }
1217 } else if constexpr (std::is_same_v<T, common_peg_choice_parser>) {
1218 for (auto child : p.children) {
1219 visit(child);
1220 }
1221 } else if constexpr (std::is_same_v<T, common_peg_repetition_parser> ||
1222 std::is_same_v<T, common_peg_and_parser> ||
1223 std::is_same_v<T, common_peg_not_parser> ||
1224 std::is_same_v<T, common_peg_tag_parser> ||
1225 std::is_same_v<T, common_peg_atomic_parser> ||
1226 std::is_same_v<T, common_peg_schema_parser>) {
1227 visit(p.child);
1228 } else if constexpr (std::is_same_v<T, common_peg_rule_parser>) {
1229 if (visited.find(p.name) == visited.end()) {
1230 visited.insert(p.name);
1231 reachable.insert(p.name);
1232 visit(p.child);
1233 }
1234 } else if constexpr (std::is_same_v<T, common_peg_ref_parser>) {
1235 // Traverse rules so we pick up everything
1236 auto referenced_rule = arena.get_rule(p.name);
1237 visit(referenced_rule);
1238 } else {
1239 static_assert(is_always_false_v<T>);
1240 }
1241 }, parser);
1242 };
1243
1244 visit(rule);
1245 return reachable;
1246}
1247
1248// GBNF generation implementation
1249void common_peg_arena::build_grammar(const common_grammar_builder & builder, bool lazy) const {
1250 // Generate GBNF for a parser
1251 std::function<std::string(common_peg_parser_id)> to_gbnf = [&](common_peg_parser_id id) -> std::string {
1252 const auto & parser = parsers_.at(id);
1253
1254 return std::visit([&](const auto & p) -> std::string {
1255 using T = std::decay_t<decltype(p)>;
1256
1257 if constexpr (std::is_same_v<T, common_peg_epsilon_parser> ||
1258 std::is_same_v<T, common_peg_start_parser> ||
1259 std::is_same_v<T, common_peg_end_parser>) {
1260 return "";
1261 } else if constexpr (std::is_same_v<T, common_peg_literal_parser>) {
1262 return gbnf_format_literal(p.literal);
1263 } else if constexpr (std::is_same_v<T, common_peg_sequence_parser>) {
1264 std::string s;
1265 for (const auto & child : p.children) {
1266 if (!s.empty()) {
1267 s += " ";
1268 }
1269 auto child_gbnf = to_gbnf(child);
1270 const auto & child_parser = parsers_.at(child);
1271 if (std::holds_alternative<common_peg_choice_parser>(child_parser) ||
1272 std::holds_alternative<common_peg_sequence_parser>(child_parser)) {
1273 s += "(" + child_gbnf + ")";
1274 } else {
1275 s += child_gbnf;
1276 }
1277 }
1278 return s;
1279 } else if constexpr (std::is_same_v<T, common_peg_choice_parser>) {
1280 std::string s;
1281 for (const auto & child : p.children) {
1282 if (!s.empty()) {
1283 s += " | ";
1284 }
1285 auto child_gbnf = to_gbnf(child);
1286 const auto & child_parser = parsers_.at(child);
1287 if (std::holds_alternative<common_peg_choice_parser>(child_parser)) {
1288 s += "(" + child_gbnf + ")";
1289 } else {
1290 s += child_gbnf;
1291 }
1292 }
1293 return s;
1294 } else if constexpr (std::is_same_v<T, common_peg_repetition_parser>) {
1295 auto child_gbnf = to_gbnf(p.child);
1296 const auto & child_parser = parsers_.at(p.child);
1297 if (std::holds_alternative<common_peg_choice_parser>(child_parser) ||
1298 std::holds_alternative<common_peg_sequence_parser>(child_parser)) {
1299 child_gbnf = "(" + child_gbnf + ")";
1300 }
1301 if (p.min_count == 0 && p.max_count == 1) {
1302 return child_gbnf + "?";
1303 }
1304 if (p.min_count == 0 && p.max_count == -1) {
1305 return child_gbnf + "*";
1306 }
1307 if (p.min_count == 1 && p.max_count == -1) {
1308 return child_gbnf + "+";
1309 }
1310 if (p.max_count == -1) {
1311 return child_gbnf + "{" + std::to_string(p.min_count) + ",}";
1312 }
1313 if (p.min_count == p.max_count) {
1314 if (p.min_count == 1) {
1315 return child_gbnf;
1316 }
1317 return child_gbnf + "{" + std::to_string(p.min_count) + "}";
1318 }
1319 return child_gbnf + "{" + std::to_string(p.min_count) + "," + std::to_string(p.max_count) + "}";
1320 } else if constexpr (std::is_same_v<T, common_peg_and_parser> || std::is_same_v<T, common_peg_not_parser>) {
1321 return ""; // Lookahead not supported in GBNF
1322 } else if constexpr (std::is_same_v<T, common_peg_any_parser>) {
1323 return ".";
1324 } else if constexpr (std::is_same_v<T, common_peg_space_parser>) {
1325 return "space";
1326 } else if constexpr (std::is_same_v<T, common_peg_chars_parser>) {
1327 std::string result = p.pattern;
1328 if (p.min_count == 0 && p.max_count == 1) {
1329 return result + "?";
1330 }
1331 if (p.min_count == 0 && p.max_count == -1) {
1332 return result + "*";
1333 }
1334 if (p.min_count == 1 && p.max_count == -1) {
1335 return result + "+";
1336 }
1337 if (p.max_count == -1) {
1338 return result + "{" + std::to_string(p.min_count) + ",}";
1339 }
1340 if (p.min_count == p.max_count) {
1341 if (p.min_count == 1) {
1342 return result;
1343 }
1344 return result + "{" + std::to_string(p.min_count) + "}";
1345 }
1346 return result + "{" + std::to_string(p.min_count) + "," + std::to_string(p.max_count) + "}";
1347 } else if constexpr (std::is_same_v<T, common_peg_json_string_parser>) {
1348 return R"(( [^"\\] | "\\" ( ["\\/ bfnrt] | "u" [0-9a-fA-F]{4} ) )*)";
1349 } else if constexpr (std::is_same_v<T, common_peg_until_parser>) {
1350 if (p.delimiters.empty()) {
1351 return ".*";
1352 }
1353 return gbnf_excluding_pattern(p.delimiters);
1354 } else if constexpr (std::is_same_v<T, common_peg_schema_parser>) {
1355 if (p.schema) {
1356 if (p.raw && p.schema->contains("type") && p.schema->at("type").is_string() && p.schema->at("type") == "string") {
1357 // TODO: Implement more comprehensive grammar generation for raw strings.
1358 // For now, use the grammar emitted from the underlying parser.
1359 return to_gbnf(p.child);
1360 }
1361 return builder.add_schema(p.name, *p.schema);
1362 }
1363 return to_gbnf(p.child);
1364 } else if constexpr (std::is_same_v<T, common_peg_rule_parser>) {
1365 return p.name;
1366 } else if constexpr (std::is_same_v<T, common_peg_ref_parser>) {
1367 // Refs should not exist after flattening, but kept just in case
1368 return p.name;
1369 } else if constexpr (std::is_same_v<T, common_peg_tag_parser>) {
1370 return to_gbnf(p.child);
1371 } else if constexpr (std::is_same_v<T, common_peg_atomic_parser>) {
1372 return to_gbnf(p.child);
1373 } else {
1374 static_assert(is_always_false_v<T>);
1375 }
1376 }, parser);
1377 };
1378
1379 // Collect reachable rules
1380 std::unordered_set<std::string> reachable_rules;
1381
1382 if (lazy) {
1383 // Collect rules reachable from trigger rules
1384 for (const auto & [name, id] : rules_) {
1385 const auto & parser = parsers_.at(id);
1386 if (auto rule = std::get_if<common_peg_rule_parser>(&parser)) {
1387 if (rule->trigger) {
1388 // Mark trigger as reachable and visit it
1389 reachable_rules.insert(name);
1390 auto add_rules = collect_reachable_rules(*this, id);
1391 reachable_rules.insert(add_rules.begin(), add_rules.end());
1392 }
1393 }
1394 }
1395 } else {
1396 // Collect rules reachable from root
1397 reachable_rules = collect_reachable_rules(*this, root_);
1398 }
1399
1400 // Create GBNF rules for all reachable rules
1401 for (const auto & [name, rule_id] : rules_) {
1402 if (reachable_rules.find(name) == reachable_rules.end()) {
1403 continue;
1404 }
1405
1406 const auto & parser = parsers_.at(rule_id);
1407 if (auto rule = std::get_if<common_peg_rule_parser>(&parser)) {
1408 builder.add_rule(rule->name, to_gbnf(rule->child));
1409 }
1410 }
1411
1412 if (lazy) {
1413 // Generate root rule from trigger rules only
1414 std::vector<std::string> trigger_names;
1415 for (const auto & [name, rule_id] : rules_) {
1416 const auto & parser = parsers_.at(rule_id);
1417 if (auto rule = std::get_if<common_peg_rule_parser>(&parser)) {
1418 if (rule->trigger) {
1419 trigger_names.push_back(rule->name);
1420 }
1421 }
1422 }
1423
1424 // Sort for predictable order
1425 std::sort(trigger_names.begin(), trigger_names.end());
1426 builder.add_rule("root", string_join(trigger_names, " | "));
1427 } else if (root_ != COMMON_PEG_INVALID_PARSER_ID) {
1428 builder.add_rule("root", to_gbnf(root_));
1429 }
1430}
1431
1432static nlohmann::json serialize_parser_variant(const common_peg_parser_variant & variant) {
1433 using json = nlohmann::json;
1434
1435 return std::visit([](const auto & p) -> json {
1436 using T = std::decay_t<decltype(p)>;
1437
1438 if constexpr (std::is_same_v<T, common_peg_epsilon_parser>) {
1439 return json{{"type", "epsilon"}};
1440 } else if constexpr (std::is_same_v<T, common_peg_start_parser>) {
1441 return json{{"type", "start"}};
1442 } else if constexpr (std::is_same_v<T, common_peg_end_parser>) {
1443 return json{{"type", "end"}};
1444 } else if constexpr (std::is_same_v<T, common_peg_literal_parser>) {
1445 return json{{"type", "literal"}, {"literal", p.literal}};
1446 } else if constexpr (std::is_same_v<T, common_peg_sequence_parser>) {
1447 return json{{"type", "sequence"}, {"children", p.children}};
1448 } else if constexpr (std::is_same_v<T, common_peg_choice_parser>) {
1449 return json{{"type", "choice"}, {"children", p.children}};
1450 } else if constexpr (std::is_same_v<T, common_peg_repetition_parser>) {
1451 return json{
1452 {"type", "repetition"},
1453 {"child", p.child},
1454 {"min_count", p.min_count},
1455 {"max_count", p.max_count}
1456 };
1457 } else if constexpr (std::is_same_v<T, common_peg_and_parser>) {
1458 return json{{"type", "and"}, {"child", p.child}};
1459 } else if constexpr (std::is_same_v<T, common_peg_not_parser>) {
1460 return json{{"type", "not"}, {"child", p.child}};
1461 } else if constexpr (std::is_same_v<T, common_peg_any_parser>) {
1462 return json{{"type", "any"}};
1463 } else if constexpr (std::is_same_v<T, common_peg_space_parser>) {
1464 return json{{"type", "space"}};
1465 } else if constexpr (std::is_same_v<T, common_peg_chars_parser>) {
1466 json ranges = json::array();
1467 for (const auto & range : p.ranges) {
1468 ranges.push_back({{"start", range.start}, {"end", range.end}});
1469 }
1470 return json{
1471 {"type", "chars"},
1472 {"pattern", p.pattern},
1473 {"ranges", ranges},
1474 {"negated", p.negated},
1475 {"min_count", p.min_count},
1476 {"max_count", p.max_count}
1477 };
1478 } else if constexpr (std::is_same_v<T, common_peg_json_string_parser>) {
1479 return json{{"type", "json_string"}};
1480 } else if constexpr (std::is_same_v<T, common_peg_until_parser>) {
1481 return json{{"type", "until"}, {"delimiters", p.delimiters}};
1482 } else if constexpr (std::is_same_v<T, common_peg_schema_parser>) {
1483 return json{
1484 {"type", "schema"},
1485 {"child", p.child},
1486 {"name", p.name},
1487 {"schema", p.schema ? *p.schema : nullptr},
1488 {"raw", p.raw}
1489 };
1490 } else if constexpr (std::is_same_v<T, common_peg_rule_parser>) {
1491 return json{
1492 {"type", "rule"},
1493 {"name", p.name},
1494 {"child", p.child},
1495 {"trigger", p.trigger}
1496 };
1497 } else if constexpr (std::is_same_v<T, common_peg_ref_parser>) {
1498 return json{{"type", "ref"}, {"name", p.name}};
1499 } else if constexpr (std::is_same_v<T, common_peg_atomic_parser>) {
1500 return json{{"type", "atomic"}, {"child", p.child}};
1501 } else if constexpr (std::is_same_v<T, common_peg_tag_parser>) {
1502 return json{
1503 {"type", "tag"},
1504 {"child", p.child},
1505 {"tag", p.tag}
1506 };
1507 }
1508 }, variant);
1509}
1510
1511nlohmann::json common_peg_arena::to_json() const {
1512 auto parsers = nlohmann::json::array();
1513 for (const auto & parser : parsers_) {
1514 parsers.push_back(serialize_parser_variant(parser));
1515 }
1516 return nlohmann::json{
1517 {"parsers", parsers},
1518 {"rules", rules_},
1519 {"root", root_}
1520 };
1521}
1522
1523static common_peg_parser_variant deserialize_parser_variant(const nlohmann::json & j) {
1524 if (!j.contains("type") || !j["type"].is_string()) {
1525 throw std::runtime_error("Parser variant JSON missing or invalid 'type' field");
1526 }
1527
1528 std::string type = j["type"];
1529
1530 if (type == "epsilon") {
1531 return common_peg_epsilon_parser{};
1532 }
1533 if (type == "start") {
1534 return common_peg_start_parser{};
1535 }
1536 if (type == "end") {
1537 return common_peg_end_parser{};
1538 }
1539 if (type == "literal") {
1540 if (!j.contains("literal") || !j["literal"].is_string()) {
1541 throw std::runtime_error("literal parser missing or invalid 'literal' field");
1542 }
1543 return common_peg_literal_parser{j["literal"]};
1544 }
1545 if (type == "sequence") {
1546 if (!j.contains("children") || !j["children"].is_array()) {
1547 throw std::runtime_error("sequence parser missing or invalid 'children' field");
1548 }
1549 return common_peg_sequence_parser{j["children"].get<std::vector<common_peg_parser_id>>()};
1550 }
1551 if (type == "choice") {
1552 if (!j.contains("children") || !j["children"].is_array()) {
1553 throw std::runtime_error("choice parser missing or invalid 'children' field");
1554 }
1555 return common_peg_choice_parser{j["children"].get<std::vector<common_peg_parser_id>>()};
1556 }
1557 if (type == "repetition") {
1558 if (!j.contains("child") || !j.contains("min_count") || !j.contains("max_count")) {
1559 throw std::runtime_error("repetition parser missing required fields");
1560 }
1561 return common_peg_repetition_parser{
1562 j["child"].get<common_peg_parser_id>(),
1563 j["min_count"].get<int>(),
1564 j["max_count"].get<int>()
1565 };
1566 }
1567 if (type == "and") {
1568 if (!j.contains("child")) {
1569 throw std::runtime_error("and parser missing 'child' field");
1570 }
1571 return common_peg_and_parser{j["child"].get<common_peg_parser_id>()};
1572 }
1573 if (type == "not") {
1574 if (!j.contains("child")) {
1575 throw std::runtime_error("not parser missing 'child' field");
1576 }
1577 return common_peg_not_parser{j["child"].get<common_peg_parser_id>()};
1578 }
1579 if (type == "any") {
1580 return common_peg_any_parser{};
1581 }
1582 if (type == "space") {
1583 return common_peg_space_parser{};
1584 }
1585 if (type == "chars") {
1586 if (!j.contains("pattern") || !j.contains("ranges") || !j.contains("negated") ||
1587 !j.contains("min_count") || !j.contains("max_count")) {
1588 throw std::runtime_error("chars parser missing required fields");
1589 }
1590 common_peg_chars_parser parser;
1591 parser.pattern = j["pattern"];
1592 parser.negated = j["negated"];
1593 parser.min_count = j["min_count"];
1594 parser.max_count = j["max_count"];
1595 for (const auto & range_json : j["ranges"]) {
1596 if (!range_json.contains("start") || !range_json.contains("end")) {
1597 throw std::runtime_error("char_range missing 'start' or 'end' field");
1598 }
1599 parser.ranges.push_back({
1600 range_json["start"].get<uint32_t>(),
1601 range_json["end"].get<uint32_t>()
1602 });
1603 }
1604 return parser;
1605 }
1606 if (type == "json_string") {
1607 return common_peg_json_string_parser{};
1608 }
1609 if (type == "until") {
1610 if (!j.contains("delimiters") || !j["delimiters"].is_array()) {
1611 throw std::runtime_error("until parser missing or invalid 'delimiters' field");
1612 }
1613 return common_peg_until_parser{j["delimiters"].get<std::vector<std::string>>()};
1614 }
1615 if (type == "schema") {
1616 if (!j.contains("child") || !j.contains("name") || !j.contains("schema") || !j.contains("raw")) {
1617 throw std::runtime_error("schema parser missing required fields");
1618 }
1619 common_peg_schema_parser parser;
1620 parser.child = j["child"].get<common_peg_parser_id>();
1621 parser.name = j["name"];
1622 if (!j["schema"].is_null()) {
1623 parser.schema = std::make_shared<nlohmann::ordered_json>(j["schema"]);
1624 }
1625 parser.raw = j["raw"].get<bool>();
1626 return parser;
1627 }
1628 if (type == "rule") {
1629 if (!j.contains("name") || !j.contains("child") || !j.contains("trigger")) {
1630 throw std::runtime_error("rule parser missing required fields");
1631 }
1632 return common_peg_rule_parser{
1633 j["name"].get<std::string>(),
1634 j["child"].get<common_peg_parser_id>(),
1635 j["trigger"].get<bool>()
1636 };
1637 }
1638 if (type == "ref") {
1639 if (!j.contains("name") || !j["name"].is_string()) {
1640 throw std::runtime_error("ref parser missing or invalid 'name' field");
1641 }
1642 return common_peg_ref_parser{j["name"]};
1643 }
1644 if (type == "atomic") {
1645 if (!j.contains("child")) {
1646 throw std::runtime_error("tag parser missing required fields");
1647 }
1648 return common_peg_atomic_parser{
1649 j["child"].get<common_peg_parser_id>(),
1650 };
1651 }
1652 if (type == "tag") {
1653 if (!j.contains("child") || !j.contains("tag")) {
1654 throw std::runtime_error("tag parser missing required fields");
1655 }
1656 return common_peg_tag_parser{
1657 j["child"].get<common_peg_parser_id>(),
1658 j["tag"].get<std::string>(),
1659 };
1660 }
1661
1662 throw std::runtime_error("Unknown parser type: " + type);
1663}
1664
1665common_peg_arena common_peg_arena::from_json(const nlohmann::json & j) {
1666 if (!j.contains("parsers") || !j["parsers"].is_array()) {
1667 throw std::runtime_error("JSON missing or invalid 'parsers' array");
1668 }
1669 if (!j.contains("rules") || !j["rules"].is_object()) {
1670 throw std::runtime_error("JSON missing or invalid 'rules' object");
1671 }
1672 if (!j.contains("root")) {
1673 throw std::runtime_error("JSON missing 'root' field");
1674 }
1675
1676 common_peg_arena arena;
1677
1678 const auto & parsers_json = j["parsers"];
1679 arena.parsers_.reserve(parsers_json.size());
1680 for (const auto & parser_json : parsers_json) {
1681 arena.parsers_.push_back(deserialize_parser_variant(parser_json));
1682 }
1683
1684 arena.rules_ = j["rules"].get<std::unordered_map<std::string, common_peg_parser_id>>();
1685
1686 for (const auto & [name, id] : arena.rules_) {
1687 if (id >= arena.parsers_.size()) {
1688 throw std::runtime_error("Rule '" + name + "' references invalid parser ID: " + std::to_string(id));
1689 }
1690 }
1691
1692 arena.root_ = j["root"].get<common_peg_parser_id>();
1693 if (arena.root_ != COMMON_PEG_INVALID_PARSER_ID && arena.root_ >= arena.parsers_.size()) {
1694 throw std::runtime_error("Root references invalid parser ID: " + std::to_string(arena.root_));
1695 }
1696
1697 return arena;
1698}
1699
1700std::string common_peg_arena::save() const {
1701 return to_json().dump();
1702}
1703
1704void common_peg_arena::load(const std::string & data) {
1705 *this = from_json(nlohmann::json::parse(data));
1706}
1707
1708common_peg_arena build_peg_parser(const std::function<common_peg_parser(common_peg_parser_builder & builder)> & fn) {
1709 common_peg_parser_builder builder;
1710 builder.set_root(fn(builder));
1711 return builder.build();
1712}