1#include "common.h"
   2#include "peg-parser.h"
   3#include "json-schema-to-grammar.h"
   4#include "unicode.h"
   5
   6#include <nlohmann/json.hpp>
   7
   8#include <algorithm>
   9#include <initializer_list>
  10#include <map>
  11#include <memory>
  12#include <regex>
  13#include <stdexcept>
  14#include <unordered_set>
  15
  16// Trick to catch missing branches
  17template <typename T>
  18inline constexpr bool is_always_false_v = false;
  19
  20const char * common_peg_parse_result_type_name(common_peg_parse_result_type type) {
  21    switch (type) {
  22        case COMMON_PEG_PARSE_RESULT_FAIL:            return "fail";
  23        case COMMON_PEG_PARSE_RESULT_SUCCESS:         return "success";
  24        case COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT: return "need_more_input";
  25        default:                                      return "unknown";
  26    }
  27}
  28
  29static bool is_hex_digit(const char c) {
  30    return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F');
  31}
  32
  33// Trie for matching multiple literals.
  34// This is used in common_peg_until_parser and to build a GBNF exclusion grammar
  35struct trie {
  36    struct node {
  37        size_t depth = 0;
  38        std::map<unsigned char, size_t> children;
  39        bool is_word;
  40    };
  41
  42    std::vector<node> nodes;
  43
  44    trie(const std::vector<std::string> & words) {
  45      create_node(); // root node
  46      for (const auto & w : words) {
  47          insert(w);
  48      }
  49    }
  50
  51    enum match_result { NO_MATCH, PARTIAL_MATCH, COMPLETE_MATCH };
  52
  53    // Check if a delimiter starts at the given position
  54    match_result check_at(std::string_view sv, size_t start_pos) const {
  55        size_t current = 0; // Start at root
  56        size_t pos = start_pos;
  57
  58        while (pos < sv.size()) {
  59            auto it = nodes[current].children.find(sv[pos]);
  60            if (it == nodes[current].children.end()) {
  61                // Can't continue matching
  62                return match_result{match_result::NO_MATCH};
  63            }
  64
  65            current = it->second;
  66            pos++;
  67
  68            // Check if we've matched a complete word
  69            if (nodes[current].is_word) {
  70                return match_result{match_result::COMPLETE_MATCH};
  71            }
  72        }
  73
  74        // Reached end of input while still in the trie (not at root)
  75        if (current != 0) {
  76            // We're in the middle of a potential match
  77            return match_result{match_result::PARTIAL_MATCH};
  78        }
  79
  80        // Reached end at root (no match)
  81        return match_result{match_result::NO_MATCH};
  82    }
  83
  84    struct prefix_and_next {
  85        std::string prefix;
  86        std::string next_chars;
  87    };
  88
  89    std::vector<prefix_and_next> collect_prefix_and_next() {
  90        std::string prefix;
  91        std::vector<prefix_and_next> result;
  92        collect_prefix_and_next(0, prefix, result);
  93        return result;
  94    }
  95
  96  private:
  97    void collect_prefix_and_next(size_t index, std::string & prefix, std::vector<prefix_and_next> & out) {
  98        if (!nodes[index].is_word) {
  99            if (!nodes[index].children.empty()) {
 100                std::string chars;
 101                chars.reserve(nodes[index].children.size());
 102                for (const auto & p : nodes[index].children) {
 103                    chars.push_back(p.first);
 104                }
 105                out.emplace_back(prefix_and_next{prefix, chars});
 106            }
 107        }
 108
 109        for (const auto & p : nodes[index].children) {
 110            unsigned char ch = p.first;
 111            auto child = p.second;
 112            prefix.push_back(ch);
 113            collect_prefix_and_next(child, prefix, out);
 114            prefix.pop_back();
 115        }
 116    }
 117
 118    size_t create_node() {
 119        size_t index = nodes.size();
 120        nodes.emplace_back();
 121        return index;
 122    }
 123
 124    void insert(const std::string & word) {
 125        size_t current = 0;
 126        for (unsigned char ch : word) {
 127            auto it = nodes[current].children.find(ch);
 128            if (it == nodes[current].children.end()) {
 129                size_t child = create_node();
 130                nodes[child].depth = nodes[current].depth + 1;
 131                nodes[current].children[ch] = child;
 132                current = child;
 133            } else {
 134                current = it->second;
 135            }
 136        }
 137        nodes[current].is_word = true;
 138    }
 139};
 140
 141static std::pair<uint32_t, size_t> parse_hex_escape(const std::string & str, size_t pos, int hex_count) {
 142    if (pos + hex_count > str.length()) {
 143        return {0, 0};
 144    }
 145
 146    uint32_t value = 0;
 147    for (int i = 0; i < hex_count; i++) {
 148        char c = str[pos + i];
 149        if (!is_hex_digit(c)) {
 150            return {0, 0};
 151        }
 152        value <<= 4;
 153        if ('a' <= c && c <= 'f') {
 154            value += c - 'a' + 10;
 155        } else if ('A' <= c && c <= 'F') {
 156            value += c - 'A' + 10;
 157        } else if ('0' <= c && c <= '9') {
 158            value += c - '0';
 159        } else {
 160            break;
 161        }
 162    }
 163    return {value, static_cast<size_t>(hex_count)};
 164}
 165
 166static std::pair<uint32_t, size_t> parse_char_class_char(const std::string & content, size_t pos) {
 167    if (content[pos] == '\\' && pos + 1 < content.length()) {
 168        switch (content[pos + 1]) {
 169            case 'x': {
 170                auto result = parse_hex_escape(content, pos + 2, 2);
 171                if (result.second > 0) {
 172                    return {result.first, 2 + result.second};
 173                }
 174                // Invalid escape, treat as literal 'x'
 175                return {static_cast<uint32_t>('x'), 2};
 176            }
 177            case 'u': {
 178                auto result = parse_hex_escape(content, pos + 2, 4);
 179                if (result.second > 0) {
 180                    return {result.first, 2 + result.second};
 181                }
 182                // Invalid escape, treat as literal 'u'
 183                return {static_cast<uint32_t>('u'), 2};
 184            }
 185            case 'U': {
 186                auto result = parse_hex_escape(content, pos + 2, 8);
 187                if (result.second > 0) {
 188                    return {result.first, 2 + result.second};
 189                }
 190                // Invalid escape, treat as literal 'U'
 191                return {static_cast<uint32_t>('U'), 2};
 192            }
 193            case 'n':  return {'\n', 2};
 194            case 't':  return {'\t', 2};
 195            case 'r':  return {'\r', 2};
 196            case '\\': return {'\\', 2};
 197            case ']':  return {']', 2};
 198            case '[':  return {'[', 2};
 199            default:   return {static_cast<uint32_t>(content[pos + 1]), 2};
 200        }
 201    }
 202
 203    // Regular character - return as codepoint
 204    return {static_cast<uint32_t>(static_cast<unsigned char>(content[pos])), 1};
 205}
 206
 207static std::pair<std::vector<common_peg_chars_parser::char_range>, bool> parse_char_classes(const std::string & classes) {
 208    std::vector<common_peg_chars_parser::char_range> ranges;
 209    bool negated = false;
 210
 211    std::string content = classes;
 212    if (content.front() == '[') {
 213        content = content.substr(1);
 214    }
 215
 216    if (content.back() == ']') {
 217        content.pop_back();
 218    }
 219
 220    // Check for negation
 221    if (!content.empty() && content.front() == '^') {
 222        negated = true;
 223        content = content.substr(1);
 224    }
 225
 226    size_t i = 0;
 227    while (i < content.length()) {
 228        auto [start, start_len] = parse_char_class_char(content, i);
 229        i += start_len;
 230
 231        if (i + 1 < content.length() && content[i] == '-') {
 232            // Range detected
 233            auto [end, end_len] = parse_char_class_char(content, i + 1);
 234            ranges.push_back(common_peg_chars_parser::char_range{start, end});
 235            i += 1 + end_len;
 236        } else {
 237            ranges.push_back(common_peg_chars_parser::char_range{start, start});
 238        }
 239    }
 240
 241    return {ranges, negated};
 242}
 243
 244void common_peg_ast_arena::visit(common_peg_ast_id id, const common_peg_ast_visitor & visitor) const {
 245    if (id == COMMON_PEG_INVALID_AST_ID) {
 246        return;
 247    }
 248    const auto & node = get(id);
 249    visitor(node);
 250    for (const auto & child : node.children) {
 251        visit(child, visitor);
 252    }
 253}
 254
 255void common_peg_ast_arena::visit(const common_peg_parse_result & result, const common_peg_ast_visitor & visitor) const {
 256    for (const auto & node : result.nodes) {
 257        visit(node, visitor);
 258    }
 259}
 260
 261struct parser_executor;
 262
 263common_peg_parser_id common_peg_arena::add_parser(common_peg_parser_variant parser) {
 264    common_peg_parser_id id = parsers_.size();
 265    parsers_.push_back(std::move(parser));
 266    return id;
 267}
 268
 269void common_peg_arena::add_rule(const std::string & name, common_peg_parser_id id) {
 270    rules_[name] = id;
 271}
 272
 273common_peg_parser_id common_peg_arena::get_rule(const std::string & name) const {
 274    auto it = rules_.find(name);
 275    if (it == rules_.end()) {
 276        throw std::runtime_error("Rule not found: " + name);
 277    }
 278    return it->second;
 279}
 280
 281struct parser_executor {
 282    const common_peg_arena & arena;
 283    common_peg_parse_context & ctx;
 284    size_t start_pos;
 285
 286    parser_executor(const common_peg_arena & arena, common_peg_parse_context & ctx, size_t start)
 287        : arena(arena), ctx(ctx), start_pos(start) {}
 288
 289    common_peg_parse_result operator()(const common_peg_epsilon_parser & /* p */) const {
 290        return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos);
 291    }
 292
 293    common_peg_parse_result operator()(const common_peg_start_parser & /* p */) const {
 294        return common_peg_parse_result(
 295            start_pos == 0 ? COMMON_PEG_PARSE_RESULT_SUCCESS : COMMON_PEG_PARSE_RESULT_FAIL,
 296            start_pos
 297        );
 298    }
 299
 300    common_peg_parse_result operator()(const common_peg_end_parser & /* p */) const {
 301        return common_peg_parse_result(
 302            start_pos >= ctx.input.size() ? COMMON_PEG_PARSE_RESULT_SUCCESS : COMMON_PEG_PARSE_RESULT_FAIL,
 303            start_pos
 304        );
 305    }
 306
 307    common_peg_parse_result operator()(const common_peg_literal_parser & p) {
 308        auto pos = start_pos;
 309        for (auto i = 0u; i < p.literal.size(); ++i) {
 310            if (pos >= ctx.input.size()) {
 311                if (!ctx.is_partial) {
 312                    return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos);
 313                }
 314                return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, pos);
 315            }
 316            if (ctx.input[pos] != p.literal[i]) {
 317                return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos);
 318            }
 319            ++pos;
 320        }
 321
 322        return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos, pos);
 323    }
 324
 325    common_peg_parse_result operator()(const common_peg_sequence_parser & p) {
 326        auto pos = start_pos;
 327        std::vector<common_peg_ast_id> nodes;
 328
 329        for (const auto & child_id : p.children) {
 330            auto result = arena.parse(child_id, ctx, pos);
 331            if (result.fail()) {
 332                return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos, result.end);
 333            }
 334
 335            if (!result.nodes.empty()) {
 336                nodes.insert(nodes.end(), result.nodes.begin(), result.nodes.end());
 337            }
 338
 339            if (result.need_more_input()) {
 340                return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, result.end, std::move(nodes));
 341            }
 342
 343            pos = result.end;
 344        }
 345
 346        return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos, pos, std::move(nodes));
 347    }
 348
 349    common_peg_parse_result operator()(const common_peg_choice_parser & p) {
 350        auto pos = start_pos;
 351        for (const auto & child_id : p.children) {
 352            auto result = arena.parse(child_id, ctx, pos);
 353            if (!result.fail()) {
 354                return result;
 355            }
 356        }
 357
 358        return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos);
 359    }
 360
 361    common_peg_parse_result operator()(const common_peg_repetition_parser & p) {
 362        auto pos = start_pos;
 363        int match_count = 0;
 364        std::vector<common_peg_ast_id> nodes;
 365
 366        // Try to match up to max_count times (or unlimited if max_count is -1)
 367        while (p.max_count == -1 || match_count < p.max_count) {
 368            if (pos >= ctx.input.size()) {
 369                break;
 370            }
 371
 372            auto result = arena.parse(p.child, ctx, pos);
 373
 374            if (result.success()) {
 375                // Prevent infinite loop on empty matches
 376                if (result.end == pos) {
 377                    break;
 378                }
 379
 380                if (!result.nodes.empty()) {
 381                    nodes.insert(nodes.end(), result.nodes.begin(), result.nodes.end());
 382                }
 383
 384                pos = result.end;
 385                match_count++;
 386                continue;
 387            }
 388
 389            if (result.need_more_input()) {
 390                if (!result.nodes.empty()) {
 391                    nodes.insert(nodes.end(), result.nodes.begin(), result.nodes.end());
 392                }
 393
 394                return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, result.end, std::move(nodes));
 395            }
 396
 397            // Child failed - stop trying
 398            break;
 399        }
 400
 401        // Check if we got enough matches
 402        if (p.min_count > 0 && match_count < p.min_count) {
 403            if (pos >= ctx.input.size() && ctx.is_partial) {
 404                return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, pos, std::move(nodes));
 405            }
 406            return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos, pos);
 407        }
 408
 409        return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos, pos, std::move(nodes));
 410    }
 411
 412    common_peg_parse_result operator()(const common_peg_and_parser & p) {
 413        auto result = arena.parse(p.child, ctx, start_pos);
 414        // Pass result but don't consume input
 415        return common_peg_parse_result(result.type, start_pos);
 416    }
 417
 418    common_peg_parse_result operator()(const common_peg_not_parser & p) {
 419        auto result = arena.parse(p.child, ctx, start_pos);
 420
 421        if (result.success()) {
 422            // Fail if the underlying parser matches
 423            return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos);
 424        }
 425
 426        if (result.need_more_input()) {
 427            // Propagate - need to know what child would match before negating
 428            return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos);
 429        }
 430
 431        // Child failed, so negation succeeds
 432        return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos);
 433    }
 434
 435    common_peg_parse_result operator()(const common_peg_any_parser & /* p */) const {
 436        // Parse a single UTF-8 codepoint (not just a single byte)
 437        auto result = parse_utf8_codepoint(ctx.input, start_pos);
 438
 439        if (result.status == utf8_parse_result::INCOMPLETE) {
 440            if (!ctx.is_partial) {
 441                return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos);
 442            }
 443            return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos);
 444        }
 445        if (result.status == utf8_parse_result::INVALID) {
 446            return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos);
 447        }
 448        return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos, start_pos + result.bytes_consumed);
 449    }
 450
 451    common_peg_parse_result operator()(const common_peg_space_parser & /* p */) {
 452        auto pos = start_pos;
 453        while (pos < ctx.input.size()) {
 454            auto c = static_cast<unsigned char>(ctx.input[pos]);
 455            if (std::isspace(c)) {
 456                ++pos;
 457            } else {
 458                break;
 459            }
 460        }
 461
 462        return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos, pos);
 463    }
 464
 465    common_peg_parse_result operator()(const common_peg_chars_parser & p) const {
 466        auto pos = start_pos;
 467        int match_count = 0;
 468
 469        // Try to match up to max_count times (or unlimited if max_count is -1)
 470        while (p.max_count == -1 || match_count < p.max_count) {
 471            auto result = parse_utf8_codepoint(ctx.input, pos);
 472
 473            if (result.status == utf8_parse_result::INCOMPLETE) {
 474                if (match_count >= p.min_count) {
 475                    // We have enough matches, succeed with what we have
 476                    return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos, pos);
 477                }
 478                // Not enough matches yet
 479                if (!ctx.is_partial) {
 480                    return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos);
 481                }
 482                return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, pos);
 483            }
 484
 485            if (result.status == utf8_parse_result::INVALID) {
 486                // Malformed UTF-8 in input
 487                if (match_count >= p.min_count) {
 488                    // We have enough matches, succeed up to here
 489                    return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos, pos);
 490                }
 491                // Not enough matches, fail
 492                return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos);
 493            }
 494
 495            // Check if this codepoint matches our character class
 496            bool matches = false;
 497            for (const auto & range : p.ranges) {
 498                if (range.contains(result.codepoint)) {
 499                    matches = true;
 500                    break;
 501                }
 502            }
 503
 504            // If negated, invert the match result
 505            if (p.negated) {
 506                matches = !matches;
 507            }
 508
 509            if (matches) {
 510                pos += result.bytes_consumed;
 511                ++match_count;
 512            } else {
 513                // Character doesn't match, stop matching
 514                break;
 515            }
 516        }
 517
 518        // Check if we got enough matches
 519        if (match_count < p.min_count) {
 520            if (pos >= ctx.input.size() && ctx.is_partial) {
 521                return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, pos);
 522            }
 523            return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos, pos);
 524        }
 525
 526        return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos, pos);
 527    }
 528
 529    static common_peg_parse_result handle_escape_sequence(common_peg_parse_context & ctx, size_t start, size_t & pos) {
 530        ++pos; // consume '\'
 531        if (pos >= ctx.input.size()) {
 532            if (!ctx.is_partial) {
 533                return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start);
 534            }
 535            return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start, pos);
 536        }
 537
 538        switch (ctx.input[pos]) {
 539            case '"':
 540            case '\\':
 541            case '/':
 542            case 'b':
 543            case 'f':
 544            case 'n':
 545            case 'r':
 546            case 't':
 547                ++pos;
 548                return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start, pos);
 549            case 'u':
 550                return handle_unicode_escape(ctx, start, pos);
 551            default:
 552                // Invalid escape sequence
 553                return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start);
 554        }
 555    }
 556
 557    static common_peg_parse_result handle_unicode_escape(common_peg_parse_context & ctx, size_t start, size_t & pos) {
 558        ++pos; // consume 'u'
 559        for (int i = 0; i < 4; ++i) {
 560            if (pos >= ctx.input.size()) {
 561                if (!ctx.is_partial) {
 562                    return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start);
 563                }
 564                return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start, pos);
 565            }
 566            if (!is_hex_digit(ctx.input[pos])) {
 567                return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start);
 568            }
 569            ++pos;
 570        }
 571        return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start, pos);
 572    }
 573
 574    common_peg_parse_result operator()(const common_peg_json_string_parser & /* p */) {
 575        auto pos = start_pos;
 576
 577        // Parse string content (without quotes)
 578        while (pos < ctx.input.size()) {
 579            char c = ctx.input[pos];
 580
 581            if (c == '"') {
 582                // Found closing quote - success (don't consume it)
 583                return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos, pos);
 584            }
 585
 586            if (c == '\\') {
 587                auto result = handle_escape_sequence(ctx, start_pos, pos);
 588                if (!result.success()) {
 589                    return result;
 590                }
 591            } else {
 592                auto utf8_result = parse_utf8_codepoint(ctx.input, pos);
 593
 594                if (utf8_result.status == utf8_parse_result::INCOMPLETE) {
 595                    if (!ctx.is_partial) {
 596                        return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos);
 597                    }
 598                    return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, pos);
 599                }
 600
 601                if (utf8_result.status == utf8_parse_result::INVALID) {
 602                    return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos);
 603                }
 604
 605                pos += utf8_result.bytes_consumed;
 606            }
 607        }
 608
 609        // Reached end without finding closing quote
 610        if (!ctx.is_partial) {
 611            return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos, pos);
 612        }
 613        return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, pos);
 614    }
 615
 616    common_peg_parse_result operator()(const common_peg_until_parser & p) const {
 617        trie matcher(p.delimiters);
 618
 619        // Scan input and check for delimiters
 620        size_t pos = start_pos;
 621        size_t last_valid_pos = start_pos;
 622
 623        while (pos < ctx.input.size()) {
 624            auto utf8_result = parse_utf8_codepoint(ctx.input, pos);
 625
 626            if (utf8_result.status == utf8_parse_result::INCOMPLETE) {
 627                // Incomplete UTF-8 sequence
 628                if (!ctx.is_partial) {
 629                    // Input is complete but UTF-8 is incomplete = malformed
 630                    return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos);
 631                }
 632                // Return what we have so far (before incomplete sequence)
 633                return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, last_valid_pos);
 634            }
 635
 636            if (utf8_result.status == utf8_parse_result::INVALID) {
 637                // Malformed UTF-8
 638                return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos);
 639            }
 640
 641            // Check if a delimiter starts at this position
 642            auto match = matcher.check_at(ctx.input, pos);
 643
 644            if (match == trie::COMPLETE_MATCH) {
 645                // Found a complete delimiter, return everything before it
 646                return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos, pos);
 647            }
 648
 649            if (match == trie::PARTIAL_MATCH) {
 650                // Found a partial match extending to end of input, return everything before it
 651                return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos, pos);
 652            }
 653
 654            pos += utf8_result.bytes_consumed;
 655            last_valid_pos = pos;
 656        }
 657
 658        if (last_valid_pos == ctx.input.size() && ctx.is_partial) {
 659            // Reached the end of a partial stream, there might still be more input that we need to consume.
 660            return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, last_valid_pos);
 661        }
 662        return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos, last_valid_pos);
 663    }
 664
 665    common_peg_parse_result operator()(const common_peg_schema_parser & p) {
 666        return arena.parse(p.child, ctx, start_pos);
 667    }
 668
 669    common_peg_parse_result operator()(const common_peg_rule_parser & p) {
 670        // Parse the child
 671        auto result = arena.parse(p.child, ctx, start_pos);
 672
 673        if (!result.fail()) {
 674            std::string_view text;
 675            if (result.start < ctx.input.size()) {
 676                text = std::string_view(ctx.input).substr(result.start, result.end - result.start);
 677            }
 678
 679            auto node_id = ctx.ast.add_node(
 680                p.name,
 681                "",
 682                result.start,
 683                result.end,
 684                text,
 685                std::move(result.nodes),
 686                result.need_more_input()
 687            );
 688
 689            return common_peg_parse_result(result.type, result.start, result.end, { node_id });
 690        }
 691
 692        return result;
 693    }
 694
 695    common_peg_parse_result operator()(const common_peg_tag_parser & p) {
 696        // Parse the child
 697        auto result = arena.parse(p.child, ctx, start_pos);
 698
 699        if (!result.fail()) {
 700            std::string_view text;
 701            if (result.start < ctx.input.size()) {
 702                text = std::string_view(ctx.input).substr(result.start, result.end - result.start);
 703            }
 704
 705            auto node_id = ctx.ast.add_node(
 706                "",
 707                p.tag,
 708                result.start,
 709                result.end,
 710                text,
 711                std::move(result.nodes),
 712                result.need_more_input()
 713            );
 714
 715            return common_peg_parse_result(result.type, result.start, result.end, { node_id });
 716        }
 717
 718        return result;
 719    }
 720
 721    common_peg_parse_result operator()(const common_peg_ref_parser & p) {
 722        auto rule_id = arena.get_rule(p.name);
 723        return arena.parse(rule_id, ctx, start_pos);
 724    }
 725
 726    common_peg_parse_result operator()(const common_peg_atomic_parser & p) {
 727        auto result = arena.parse(p.child, ctx, start_pos);
 728        if (result.need_more_input()) {
 729            // Clear nodes so they don't propagate up.
 730            result.nodes.clear();
 731        }
 732        return result;
 733    }
 734};
 735
 736common_peg_parse_result common_peg_arena::parse(common_peg_parse_context & ctx, size_t start) const {
 737    if (root_ == COMMON_PEG_INVALID_PARSER_ID) {
 738        throw std::runtime_error("No root parser set");
 739    }
 740    return parse(root_, ctx, start);
 741}
 742
 743common_peg_parse_result common_peg_arena::parse(common_peg_parser_id id, common_peg_parse_context & ctx, size_t start) const {
 744    // Execute parser
 745    const auto & parser = parsers_.at(id);
 746    parser_executor exec(*this, ctx, start);
 747    return std::visit(exec, parser);
 748}
 749
 750common_peg_parser_id common_peg_arena::resolve_ref(common_peg_parser_id id) {
 751    const auto & parser = parsers_.at(id);
 752    if (auto ref = std::get_if<common_peg_ref_parser>(&parser)) {
 753        return get_rule(ref->name);
 754    }
 755    return id;
 756}
 757
 758void common_peg_arena::resolve_refs() {
 759    // Walk through all parsers and replace refs with their corresponding rule IDs
 760    for (auto & parser : parsers_) {
 761        std::visit([this](auto & p) {
 762            using T = std::decay_t<decltype(p)>;
 763
 764            if constexpr (std::is_same_v<T, common_peg_sequence_parser>) {
 765                for (auto & child : p.children) {
 766                    child = resolve_ref(child);
 767                }
 768            } else if constexpr (std::is_same_v<T, common_peg_choice_parser>) {
 769                for (auto & child : p.children) {
 770                    child = resolve_ref(child);
 771                }
 772            } else if constexpr (std::is_same_v<T, common_peg_repetition_parser> ||
 773                                 std::is_same_v<T, common_peg_and_parser> ||
 774                                 std::is_same_v<T, common_peg_not_parser> ||
 775                                 std::is_same_v<T, common_peg_tag_parser> ||
 776                                 std::is_same_v<T, common_peg_atomic_parser>) {
 777                p.child = resolve_ref(p.child);
 778            } else if constexpr (std::is_same_v<T, common_peg_rule_parser>) {
 779                p.child = resolve_ref(p.child);
 780            } else if constexpr (std::is_same_v<T, common_peg_schema_parser>) {
 781                p.child = resolve_ref(p.child);
 782            } else if constexpr (std::is_same_v<T, common_peg_epsilon_parser> ||
 783                                 std::is_same_v<T, common_peg_start_parser> ||
 784                                 std::is_same_v<T, common_peg_end_parser> ||
 785                                 std::is_same_v<T, common_peg_ref_parser> ||
 786                                 std::is_same_v<T, common_peg_until_parser> ||
 787                                 std::is_same_v<T, common_peg_literal_parser> ||
 788                                 std::is_same_v<T, common_peg_json_string_parser> ||
 789                                 std::is_same_v<T, common_peg_chars_parser> ||
 790                                 std::is_same_v<T, common_peg_any_parser> ||
 791                                 std::is_same_v<T, common_peg_space_parser>) {
 792                // These rules do not have children
 793            } else {
 794                static_assert(is_always_false_v<T>);
 795            }
 796        }, parser);
 797    }
 798
 799    // Also flatten root if it's a ref
 800    if (root_ != COMMON_PEG_INVALID_PARSER_ID) {
 801        root_ = resolve_ref(root_);
 802    }
 803}
 804
 805std::string common_peg_arena::dump(common_peg_parser_id id) const {
 806    const auto & parser = parsers_.at(id);
 807
 808    return std::visit([this](const auto & p) -> std::string {
 809        using T = std::decay_t<decltype(p)>;
 810
 811        if constexpr (std::is_same_v<T, common_peg_epsilon_parser>) {
 812            return "Epsilon";
 813        } else if constexpr (std::is_same_v<T, common_peg_start_parser>) {
 814            return "Start";
 815        } else if constexpr (std::is_same_v<T, common_peg_end_parser>) {
 816            return "End";
 817        } else if constexpr (std::is_same_v<T, common_peg_literal_parser>) {
 818            return "Literal(" + p.literal + ")";
 819        } else if constexpr (std::is_same_v<T, common_peg_sequence_parser>) {
 820            std::vector<std::string> parts;
 821            for (const auto & child : p.children) {
 822                parts.push_back(dump(child));
 823            }
 824            return "Sequence(" + string_join(parts, ", ") + ")";
 825        } else if constexpr (std::is_same_v<T, common_peg_choice_parser>) {
 826            std::vector<std::string> parts;
 827            for (const auto & child : p.children) {
 828                parts.push_back(dump(child));
 829            }
 830            return "Choice(" + string_join(parts, ", ") + ")";
 831        } else if constexpr (std::is_same_v<T, common_peg_repetition_parser>) {
 832            if (p.max_count == -1) {
 833                return "Repetition(" + dump(p.child) + ", " + std::to_string(p.min_count) + ", unbounded)";
 834            }
 835            return "Repetition(" + dump(p.child) + ", " + std::to_string(p.min_count) + ", " + std::to_string(p.max_count) + ")";
 836        } else if constexpr (std::is_same_v<T, common_peg_and_parser>) {
 837            return "And(" + dump(p.child) + ")";
 838        } else if constexpr (std::is_same_v<T, common_peg_not_parser>) {
 839            return "Not(" + dump(p.child) + ")";
 840        } else if constexpr (std::is_same_v<T, common_peg_any_parser>) {
 841            return "Any";
 842        } else if constexpr (std::is_same_v<T, common_peg_space_parser>) {
 843            return "Space";
 844        } else if constexpr (std::is_same_v<T, common_peg_chars_parser>) {
 845            if (p.max_count == -1) {
 846                return "CharRepeat(" + p.pattern + ", " + std::to_string(p.min_count) + ", unbounded)";
 847            }
 848            return "CharRepeat(" + p.pattern + ", " + std::to_string(p.min_count) + ", " + std::to_string(p.max_count) + ")";
 849        } else if constexpr (std::is_same_v<T, common_peg_json_string_parser>) {
 850            return "JsonString()";
 851        } else if constexpr (std::is_same_v<T, common_peg_until_parser>) {
 852            return "Until(" + string_join(p.delimiters, " | ") + ")";
 853        } else if constexpr (std::is_same_v<T, common_peg_schema_parser>) {
 854            return "Schema(" + dump(p.child) + ", " + (p.schema ? p.schema->dump() : "null") + ")";
 855        } else if constexpr (std::is_same_v<T, common_peg_rule_parser>) {
 856            return "Rule(" + p.name + ", " + dump(p.child) + ")";
 857        } else if constexpr (std::is_same_v<T, common_peg_ref_parser>) {
 858            return "Ref(" + p.name + ")";
 859        } else {
 860            return "Unknown";
 861        }
 862    }, parser);
 863}
 864
 865common_peg_parser & common_peg_parser::operator=(const common_peg_parser & other) {
 866    id_ = other.id_;
 867    return *this;
 868}
 869
 870common_peg_parser & common_peg_parser::operator+=(const common_peg_parser & other) {
 871    id_ = builder_.sequence({id_, other.id_});
 872    return *this;
 873}
 874
 875common_peg_parser & common_peg_parser::operator|=(const common_peg_parser & other) {
 876    id_ = builder_.choice({id_, other.id_});
 877    return *this;
 878}
 879
 880common_peg_parser common_peg_parser::operator+(const common_peg_parser & other) const {
 881    return builder_.sequence({id_, other.id_});
 882}
 883
 884common_peg_parser common_peg_parser::operator|(const common_peg_parser & other) const {
 885    return builder_.choice({id_, other.id_});
 886}
 887
 888common_peg_parser common_peg_parser::operator<<(const common_peg_parser & other) const {
 889    return builder_.sequence({id_, builder_.space(), other.id_});
 890}
 891
 892common_peg_parser common_peg_parser::operator+(const char * str) const {
 893    return *this + builder_.literal(str);
 894}
 895
 896common_peg_parser common_peg_parser::operator+(const std::string & str) const {
 897    return *this + builder_.literal(str);
 898}
 899
 900common_peg_parser common_peg_parser::operator<<(const char * str) const {
 901    return *this << builder_.literal(str);
 902}
 903
 904common_peg_parser common_peg_parser::operator<<(const std::string & str) const {
 905    return *this << builder_.literal(str);
 906}
 907
 908common_peg_parser common_peg_parser::operator|(const char * str) const {
 909    return *this | builder_.literal(str);
 910}
 911
 912common_peg_parser common_peg_parser::operator|(const std::string & str) const {
 913    return *this | builder_.literal(str);
 914}
 915
 916common_peg_parser operator+(const char * str, const common_peg_parser & p) {
 917    return p.builder().literal(str) + p;
 918}
 919
 920common_peg_parser operator+(const std::string & str, const common_peg_parser & p) {
 921    return operator+(str.c_str(), p);
 922}
 923
 924common_peg_parser operator<<(const char * str, const common_peg_parser & p) {
 925    return p.builder().literal(str) << p;
 926}
 927
 928common_peg_parser operator<<(const std::string & str, const common_peg_parser & p) {
 929    return operator<<(str.c_str(), p);
 930}
 931
 932common_peg_parser operator|(const char * str, const common_peg_parser & p) {
 933    return p.builder().literal(str) | p;
 934}
 935
 936common_peg_parser operator|(const std::string & str, const common_peg_parser & p) {
 937    return operator|(str.c_str(), p);
 938}
 939
 940static std::string rule_name(const std::string & name) {
 941    static const std::regex invalid_rule_chars_re("[^a-zA-Z0-9-]+");
 942    return std::regex_replace(name, invalid_rule_chars_re, "-");
 943}
 944
 945common_peg_parser_builder::common_peg_parser_builder() {}
 946
 947common_peg_parser common_peg_parser_builder::sequence(const std::vector<common_peg_parser_id> & parsers) {
 948    // Flatten nested sequences
 949    std::vector<common_peg_parser_id> flattened;
 950    for (const auto & p : parsers) {
 951        const auto & parser = arena_.get(p);
 952        if (auto seq = std::get_if<common_peg_sequence_parser>(&parser)) {
 953            flattened.insert(flattened.end(), seq->children.begin(), seq->children.end());
 954        } else {
 955            flattened.push_back(p);
 956        }
 957    }
 958    return wrap(arena_.add_parser(common_peg_sequence_parser{flattened}));
 959}
 960
 961common_peg_parser common_peg_parser_builder::sequence(const std::vector<common_peg_parser> & parsers) {
 962    std::vector<common_peg_parser_id> ids;
 963    ids.reserve(parsers.size());
 964    for (const auto & p : parsers) {
 965        ids.push_back(p.id());
 966    }
 967    return sequence(ids);
 968}
 969
 970common_peg_parser common_peg_parser_builder::sequence(std::initializer_list<common_peg_parser> parsers) {
 971    std::vector<common_peg_parser_id> ids;
 972    ids.reserve(parsers.size());
 973    for (const auto & p : parsers) {
 974        ids.push_back(p.id());
 975    }
 976    return sequence(ids);
 977}
 978
 979common_peg_parser common_peg_parser_builder::choice(const std::vector<common_peg_parser_id> & parsers) {
 980    // Flatten nested choices
 981    std::vector<common_peg_parser_id> flattened;
 982    for (const auto & p : parsers) {
 983        const auto & parser = arena_.get(p);
 984        if (auto choice = std::get_if<common_peg_choice_parser>(&parser)) {
 985            flattened.insert(flattened.end(), choice->children.begin(), choice->children.end());
 986        } else {
 987            flattened.push_back(p);
 988        }
 989    }
 990    return wrap(arena_.add_parser(common_peg_choice_parser{flattened}));
 991}
 992
 993common_peg_parser common_peg_parser_builder::choice(const std::vector<common_peg_parser> & parsers) {
 994    std::vector<common_peg_parser_id> ids;
 995    ids.reserve(parsers.size());
 996    for (const auto & p : parsers) {
 997        ids.push_back(p.id());
 998    }
 999    return choice(ids);
1000}
1001
1002common_peg_parser common_peg_parser_builder::choice(std::initializer_list<common_peg_parser> parsers) {
1003    std::vector<common_peg_parser_id> ids;
1004    ids.reserve(parsers.size());
1005    for (const auto & p : parsers) {
1006        ids.push_back(p.id());
1007    }
1008    return choice(ids);
1009}
1010
1011common_peg_parser common_peg_parser_builder::chars(const std::string & classes, int min, int max) {
1012    auto [ranges, negated] = parse_char_classes(classes);
1013    return wrap(arena_.add_parser(common_peg_chars_parser{classes, ranges, negated, min, max}));
1014}
1015
1016common_peg_parser common_peg_parser_builder::schema(const common_peg_parser & p, const std::string & name, const nlohmann::ordered_json & schema, bool raw) {
1017    return wrap(arena_.add_parser(common_peg_schema_parser{p.id(), name, std::make_shared<nlohmann::ordered_json>(schema), raw}));
1018}
1019
1020common_peg_parser common_peg_parser_builder::rule(const std::string & name, const common_peg_parser & p, bool trigger) {
1021    auto clean_name = rule_name(name);
1022    auto rule_id = arena_.add_parser(common_peg_rule_parser{clean_name, p.id(), trigger});
1023    arena_.add_rule(clean_name, rule_id);
1024    return ref(clean_name);
1025}
1026
1027common_peg_parser common_peg_parser_builder::rule(const std::string & name, const std::function<common_peg_parser()> & builder_fn, bool trigger) {
1028    auto clean_name = rule_name(name);
1029    if (arena_.has_rule(clean_name)) {
1030        return ref(clean_name);
1031    }
1032
1033    // Create placeholder rule to allow recursive references
1034    auto placeholder = any();  // Temporary placeholder
1035    auto placeholder_rule_id = arena_.add_parser(common_peg_rule_parser{clean_name, placeholder.id(), trigger});
1036    arena_.add_rule(clean_name, placeholder_rule_id);
1037
1038    // Build the actual parser
1039    auto parser = builder_fn();
1040
1041    // Replace placeholder with actual rule
1042    auto rule_id = arena_.add_parser(common_peg_rule_parser{clean_name, parser.id(), trigger});
1043    arena_.rules_[clean_name] = rule_id;
1044
1045    return ref(clean_name);
1046}
1047
1048void common_peg_parser_builder::set_root(const common_peg_parser & p) {
1049    arena_.set_root(p.id());
1050}
1051
1052common_peg_arena common_peg_parser_builder::build() {
1053    arena_.resolve_refs();
1054    return std::move(arena_);
1055}
1056
1057// JSON parsers
1058common_peg_parser common_peg_parser_builder::json_number() {
1059   return rule("json-number", [this]() {
1060        auto digit1_9 = chars("[1-9]", 1, 1);
1061        auto digits = chars("[0-9]");
1062        auto int_part = choice({literal("0"), sequence({digit1_9, chars("[0-9]", 0, -1)})});
1063        auto frac = sequence({literal("."), digits});
1064        auto exp = sequence({choice({literal("e"), literal("E")}), optional(chars("[+-]", 1, 1)), digits});
1065        return sequence({optional(literal("-")), int_part, optional(frac), optional(exp), space()});
1066    });
1067}
1068
1069common_peg_parser common_peg_parser_builder::json_string() {
1070    return rule("json-string", [this]() {
1071        return sequence({literal("\""), json_string_content(), literal("\""), space()});
1072    });
1073}
1074
1075common_peg_parser common_peg_parser_builder::json_bool() {
1076    return rule("json-bool", [this]() {
1077        return sequence({choice({literal("true"), literal("false")}), space()});
1078    });
1079}
1080
1081common_peg_parser common_peg_parser_builder::json_null() {
1082    return rule("json-null", [this]() {
1083        return sequence({literal("null"), space()});
1084    });
1085}
1086
1087common_peg_parser common_peg_parser_builder::json_object() {
1088    return rule("json-object", [this]() {
1089        auto ws = space();
1090        auto member = sequence({json_string(), ws, literal(":"), ws, json()});
1091        auto members = sequence({member, zero_or_more(sequence({ws, literal(","), ws, member}))});
1092        return sequence({
1093            literal("{"),
1094            ws,
1095            choice({
1096                literal("}"),
1097                sequence({members, ws, literal("}")})
1098            }),
1099            ws
1100        });
1101    });
1102}
1103
1104common_peg_parser common_peg_parser_builder::json_array() {
1105    return rule("json-array", [this]() {
1106        auto ws = space();
1107        auto elements = sequence({json(), zero_or_more(sequence({literal(","), ws, json()}))});
1108        return sequence({
1109            literal("["),
1110            ws,
1111            choice({
1112                literal("]"),
1113                sequence({elements, ws, literal("]")})
1114            }),
1115            ws
1116        });
1117    });
1118}
1119
1120common_peg_parser common_peg_parser_builder::json() {
1121    return rule("json-value", [this]() {
1122        return choice({
1123            json_object(),
1124            json_array(),
1125            json_string(),
1126            json_number(),
1127            json_bool(),
1128            json_null()
1129        });
1130    });
1131}
1132
1133common_peg_parser common_peg_parser_builder::json_string_content() {
1134    return wrap(arena_.add_parser(common_peg_json_string_parser{}));
1135}
1136
1137common_peg_parser common_peg_parser_builder::json_member(const std::string & key, const common_peg_parser & p) {
1138    auto ws = space();
1139    return sequence({
1140        literal("\"" + key + "\""),
1141        ws,
1142        literal(":"),
1143        ws,
1144        p,
1145    });
1146}
1147
1148
1149static std::string gbnf_escape_char_class(char c) {
1150    switch (c) {
1151        case '\n': return "\\n";
1152        case '\t': return "\\t";
1153        case '\r': return "\\r";
1154        case '\\': return "\\\\";
1155        case ']':  return "\\]";
1156        case '[':  return "\\[";
1157        default:   return std::string(1, c);
1158    }
1159}
1160
1161static std::string gbnf_excluding_pattern(const std::vector<std::string> & strings) {
1162    trie matcher(strings);
1163    auto pieces = matcher.collect_prefix_and_next();
1164
1165    std::string pattern;
1166    for (size_t i = 0; i < pieces.size(); ++i) {
1167        if (i > 0) {
1168            pattern += " | ";
1169        }
1170
1171        const auto & pre = pieces[i].prefix;
1172        const auto & chars = pieces[i].next_chars;
1173
1174        std::string cls;
1175        cls.reserve(chars.size());
1176        for (const auto & ch : chars) {
1177            cls += gbnf_escape_char_class(ch);
1178        }
1179
1180        if (!pre.empty()) {
1181            pattern += gbnf_format_literal(pre) + " [^" + cls + "]";
1182        } else {
1183            pattern += "[^" + cls + "]";
1184        }
1185    }
1186
1187    return "(" + pattern + ")*";
1188}
1189
1190static std::unordered_set<std::string> collect_reachable_rules(
1191    const common_peg_arena & arena,
1192    const common_peg_parser_id & rule
1193) {
1194    std::unordered_set<std::string> reachable;
1195    std::unordered_set<std::string> visited;
1196
1197    std::function<void(common_peg_parser_id)> visit = [&](common_peg_parser_id id) {
1198        const auto & parser = arena.get(id);
1199
1200        std::visit([&](const auto & p) {
1201            using T = std::decay_t<decltype(p)>;
1202
1203            if constexpr (std::is_same_v<T, common_peg_epsilon_parser> ||
1204                          std::is_same_v<T, common_peg_start_parser> ||
1205                          std::is_same_v<T, common_peg_end_parser> ||
1206                          std::is_same_v<T, common_peg_until_parser> ||
1207                          std::is_same_v<T, common_peg_literal_parser> ||
1208                          std::is_same_v<T, common_peg_chars_parser> ||
1209                          std::is_same_v<T, common_peg_space_parser> ||
1210                          std::is_same_v<T, common_peg_any_parser> ||
1211                          std::is_same_v<T, common_peg_json_string_parser>) {
1212                // These parsers do not have any children
1213            } else if constexpr (std::is_same_v<T, common_peg_sequence_parser>) {
1214                for (auto child : p.children) {
1215                    visit(child);
1216                }
1217            } else if constexpr (std::is_same_v<T, common_peg_choice_parser>) {
1218                for (auto child : p.children) {
1219                    visit(child);
1220                }
1221            } else if constexpr (std::is_same_v<T, common_peg_repetition_parser> ||
1222                                 std::is_same_v<T, common_peg_and_parser> ||
1223                                 std::is_same_v<T, common_peg_not_parser> ||
1224                                 std::is_same_v<T, common_peg_tag_parser> ||
1225                                 std::is_same_v<T, common_peg_atomic_parser> ||
1226                                 std::is_same_v<T, common_peg_schema_parser>) {
1227                visit(p.child);
1228            } else if constexpr (std::is_same_v<T, common_peg_rule_parser>) {
1229                if (visited.find(p.name) == visited.end()) {
1230                    visited.insert(p.name);
1231                    reachable.insert(p.name);
1232                    visit(p.child);
1233                }
1234            } else if constexpr (std::is_same_v<T, common_peg_ref_parser>) {
1235                // Traverse rules so we pick up everything
1236                auto referenced_rule = arena.get_rule(p.name);
1237                visit(referenced_rule);
1238            } else {
1239                static_assert(is_always_false_v<T>);
1240            }
1241        }, parser);
1242    };
1243
1244    visit(rule);
1245    return reachable;
1246}
1247
1248// GBNF generation implementation
1249void common_peg_arena::build_grammar(const common_grammar_builder & builder, bool lazy) const {
1250    // Generate GBNF for a parser
1251    std::function<std::string(common_peg_parser_id)> to_gbnf = [&](common_peg_parser_id id) -> std::string {
1252        const auto & parser = parsers_.at(id);
1253
1254        return std::visit([&](const auto & p) -> std::string {
1255            using T = std::decay_t<decltype(p)>;
1256
1257            if constexpr (std::is_same_v<T, common_peg_epsilon_parser> ||
1258                          std::is_same_v<T, common_peg_start_parser> ||
1259                          std::is_same_v<T, common_peg_end_parser>) {
1260                return "";
1261            } else if constexpr (std::is_same_v<T, common_peg_literal_parser>) {
1262                return gbnf_format_literal(p.literal);
1263            } else if constexpr (std::is_same_v<T, common_peg_sequence_parser>) {
1264                std::string s;
1265                for (const auto & child : p.children) {
1266                    if (!s.empty()) {
1267                        s += " ";
1268                    }
1269                    auto child_gbnf = to_gbnf(child);
1270                    const auto & child_parser = parsers_.at(child);
1271                    if (std::holds_alternative<common_peg_choice_parser>(child_parser) ||
1272                        std::holds_alternative<common_peg_sequence_parser>(child_parser)) {
1273                        s += "(" + child_gbnf + ")";
1274                    } else {
1275                        s += child_gbnf;
1276                    }
1277                }
1278                return s;
1279            } else if constexpr (std::is_same_v<T, common_peg_choice_parser>) {
1280                std::string s;
1281                for (const auto & child : p.children) {
1282                    if (!s.empty()) {
1283                        s += " | ";
1284                    }
1285                    auto child_gbnf = to_gbnf(child);
1286                    const auto & child_parser = parsers_.at(child);
1287                    if (std::holds_alternative<common_peg_choice_parser>(child_parser)) {
1288                        s += "(" + child_gbnf + ")";
1289                    } else {
1290                        s += child_gbnf;
1291                    }
1292                }
1293                return s;
1294            } else if constexpr (std::is_same_v<T, common_peg_repetition_parser>) {
1295                auto child_gbnf = to_gbnf(p.child);
1296                const auto & child_parser = parsers_.at(p.child);
1297                if (std::holds_alternative<common_peg_choice_parser>(child_parser) ||
1298                    std::holds_alternative<common_peg_sequence_parser>(child_parser)) {
1299                    child_gbnf = "(" + child_gbnf + ")";
1300                }
1301                if (p.min_count == 0 && p.max_count == 1) {
1302                    return child_gbnf + "?";
1303                }
1304                if (p.min_count == 0 && p.max_count == -1) {
1305                    return child_gbnf + "*";
1306                }
1307                if (p.min_count == 1 && p.max_count == -1) {
1308                    return child_gbnf + "+";
1309                }
1310                if (p.max_count == -1) {
1311                    return child_gbnf + "{" + std::to_string(p.min_count) + ",}";
1312                }
1313                if (p.min_count == p.max_count) {
1314                    if (p.min_count == 1) {
1315                        return child_gbnf;
1316                    }
1317                    return child_gbnf + "{" + std::to_string(p.min_count) + "}";
1318                }
1319                return child_gbnf + "{" + std::to_string(p.min_count) + "," + std::to_string(p.max_count) + "}";
1320            } else if constexpr (std::is_same_v<T, common_peg_and_parser> || std::is_same_v<T, common_peg_not_parser>) {
1321                return "";  // Lookahead not supported in GBNF
1322            } else if constexpr (std::is_same_v<T, common_peg_any_parser>) {
1323                return ".";
1324            } else if constexpr (std::is_same_v<T, common_peg_space_parser>) {
1325                return "space";
1326            } else if constexpr (std::is_same_v<T, common_peg_chars_parser>) {
1327                std::string result = p.pattern;
1328                if (p.min_count == 0 && p.max_count == 1) {
1329                    return result + "?";
1330                }
1331                if (p.min_count == 0 && p.max_count == -1) {
1332                    return result + "*";
1333                }
1334                if (p.min_count == 1 && p.max_count == -1) {
1335                    return result + "+";
1336                }
1337                if (p.max_count == -1) {
1338                    return result + "{" + std::to_string(p.min_count) + ",}";
1339                }
1340                if (p.min_count == p.max_count) {
1341                    if (p.min_count == 1) {
1342                        return result;
1343                    }
1344                    return result + "{" + std::to_string(p.min_count) + "}";
1345                }
1346                return result + "{" + std::to_string(p.min_count) + "," + std::to_string(p.max_count) + "}";
1347            } else if constexpr (std::is_same_v<T, common_peg_json_string_parser>) {
1348                return R"(( [^"\\] | "\\" ( ["\\/ bfnrt] | "u" [0-9a-fA-F]{4} ) )*)";
1349            } else if constexpr (std::is_same_v<T, common_peg_until_parser>) {
1350                if (p.delimiters.empty()) {
1351                    return ".*";
1352                }
1353                return gbnf_excluding_pattern(p.delimiters);
1354            } else if constexpr (std::is_same_v<T, common_peg_schema_parser>) {
1355                if (p.schema) {
1356                    if (p.raw && p.schema->contains("type") && p.schema->at("type").is_string() && p.schema->at("type") == "string") {
1357                        // TODO: Implement more comprehensive grammar generation for raw strings.
1358                        // For now, use the grammar emitted from the underlying parser.
1359                        return to_gbnf(p.child);
1360                    }
1361                    return builder.add_schema(p.name, *p.schema);
1362                }
1363                return to_gbnf(p.child);
1364            } else if constexpr (std::is_same_v<T, common_peg_rule_parser>) {
1365                return p.name;
1366            } else if constexpr (std::is_same_v<T, common_peg_ref_parser>) {
1367                // Refs should not exist after flattening, but kept just in case
1368                return p.name;
1369            } else if constexpr (std::is_same_v<T, common_peg_tag_parser>) {
1370                return to_gbnf(p.child);
1371            } else if constexpr (std::is_same_v<T, common_peg_atomic_parser>) {
1372                return to_gbnf(p.child);
1373            } else {
1374                static_assert(is_always_false_v<T>);
1375            }
1376        }, parser);
1377    };
1378
1379    // Collect reachable rules
1380    std::unordered_set<std::string> reachable_rules;
1381
1382    if (lazy) {
1383        // Collect rules reachable from trigger rules
1384        for (const auto & [name, id] : rules_) {
1385            const auto & parser = parsers_.at(id);
1386            if (auto rule = std::get_if<common_peg_rule_parser>(&parser)) {
1387                if (rule->trigger) {
1388                    // Mark trigger as reachable and visit it
1389                    reachable_rules.insert(name);
1390                    auto add_rules = collect_reachable_rules(*this, id);
1391                    reachable_rules.insert(add_rules.begin(), add_rules.end());
1392                }
1393            }
1394        }
1395    } else {
1396        // Collect rules reachable from root
1397        reachable_rules = collect_reachable_rules(*this, root_);
1398    }
1399
1400    // Create GBNF rules for all reachable rules
1401    for (const auto & [name, rule_id] : rules_) {
1402        if (reachable_rules.find(name) == reachable_rules.end()) {
1403            continue;
1404        }
1405
1406        const auto & parser = parsers_.at(rule_id);
1407        if (auto rule = std::get_if<common_peg_rule_parser>(&parser)) {
1408            builder.add_rule(rule->name, to_gbnf(rule->child));
1409        }
1410    }
1411
1412    if (lazy) {
1413        // Generate root rule from trigger rules only
1414        std::vector<std::string> trigger_names;
1415        for (const auto & [name, rule_id] : rules_) {
1416            const auto & parser = parsers_.at(rule_id);
1417            if (auto rule = std::get_if<common_peg_rule_parser>(&parser)) {
1418                if (rule->trigger) {
1419                    trigger_names.push_back(rule->name);
1420                }
1421            }
1422        }
1423
1424        // Sort for predictable order
1425        std::sort(trigger_names.begin(), trigger_names.end());
1426        builder.add_rule("root", string_join(trigger_names, " | "));
1427    } else if (root_ != COMMON_PEG_INVALID_PARSER_ID) {
1428        builder.add_rule("root", to_gbnf(root_));
1429    }
1430}
1431
1432static nlohmann::json serialize_parser_variant(const common_peg_parser_variant & variant) {
1433    using json = nlohmann::json;
1434
1435    return std::visit([](const auto & p) -> json {
1436        using T = std::decay_t<decltype(p)>;
1437
1438        if constexpr (std::is_same_v<T, common_peg_epsilon_parser>) {
1439            return json{{"type", "epsilon"}};
1440        } else if constexpr (std::is_same_v<T, common_peg_start_parser>) {
1441            return json{{"type", "start"}};
1442        } else if constexpr (std::is_same_v<T, common_peg_end_parser>) {
1443            return json{{"type", "end"}};
1444        } else if constexpr (std::is_same_v<T, common_peg_literal_parser>) {
1445            return json{{"type", "literal"}, {"literal", p.literal}};
1446        } else if constexpr (std::is_same_v<T, common_peg_sequence_parser>) {
1447            return json{{"type", "sequence"}, {"children", p.children}};
1448        } else if constexpr (std::is_same_v<T, common_peg_choice_parser>) {
1449            return json{{"type", "choice"}, {"children", p.children}};
1450        } else if constexpr (std::is_same_v<T, common_peg_repetition_parser>) {
1451            return json{
1452                {"type", "repetition"},
1453                {"child", p.child},
1454                {"min_count", p.min_count},
1455                {"max_count", p.max_count}
1456            };
1457        } else if constexpr (std::is_same_v<T, common_peg_and_parser>) {
1458            return json{{"type", "and"}, {"child", p.child}};
1459        } else if constexpr (std::is_same_v<T, common_peg_not_parser>) {
1460            return json{{"type", "not"}, {"child", p.child}};
1461        } else if constexpr (std::is_same_v<T, common_peg_any_parser>) {
1462            return json{{"type", "any"}};
1463        } else if constexpr (std::is_same_v<T, common_peg_space_parser>) {
1464            return json{{"type", "space"}};
1465        } else if constexpr (std::is_same_v<T, common_peg_chars_parser>) {
1466            json ranges = json::array();
1467            for (const auto & range : p.ranges) {
1468                ranges.push_back({{"start", range.start}, {"end", range.end}});
1469            }
1470            return json{
1471                {"type", "chars"},
1472                {"pattern", p.pattern},
1473                {"ranges", ranges},
1474                {"negated", p.negated},
1475                {"min_count", p.min_count},
1476                {"max_count", p.max_count}
1477            };
1478        } else if constexpr (std::is_same_v<T, common_peg_json_string_parser>) {
1479            return json{{"type", "json_string"}};
1480        } else if constexpr (std::is_same_v<T, common_peg_until_parser>) {
1481            return json{{"type", "until"}, {"delimiters", p.delimiters}};
1482        } else if constexpr (std::is_same_v<T, common_peg_schema_parser>) {
1483            return json{
1484                {"type", "schema"},
1485                {"child", p.child},
1486                {"name", p.name},
1487                {"schema", p.schema ? *p.schema : nullptr},
1488                {"raw", p.raw}
1489            };
1490        } else if constexpr (std::is_same_v<T, common_peg_rule_parser>) {
1491            return json{
1492                {"type", "rule"},
1493                {"name", p.name},
1494                {"child", p.child},
1495                {"trigger", p.trigger}
1496            };
1497        } else if constexpr (std::is_same_v<T, common_peg_ref_parser>) {
1498            return json{{"type", "ref"}, {"name", p.name}};
1499        } else if constexpr (std::is_same_v<T, common_peg_atomic_parser>) {
1500            return json{{"type", "atomic"}, {"child", p.child}};
1501        } else if constexpr (std::is_same_v<T, common_peg_tag_parser>) {
1502            return json{
1503                {"type", "tag"},
1504                {"child", p.child},
1505                {"tag", p.tag}
1506            };
1507        }
1508    }, variant);
1509}
1510
1511nlohmann::json common_peg_arena::to_json() const {
1512    auto parsers = nlohmann::json::array();
1513    for (const auto & parser : parsers_) {
1514        parsers.push_back(serialize_parser_variant(parser));
1515    }
1516    return nlohmann::json{
1517        {"parsers", parsers},
1518        {"rules", rules_},
1519        {"root", root_}
1520    };
1521}
1522
1523static common_peg_parser_variant deserialize_parser_variant(const nlohmann::json & j) {
1524    if (!j.contains("type") || !j["type"].is_string()) {
1525        throw std::runtime_error("Parser variant JSON missing or invalid 'type' field");
1526    }
1527
1528    std::string type = j["type"];
1529
1530    if (type == "epsilon") {
1531        return common_peg_epsilon_parser{};
1532    }
1533    if (type == "start") {
1534        return common_peg_start_parser{};
1535    }
1536    if (type == "end") {
1537        return common_peg_end_parser{};
1538    }
1539    if (type == "literal") {
1540        if (!j.contains("literal") || !j["literal"].is_string()) {
1541            throw std::runtime_error("literal parser missing or invalid 'literal' field");
1542        }
1543        return common_peg_literal_parser{j["literal"]};
1544    }
1545    if (type == "sequence") {
1546        if (!j.contains("children") || !j["children"].is_array()) {
1547            throw std::runtime_error("sequence parser missing or invalid 'children' field");
1548        }
1549        return common_peg_sequence_parser{j["children"].get<std::vector<common_peg_parser_id>>()};
1550    }
1551    if (type == "choice") {
1552        if (!j.contains("children") || !j["children"].is_array()) {
1553            throw std::runtime_error("choice parser missing or invalid 'children' field");
1554        }
1555        return common_peg_choice_parser{j["children"].get<std::vector<common_peg_parser_id>>()};
1556    }
1557    if (type == "repetition") {
1558        if (!j.contains("child") || !j.contains("min_count") || !j.contains("max_count")) {
1559            throw std::runtime_error("repetition parser missing required fields");
1560        }
1561        return common_peg_repetition_parser{
1562            j["child"].get<common_peg_parser_id>(),
1563            j["min_count"].get<int>(),
1564            j["max_count"].get<int>()
1565        };
1566    }
1567    if (type == "and") {
1568        if (!j.contains("child")) {
1569            throw std::runtime_error("and parser missing 'child' field");
1570        }
1571        return common_peg_and_parser{j["child"].get<common_peg_parser_id>()};
1572    }
1573    if (type == "not") {
1574        if (!j.contains("child")) {
1575            throw std::runtime_error("not parser missing 'child' field");
1576        }
1577        return common_peg_not_parser{j["child"].get<common_peg_parser_id>()};
1578    }
1579    if (type == "any") {
1580        return common_peg_any_parser{};
1581    }
1582    if (type == "space") {
1583        return common_peg_space_parser{};
1584    }
1585    if (type == "chars") {
1586        if (!j.contains("pattern") || !j.contains("ranges") || !j.contains("negated") ||
1587            !j.contains("min_count") || !j.contains("max_count")) {
1588            throw std::runtime_error("chars parser missing required fields");
1589        }
1590        common_peg_chars_parser parser;
1591        parser.pattern = j["pattern"];
1592        parser.negated = j["negated"];
1593        parser.min_count = j["min_count"];
1594        parser.max_count = j["max_count"];
1595        for (const auto & range_json : j["ranges"]) {
1596            if (!range_json.contains("start") || !range_json.contains("end")) {
1597                throw std::runtime_error("char_range missing 'start' or 'end' field");
1598            }
1599            parser.ranges.push_back({
1600                range_json["start"].get<uint32_t>(),
1601                range_json["end"].get<uint32_t>()
1602            });
1603        }
1604        return parser;
1605    }
1606    if (type == "json_string") {
1607        return common_peg_json_string_parser{};
1608    }
1609    if (type == "until") {
1610        if (!j.contains("delimiters") || !j["delimiters"].is_array()) {
1611            throw std::runtime_error("until parser missing or invalid 'delimiters' field");
1612        }
1613        return common_peg_until_parser{j["delimiters"].get<std::vector<std::string>>()};
1614    }
1615    if (type == "schema") {
1616        if (!j.contains("child") || !j.contains("name") || !j.contains("schema") || !j.contains("raw")) {
1617            throw std::runtime_error("schema parser missing required fields");
1618        }
1619        common_peg_schema_parser parser;
1620        parser.child = j["child"].get<common_peg_parser_id>();
1621        parser.name = j["name"];
1622        if (!j["schema"].is_null()) {
1623            parser.schema = std::make_shared<nlohmann::ordered_json>(j["schema"]);
1624        }
1625        parser.raw = j["raw"].get<bool>();
1626        return parser;
1627    }
1628    if (type == "rule") {
1629        if (!j.contains("name") || !j.contains("child") || !j.contains("trigger")) {
1630            throw std::runtime_error("rule parser missing required fields");
1631        }
1632        return common_peg_rule_parser{
1633            j["name"].get<std::string>(),
1634            j["child"].get<common_peg_parser_id>(),
1635            j["trigger"].get<bool>()
1636        };
1637    }
1638    if (type == "ref") {
1639        if (!j.contains("name") || !j["name"].is_string()) {
1640            throw std::runtime_error("ref parser missing or invalid 'name' field");
1641        }
1642        return common_peg_ref_parser{j["name"]};
1643    }
1644    if (type == "atomic") {
1645        if (!j.contains("child")) {
1646            throw std::runtime_error("tag parser missing required fields");
1647        }
1648        return common_peg_atomic_parser{
1649            j["child"].get<common_peg_parser_id>(),
1650        };
1651    }
1652    if (type == "tag") {
1653        if (!j.contains("child") || !j.contains("tag")) {
1654            throw std::runtime_error("tag parser missing required fields");
1655        }
1656        return common_peg_tag_parser{
1657            j["child"].get<common_peg_parser_id>(),
1658            j["tag"].get<std::string>(),
1659        };
1660    }
1661
1662    throw std::runtime_error("Unknown parser type: " + type);
1663}
1664
1665common_peg_arena common_peg_arena::from_json(const nlohmann::json & j) {
1666    if (!j.contains("parsers") || !j["parsers"].is_array()) {
1667        throw std::runtime_error("JSON missing or invalid 'parsers' array");
1668    }
1669    if (!j.contains("rules") || !j["rules"].is_object()) {
1670        throw std::runtime_error("JSON missing or invalid 'rules' object");
1671    }
1672    if (!j.contains("root")) {
1673        throw std::runtime_error("JSON missing 'root' field");
1674    }
1675
1676    common_peg_arena arena;
1677
1678    const auto & parsers_json = j["parsers"];
1679    arena.parsers_.reserve(parsers_json.size());
1680    for (const auto & parser_json : parsers_json) {
1681        arena.parsers_.push_back(deserialize_parser_variant(parser_json));
1682    }
1683
1684    arena.rules_ = j["rules"].get<std::unordered_map<std::string, common_peg_parser_id>>();
1685
1686    for (const auto & [name, id] : arena.rules_) {
1687        if (id >= arena.parsers_.size()) {
1688            throw std::runtime_error("Rule '" + name + "' references invalid parser ID: " + std::to_string(id));
1689        }
1690    }
1691
1692    arena.root_ = j["root"].get<common_peg_parser_id>();
1693    if (arena.root_ != COMMON_PEG_INVALID_PARSER_ID && arena.root_ >= arena.parsers_.size()) {
1694        throw std::runtime_error("Root references invalid parser ID: " + std::to_string(arena.root_));
1695    }
1696
1697    return arena;
1698}
1699
1700std::string common_peg_arena::save() const {
1701    return to_json().dump();
1702}
1703
1704void common_peg_arena::load(const std::string & data) {
1705    *this = from_json(nlohmann::json::parse(data));
1706}
1707
1708common_peg_arena build_peg_parser(const std::function<common_peg_parser(common_peg_parser_builder & builder)> & fn) {
1709    common_peg_parser_builder builder;
1710    builder.set_root(fn(builder));
1711    return builder.build();
1712}