From b333b06772c89d96aacb5490d6a219fba7c09cc6 Mon Sep 17 00:00:00 2001 From: Mitja Felicijan Date: Thu, 12 Feb 2026 20:57:17 +0100 Subject: Engage! --- llama.cpp/common/peg-parser.cpp | 1712 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 1712 insertions(+) create mode 100644 llama.cpp/common/peg-parser.cpp (limited to 'llama.cpp/common/peg-parser.cpp') diff --git a/llama.cpp/common/peg-parser.cpp b/llama.cpp/common/peg-parser.cpp new file mode 100644 index 0000000..f2fc845 --- /dev/null +++ b/llama.cpp/common/peg-parser.cpp @@ -0,0 +1,1712 @@ +#include "common.h" +#include "peg-parser.h" +#include "json-schema-to-grammar.h" +#include "unicode.h" + +#include + +#include +#include +#include +#include +#include +#include +#include + +// Trick to catch missing branches +template +inline constexpr bool is_always_false_v = false; + +const char * common_peg_parse_result_type_name(common_peg_parse_result_type type) { + switch (type) { + case COMMON_PEG_PARSE_RESULT_FAIL: return "fail"; + case COMMON_PEG_PARSE_RESULT_SUCCESS: return "success"; + case COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT: return "need_more_input"; + default: return "unknown"; + } +} + +static bool is_hex_digit(const char c) { + return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F'); +} + +// Trie for matching multiple literals. +// This is used in common_peg_until_parser and to build a GBNF exclusion grammar +struct trie { + struct node { + size_t depth = 0; + std::map children; + bool is_word; + }; + + std::vector nodes; + + trie(const std::vector & words) { + create_node(); // root node + for (const auto & w : words) { + insert(w); + } + } + + enum match_result { NO_MATCH, PARTIAL_MATCH, COMPLETE_MATCH }; + + // Check if a delimiter starts at the given position + match_result check_at(std::string_view sv, size_t start_pos) const { + size_t current = 0; // Start at root + size_t pos = start_pos; + + while (pos < sv.size()) { + auto it = nodes[current].children.find(sv[pos]); + if (it == nodes[current].children.end()) { + // Can't continue matching + return match_result{match_result::NO_MATCH}; + } + + current = it->second; + pos++; + + // Check if we've matched a complete word + if (nodes[current].is_word) { + return match_result{match_result::COMPLETE_MATCH}; + } + } + + // Reached end of input while still in the trie (not at root) + if (current != 0) { + // We're in the middle of a potential match + return match_result{match_result::PARTIAL_MATCH}; + } + + // Reached end at root (no match) + return match_result{match_result::NO_MATCH}; + } + + struct prefix_and_next { + std::string prefix; + std::string next_chars; + }; + + std::vector collect_prefix_and_next() { + std::string prefix; + std::vector result; + collect_prefix_and_next(0, prefix, result); + return result; + } + + private: + void collect_prefix_and_next(size_t index, std::string & prefix, std::vector & out) { + if (!nodes[index].is_word) { + if (!nodes[index].children.empty()) { + std::string chars; + chars.reserve(nodes[index].children.size()); + for (const auto & p : nodes[index].children) { + chars.push_back(p.first); + } + out.emplace_back(prefix_and_next{prefix, chars}); + } + } + + for (const auto & p : nodes[index].children) { + unsigned char ch = p.first; + auto child = p.second; + prefix.push_back(ch); + collect_prefix_and_next(child, prefix, out); + prefix.pop_back(); + } + } + + size_t create_node() { + size_t index = nodes.size(); + nodes.emplace_back(); + return index; + } + + void insert(const std::string & word) { + size_t current = 0; + for (unsigned char ch : word) { + auto it = nodes[current].children.find(ch); + if (it == nodes[current].children.end()) { + size_t child = create_node(); + nodes[child].depth = nodes[current].depth + 1; + nodes[current].children[ch] = child; + current = child; + } else { + current = it->second; + } + } + nodes[current].is_word = true; + } +}; + +static std::pair parse_hex_escape(const std::string & str, size_t pos, int hex_count) { + if (pos + hex_count > str.length()) { + return {0, 0}; + } + + uint32_t value = 0; + for (int i = 0; i < hex_count; i++) { + char c = str[pos + i]; + if (!is_hex_digit(c)) { + return {0, 0}; + } + value <<= 4; + if ('a' <= c && c <= 'f') { + value += c - 'a' + 10; + } else if ('A' <= c && c <= 'F') { + value += c - 'A' + 10; + } else if ('0' <= c && c <= '9') { + value += c - '0'; + } else { + break; + } + } + return {value, static_cast(hex_count)}; +} + +static std::pair parse_char_class_char(const std::string & content, size_t pos) { + if (content[pos] == '\\' && pos + 1 < content.length()) { + switch (content[pos + 1]) { + case 'x': { + auto result = parse_hex_escape(content, pos + 2, 2); + if (result.second > 0) { + return {result.first, 2 + result.second}; + } + // Invalid escape, treat as literal 'x' + return {static_cast('x'), 2}; + } + case 'u': { + auto result = parse_hex_escape(content, pos + 2, 4); + if (result.second > 0) { + return {result.first, 2 + result.second}; + } + // Invalid escape, treat as literal 'u' + return {static_cast('u'), 2}; + } + case 'U': { + auto result = parse_hex_escape(content, pos + 2, 8); + if (result.second > 0) { + return {result.first, 2 + result.second}; + } + // Invalid escape, treat as literal 'U' + return {static_cast('U'), 2}; + } + case 'n': return {'\n', 2}; + case 't': return {'\t', 2}; + case 'r': return {'\r', 2}; + case '\\': return {'\\', 2}; + case ']': return {']', 2}; + case '[': return {'[', 2}; + default: return {static_cast(content[pos + 1]), 2}; + } + } + + // Regular character - return as codepoint + return {static_cast(static_cast(content[pos])), 1}; +} + +static std::pair, bool> parse_char_classes(const std::string & classes) { + std::vector ranges; + bool negated = false; + + std::string content = classes; + if (content.front() == '[') { + content = content.substr(1); + } + + if (content.back() == ']') { + content.pop_back(); + } + + // Check for negation + if (!content.empty() && content.front() == '^') { + negated = true; + content = content.substr(1); + } + + size_t i = 0; + while (i < content.length()) { + auto [start, start_len] = parse_char_class_char(content, i); + i += start_len; + + if (i + 1 < content.length() && content[i] == '-') { + // Range detected + auto [end, end_len] = parse_char_class_char(content, i + 1); + ranges.push_back(common_peg_chars_parser::char_range{start, end}); + i += 1 + end_len; + } else { + ranges.push_back(common_peg_chars_parser::char_range{start, start}); + } + } + + return {ranges, negated}; +} + +void common_peg_ast_arena::visit(common_peg_ast_id id, const common_peg_ast_visitor & visitor) const { + if (id == COMMON_PEG_INVALID_AST_ID) { + return; + } + const auto & node = get(id); + visitor(node); + for (const auto & child : node.children) { + visit(child, visitor); + } +} + +void common_peg_ast_arena::visit(const common_peg_parse_result & result, const common_peg_ast_visitor & visitor) const { + for (const auto & node : result.nodes) { + visit(node, visitor); + } +} + +struct parser_executor; + +common_peg_parser_id common_peg_arena::add_parser(common_peg_parser_variant parser) { + common_peg_parser_id id = parsers_.size(); + parsers_.push_back(std::move(parser)); + return id; +} + +void common_peg_arena::add_rule(const std::string & name, common_peg_parser_id id) { + rules_[name] = id; +} + +common_peg_parser_id common_peg_arena::get_rule(const std::string & name) const { + auto it = rules_.find(name); + if (it == rules_.end()) { + throw std::runtime_error("Rule not found: " + name); + } + return it->second; +} + +struct parser_executor { + const common_peg_arena & arena; + common_peg_parse_context & ctx; + size_t start_pos; + + parser_executor(const common_peg_arena & arena, common_peg_parse_context & ctx, size_t start) + : arena(arena), ctx(ctx), start_pos(start) {} + + common_peg_parse_result operator()(const common_peg_epsilon_parser & /* p */) const { + return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos); + } + + common_peg_parse_result operator()(const common_peg_start_parser & /* p */) const { + return common_peg_parse_result( + start_pos == 0 ? COMMON_PEG_PARSE_RESULT_SUCCESS : COMMON_PEG_PARSE_RESULT_FAIL, + start_pos + ); + } + + common_peg_parse_result operator()(const common_peg_end_parser & /* p */) const { + return common_peg_parse_result( + start_pos >= ctx.input.size() ? COMMON_PEG_PARSE_RESULT_SUCCESS : COMMON_PEG_PARSE_RESULT_FAIL, + start_pos + ); + } + + common_peg_parse_result operator()(const common_peg_literal_parser & p) { + auto pos = start_pos; + for (auto i = 0u; i < p.literal.size(); ++i) { + if (pos >= ctx.input.size()) { + if (!ctx.is_partial) { + return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos); + } + return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, pos); + } + if (ctx.input[pos] != p.literal[i]) { + return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos); + } + ++pos; + } + + return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos, pos); + } + + common_peg_parse_result operator()(const common_peg_sequence_parser & p) { + auto pos = start_pos; + std::vector nodes; + + for (const auto & child_id : p.children) { + auto result = arena.parse(child_id, ctx, pos); + if (result.fail()) { + return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos, result.end); + } + + if (!result.nodes.empty()) { + nodes.insert(nodes.end(), result.nodes.begin(), result.nodes.end()); + } + + if (result.need_more_input()) { + return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, result.end, std::move(nodes)); + } + + pos = result.end; + } + + return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos, pos, std::move(nodes)); + } + + common_peg_parse_result operator()(const common_peg_choice_parser & p) { + auto pos = start_pos; + for (const auto & child_id : p.children) { + auto result = arena.parse(child_id, ctx, pos); + if (!result.fail()) { + return result; + } + } + + return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos); + } + + common_peg_parse_result operator()(const common_peg_repetition_parser & p) { + auto pos = start_pos; + int match_count = 0; + std::vector nodes; + + // Try to match up to max_count times (or unlimited if max_count is -1) + while (p.max_count == -1 || match_count < p.max_count) { + if (pos >= ctx.input.size()) { + break; + } + + auto result = arena.parse(p.child, ctx, pos); + + if (result.success()) { + // Prevent infinite loop on empty matches + if (result.end == pos) { + break; + } + + if (!result.nodes.empty()) { + nodes.insert(nodes.end(), result.nodes.begin(), result.nodes.end()); + } + + pos = result.end; + match_count++; + continue; + } + + if (result.need_more_input()) { + if (!result.nodes.empty()) { + nodes.insert(nodes.end(), result.nodes.begin(), result.nodes.end()); + } + + return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, result.end, std::move(nodes)); + } + + // Child failed - stop trying + break; + } + + // Check if we got enough matches + if (p.min_count > 0 && match_count < p.min_count) { + if (pos >= ctx.input.size() && ctx.is_partial) { + return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, pos, std::move(nodes)); + } + return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos, pos); + } + + return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos, pos, std::move(nodes)); + } + + common_peg_parse_result operator()(const common_peg_and_parser & p) { + auto result = arena.parse(p.child, ctx, start_pos); + // Pass result but don't consume input + return common_peg_parse_result(result.type, start_pos); + } + + common_peg_parse_result operator()(const common_peg_not_parser & p) { + auto result = arena.parse(p.child, ctx, start_pos); + + if (result.success()) { + // Fail if the underlying parser matches + return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos); + } + + if (result.need_more_input()) { + // Propagate - need to know what child would match before negating + return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos); + } + + // Child failed, so negation succeeds + return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos); + } + + common_peg_parse_result operator()(const common_peg_any_parser & /* p */) const { + // Parse a single UTF-8 codepoint (not just a single byte) + auto result = parse_utf8_codepoint(ctx.input, start_pos); + + if (result.status == utf8_parse_result::INCOMPLETE) { + if (!ctx.is_partial) { + return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos); + } + return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos); + } + if (result.status == utf8_parse_result::INVALID) { + return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos); + } + return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos, start_pos + result.bytes_consumed); + } + + common_peg_parse_result operator()(const common_peg_space_parser & /* p */) { + auto pos = start_pos; + while (pos < ctx.input.size()) { + auto c = static_cast(ctx.input[pos]); + if (std::isspace(c)) { + ++pos; + } else { + break; + } + } + + return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos, pos); + } + + common_peg_parse_result operator()(const common_peg_chars_parser & p) const { + auto pos = start_pos; + int match_count = 0; + + // Try to match up to max_count times (or unlimited if max_count is -1) + while (p.max_count == -1 || match_count < p.max_count) { + auto result = parse_utf8_codepoint(ctx.input, pos); + + if (result.status == utf8_parse_result::INCOMPLETE) { + if (match_count >= p.min_count) { + // We have enough matches, succeed with what we have + return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos, pos); + } + // Not enough matches yet + if (!ctx.is_partial) { + return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos); + } + return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, pos); + } + + if (result.status == utf8_parse_result::INVALID) { + // Malformed UTF-8 in input + if (match_count >= p.min_count) { + // We have enough matches, succeed up to here + return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos, pos); + } + // Not enough matches, fail + return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos); + } + + // Check if this codepoint matches our character class + bool matches = false; + for (const auto & range : p.ranges) { + if (range.contains(result.codepoint)) { + matches = true; + break; + } + } + + // If negated, invert the match result + if (p.negated) { + matches = !matches; + } + + if (matches) { + pos += result.bytes_consumed; + ++match_count; + } else { + // Character doesn't match, stop matching + break; + } + } + + // Check if we got enough matches + if (match_count < p.min_count) { + if (pos >= ctx.input.size() && ctx.is_partial) { + return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, pos); + } + return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos, pos); + } + + return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos, pos); + } + + static common_peg_parse_result handle_escape_sequence(common_peg_parse_context & ctx, size_t start, size_t & pos) { + ++pos; // consume '\' + if (pos >= ctx.input.size()) { + if (!ctx.is_partial) { + return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start); + } + return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start, pos); + } + + switch (ctx.input[pos]) { + case '"': + case '\\': + case '/': + case 'b': + case 'f': + case 'n': + case 'r': + case 't': + ++pos; + return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start, pos); + case 'u': + return handle_unicode_escape(ctx, start, pos); + default: + // Invalid escape sequence + return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start); + } + } + + static common_peg_parse_result handle_unicode_escape(common_peg_parse_context & ctx, size_t start, size_t & pos) { + ++pos; // consume 'u' + for (int i = 0; i < 4; ++i) { + if (pos >= ctx.input.size()) { + if (!ctx.is_partial) { + return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start); + } + return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start, pos); + } + if (!is_hex_digit(ctx.input[pos])) { + return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start); + } + ++pos; + } + return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start, pos); + } + + common_peg_parse_result operator()(const common_peg_json_string_parser & /* p */) { + auto pos = start_pos; + + // Parse string content (without quotes) + while (pos < ctx.input.size()) { + char c = ctx.input[pos]; + + if (c == '"') { + // Found closing quote - success (don't consume it) + return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos, pos); + } + + if (c == '\\') { + auto result = handle_escape_sequence(ctx, start_pos, pos); + if (!result.success()) { + return result; + } + } else { + auto utf8_result = parse_utf8_codepoint(ctx.input, pos); + + if (utf8_result.status == utf8_parse_result::INCOMPLETE) { + if (!ctx.is_partial) { + return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos); + } + return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, pos); + } + + if (utf8_result.status == utf8_parse_result::INVALID) { + return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos); + } + + pos += utf8_result.bytes_consumed; + } + } + + // Reached end without finding closing quote + if (!ctx.is_partial) { + return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos, pos); + } + return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, pos); + } + + common_peg_parse_result operator()(const common_peg_until_parser & p) const { + trie matcher(p.delimiters); + + // Scan input and check for delimiters + size_t pos = start_pos; + size_t last_valid_pos = start_pos; + + while (pos < ctx.input.size()) { + auto utf8_result = parse_utf8_codepoint(ctx.input, pos); + + if (utf8_result.status == utf8_parse_result::INCOMPLETE) { + // Incomplete UTF-8 sequence + if (!ctx.is_partial) { + // Input is complete but UTF-8 is incomplete = malformed + return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos); + } + // Return what we have so far (before incomplete sequence) + return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, last_valid_pos); + } + + if (utf8_result.status == utf8_parse_result::INVALID) { + // Malformed UTF-8 + return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos); + } + + // Check if a delimiter starts at this position + auto match = matcher.check_at(ctx.input, pos); + + if (match == trie::COMPLETE_MATCH) { + // Found a complete delimiter, return everything before it + return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos, pos); + } + + if (match == trie::PARTIAL_MATCH) { + // Found a partial match extending to end of input, return everything before it + return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos, pos); + } + + pos += utf8_result.bytes_consumed; + last_valid_pos = pos; + } + + if (last_valid_pos == ctx.input.size() && ctx.is_partial) { + // Reached the end of a partial stream, there might still be more input that we need to consume. + return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, last_valid_pos); + } + return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos, last_valid_pos); + } + + common_peg_parse_result operator()(const common_peg_schema_parser & p) { + return arena.parse(p.child, ctx, start_pos); + } + + common_peg_parse_result operator()(const common_peg_rule_parser & p) { + // Parse the child + auto result = arena.parse(p.child, ctx, start_pos); + + if (!result.fail()) { + std::string_view text; + if (result.start < ctx.input.size()) { + text = std::string_view(ctx.input).substr(result.start, result.end - result.start); + } + + auto node_id = ctx.ast.add_node( + p.name, + "", + result.start, + result.end, + text, + std::move(result.nodes), + result.need_more_input() + ); + + return common_peg_parse_result(result.type, result.start, result.end, { node_id }); + } + + return result; + } + + common_peg_parse_result operator()(const common_peg_tag_parser & p) { + // Parse the child + auto result = arena.parse(p.child, ctx, start_pos); + + if (!result.fail()) { + std::string_view text; + if (result.start < ctx.input.size()) { + text = std::string_view(ctx.input).substr(result.start, result.end - result.start); + } + + auto node_id = ctx.ast.add_node( + "", + p.tag, + result.start, + result.end, + text, + std::move(result.nodes), + result.need_more_input() + ); + + return common_peg_parse_result(result.type, result.start, result.end, { node_id }); + } + + return result; + } + + common_peg_parse_result operator()(const common_peg_ref_parser & p) { + auto rule_id = arena.get_rule(p.name); + return arena.parse(rule_id, ctx, start_pos); + } + + common_peg_parse_result operator()(const common_peg_atomic_parser & p) { + auto result = arena.parse(p.child, ctx, start_pos); + if (result.need_more_input()) { + // Clear nodes so they don't propagate up. + result.nodes.clear(); + } + return result; + } +}; + +common_peg_parse_result common_peg_arena::parse(common_peg_parse_context & ctx, size_t start) const { + if (root_ == COMMON_PEG_INVALID_PARSER_ID) { + throw std::runtime_error("No root parser set"); + } + return parse(root_, ctx, start); +} + +common_peg_parse_result common_peg_arena::parse(common_peg_parser_id id, common_peg_parse_context & ctx, size_t start) const { + // Execute parser + const auto & parser = parsers_.at(id); + parser_executor exec(*this, ctx, start); + return std::visit(exec, parser); +} + +common_peg_parser_id common_peg_arena::resolve_ref(common_peg_parser_id id) { + const auto & parser = parsers_.at(id); + if (auto ref = std::get_if(&parser)) { + return get_rule(ref->name); + } + return id; +} + +void common_peg_arena::resolve_refs() { + // Walk through all parsers and replace refs with their corresponding rule IDs + for (auto & parser : parsers_) { + std::visit([this](auto & p) { + using T = std::decay_t; + + if constexpr (std::is_same_v) { + for (auto & child : p.children) { + child = resolve_ref(child); + } + } else if constexpr (std::is_same_v) { + for (auto & child : p.children) { + child = resolve_ref(child); + } + } else if constexpr (std::is_same_v || + std::is_same_v || + std::is_same_v || + std::is_same_v || + std::is_same_v) { + p.child = resolve_ref(p.child); + } else if constexpr (std::is_same_v) { + p.child = resolve_ref(p.child); + } else if constexpr (std::is_same_v) { + p.child = resolve_ref(p.child); + } else if constexpr (std::is_same_v || + std::is_same_v || + std::is_same_v || + std::is_same_v || + std::is_same_v || + std::is_same_v || + std::is_same_v || + std::is_same_v || + std::is_same_v || + std::is_same_v) { + // These rules do not have children + } else { + static_assert(is_always_false_v); + } + }, parser); + } + + // Also flatten root if it's a ref + if (root_ != COMMON_PEG_INVALID_PARSER_ID) { + root_ = resolve_ref(root_); + } +} + +std::string common_peg_arena::dump(common_peg_parser_id id) const { + const auto & parser = parsers_.at(id); + + return std::visit([this](const auto & p) -> std::string { + using T = std::decay_t; + + if constexpr (std::is_same_v) { + return "Epsilon"; + } else if constexpr (std::is_same_v) { + return "Start"; + } else if constexpr (std::is_same_v) { + return "End"; + } else if constexpr (std::is_same_v) { + return "Literal(" + p.literal + ")"; + } else if constexpr (std::is_same_v) { + std::vector parts; + for (const auto & child : p.children) { + parts.push_back(dump(child)); + } + return "Sequence(" + string_join(parts, ", ") + ")"; + } else if constexpr (std::is_same_v) { + std::vector parts; + for (const auto & child : p.children) { + parts.push_back(dump(child)); + } + return "Choice(" + string_join(parts, ", ") + ")"; + } else if constexpr (std::is_same_v) { + if (p.max_count == -1) { + return "Repetition(" + dump(p.child) + ", " + std::to_string(p.min_count) + ", unbounded)"; + } + return "Repetition(" + dump(p.child) + ", " + std::to_string(p.min_count) + ", " + std::to_string(p.max_count) + ")"; + } else if constexpr (std::is_same_v) { + return "And(" + dump(p.child) + ")"; + } else if constexpr (std::is_same_v) { + return "Not(" + dump(p.child) + ")"; + } else if constexpr (std::is_same_v) { + return "Any"; + } else if constexpr (std::is_same_v) { + return "Space"; + } else if constexpr (std::is_same_v) { + if (p.max_count == -1) { + return "CharRepeat(" + p.pattern + ", " + std::to_string(p.min_count) + ", unbounded)"; + } + return "CharRepeat(" + p.pattern + ", " + std::to_string(p.min_count) + ", " + std::to_string(p.max_count) + ")"; + } else if constexpr (std::is_same_v) { + return "JsonString()"; + } else if constexpr (std::is_same_v) { + return "Until(" + string_join(p.delimiters, " | ") + ")"; + } else if constexpr (std::is_same_v) { + return "Schema(" + dump(p.child) + ", " + (p.schema ? p.schema->dump() : "null") + ")"; + } else if constexpr (std::is_same_v) { + return "Rule(" + p.name + ", " + dump(p.child) + ")"; + } else if constexpr (std::is_same_v) { + return "Ref(" + p.name + ")"; + } else { + return "Unknown"; + } + }, parser); +} + +common_peg_parser & common_peg_parser::operator=(const common_peg_parser & other) { + id_ = other.id_; + return *this; +} + +common_peg_parser & common_peg_parser::operator+=(const common_peg_parser & other) { + id_ = builder_.sequence({id_, other.id_}); + return *this; +} + +common_peg_parser & common_peg_parser::operator|=(const common_peg_parser & other) { + id_ = builder_.choice({id_, other.id_}); + return *this; +} + +common_peg_parser common_peg_parser::operator+(const common_peg_parser & other) const { + return builder_.sequence({id_, other.id_}); +} + +common_peg_parser common_peg_parser::operator|(const common_peg_parser & other) const { + return builder_.choice({id_, other.id_}); +} + +common_peg_parser common_peg_parser::operator<<(const common_peg_parser & other) const { + return builder_.sequence({id_, builder_.space(), other.id_}); +} + +common_peg_parser common_peg_parser::operator+(const char * str) const { + return *this + builder_.literal(str); +} + +common_peg_parser common_peg_parser::operator+(const std::string & str) const { + return *this + builder_.literal(str); +} + +common_peg_parser common_peg_parser::operator<<(const char * str) const { + return *this << builder_.literal(str); +} + +common_peg_parser common_peg_parser::operator<<(const std::string & str) const { + return *this << builder_.literal(str); +} + +common_peg_parser common_peg_parser::operator|(const char * str) const { + return *this | builder_.literal(str); +} + +common_peg_parser common_peg_parser::operator|(const std::string & str) const { + return *this | builder_.literal(str); +} + +common_peg_parser operator+(const char * str, const common_peg_parser & p) { + return p.builder().literal(str) + p; +} + +common_peg_parser operator+(const std::string & str, const common_peg_parser & p) { + return operator+(str.c_str(), p); +} + +common_peg_parser operator<<(const char * str, const common_peg_parser & p) { + return p.builder().literal(str) << p; +} + +common_peg_parser operator<<(const std::string & str, const common_peg_parser & p) { + return operator<<(str.c_str(), p); +} + +common_peg_parser operator|(const char * str, const common_peg_parser & p) { + return p.builder().literal(str) | p; +} + +common_peg_parser operator|(const std::string & str, const common_peg_parser & p) { + return operator|(str.c_str(), p); +} + +static std::string rule_name(const std::string & name) { + static const std::regex invalid_rule_chars_re("[^a-zA-Z0-9-]+"); + return std::regex_replace(name, invalid_rule_chars_re, "-"); +} + +common_peg_parser_builder::common_peg_parser_builder() {} + +common_peg_parser common_peg_parser_builder::sequence(const std::vector & parsers) { + // Flatten nested sequences + std::vector flattened; + for (const auto & p : parsers) { + const auto & parser = arena_.get(p); + if (auto seq = std::get_if(&parser)) { + flattened.insert(flattened.end(), seq->children.begin(), seq->children.end()); + } else { + flattened.push_back(p); + } + } + return wrap(arena_.add_parser(common_peg_sequence_parser{flattened})); +} + +common_peg_parser common_peg_parser_builder::sequence(const std::vector & parsers) { + std::vector ids; + ids.reserve(parsers.size()); + for (const auto & p : parsers) { + ids.push_back(p.id()); + } + return sequence(ids); +} + +common_peg_parser common_peg_parser_builder::sequence(std::initializer_list parsers) { + std::vector ids; + ids.reserve(parsers.size()); + for (const auto & p : parsers) { + ids.push_back(p.id()); + } + return sequence(ids); +} + +common_peg_parser common_peg_parser_builder::choice(const std::vector & parsers) { + // Flatten nested choices + std::vector flattened; + for (const auto & p : parsers) { + const auto & parser = arena_.get(p); + if (auto choice = std::get_if(&parser)) { + flattened.insert(flattened.end(), choice->children.begin(), choice->children.end()); + } else { + flattened.push_back(p); + } + } + return wrap(arena_.add_parser(common_peg_choice_parser{flattened})); +} + +common_peg_parser common_peg_parser_builder::choice(const std::vector & parsers) { + std::vector ids; + ids.reserve(parsers.size()); + for (const auto & p : parsers) { + ids.push_back(p.id()); + } + return choice(ids); +} + +common_peg_parser common_peg_parser_builder::choice(std::initializer_list parsers) { + std::vector ids; + ids.reserve(parsers.size()); + for (const auto & p : parsers) { + ids.push_back(p.id()); + } + return choice(ids); +} + +common_peg_parser common_peg_parser_builder::chars(const std::string & classes, int min, int max) { + auto [ranges, negated] = parse_char_classes(classes); + return wrap(arena_.add_parser(common_peg_chars_parser{classes, ranges, negated, min, max})); +} + +common_peg_parser common_peg_parser_builder::schema(const common_peg_parser & p, const std::string & name, const nlohmann::ordered_json & schema, bool raw) { + return wrap(arena_.add_parser(common_peg_schema_parser{p.id(), name, std::make_shared(schema), raw})); +} + +common_peg_parser common_peg_parser_builder::rule(const std::string & name, const common_peg_parser & p, bool trigger) { + auto clean_name = rule_name(name); + auto rule_id = arena_.add_parser(common_peg_rule_parser{clean_name, p.id(), trigger}); + arena_.add_rule(clean_name, rule_id); + return ref(clean_name); +} + +common_peg_parser common_peg_parser_builder::rule(const std::string & name, const std::function & builder_fn, bool trigger) { + auto clean_name = rule_name(name); + if (arena_.has_rule(clean_name)) { + return ref(clean_name); + } + + // Create placeholder rule to allow recursive references + auto placeholder = any(); // Temporary placeholder + auto placeholder_rule_id = arena_.add_parser(common_peg_rule_parser{clean_name, placeholder.id(), trigger}); + arena_.add_rule(clean_name, placeholder_rule_id); + + // Build the actual parser + auto parser = builder_fn(); + + // Replace placeholder with actual rule + auto rule_id = arena_.add_parser(common_peg_rule_parser{clean_name, parser.id(), trigger}); + arena_.rules_[clean_name] = rule_id; + + return ref(clean_name); +} + +void common_peg_parser_builder::set_root(const common_peg_parser & p) { + arena_.set_root(p.id()); +} + +common_peg_arena common_peg_parser_builder::build() { + arena_.resolve_refs(); + return std::move(arena_); +} + +// JSON parsers +common_peg_parser common_peg_parser_builder::json_number() { + return rule("json-number", [this]() { + auto digit1_9 = chars("[1-9]", 1, 1); + auto digits = chars("[0-9]"); + auto int_part = choice({literal("0"), sequence({digit1_9, chars("[0-9]", 0, -1)})}); + auto frac = sequence({literal("."), digits}); + auto exp = sequence({choice({literal("e"), literal("E")}), optional(chars("[+-]", 1, 1)), digits}); + return sequence({optional(literal("-")), int_part, optional(frac), optional(exp), space()}); + }); +} + +common_peg_parser common_peg_parser_builder::json_string() { + return rule("json-string", [this]() { + return sequence({literal("\""), json_string_content(), literal("\""), space()}); + }); +} + +common_peg_parser common_peg_parser_builder::json_bool() { + return rule("json-bool", [this]() { + return sequence({choice({literal("true"), literal("false")}), space()}); + }); +} + +common_peg_parser common_peg_parser_builder::json_null() { + return rule("json-null", [this]() { + return sequence({literal("null"), space()}); + }); +} + +common_peg_parser common_peg_parser_builder::json_object() { + return rule("json-object", [this]() { + auto ws = space(); + auto member = sequence({json_string(), ws, literal(":"), ws, json()}); + auto members = sequence({member, zero_or_more(sequence({ws, literal(","), ws, member}))}); + return sequence({ + literal("{"), + ws, + choice({ + literal("}"), + sequence({members, ws, literal("}")}) + }), + ws + }); + }); +} + +common_peg_parser common_peg_parser_builder::json_array() { + return rule("json-array", [this]() { + auto ws = space(); + auto elements = sequence({json(), zero_or_more(sequence({literal(","), ws, json()}))}); + return sequence({ + literal("["), + ws, + choice({ + literal("]"), + sequence({elements, ws, literal("]")}) + }), + ws + }); + }); +} + +common_peg_parser common_peg_parser_builder::json() { + return rule("json-value", [this]() { + return choice({ + json_object(), + json_array(), + json_string(), + json_number(), + json_bool(), + json_null() + }); + }); +} + +common_peg_parser common_peg_parser_builder::json_string_content() { + return wrap(arena_.add_parser(common_peg_json_string_parser{})); +} + +common_peg_parser common_peg_parser_builder::json_member(const std::string & key, const common_peg_parser & p) { + auto ws = space(); + return sequence({ + literal("\"" + key + "\""), + ws, + literal(":"), + ws, + p, + }); +} + + +static std::string gbnf_escape_char_class(char c) { + switch (c) { + case '\n': return "\\n"; + case '\t': return "\\t"; + case '\r': return "\\r"; + case '\\': return "\\\\"; + case ']': return "\\]"; + case '[': return "\\["; + default: return std::string(1, c); + } +} + +static std::string gbnf_excluding_pattern(const std::vector & strings) { + trie matcher(strings); + auto pieces = matcher.collect_prefix_and_next(); + + std::string pattern; + for (size_t i = 0; i < pieces.size(); ++i) { + if (i > 0) { + pattern += " | "; + } + + const auto & pre = pieces[i].prefix; + const auto & chars = pieces[i].next_chars; + + std::string cls; + cls.reserve(chars.size()); + for (const auto & ch : chars) { + cls += gbnf_escape_char_class(ch); + } + + if (!pre.empty()) { + pattern += gbnf_format_literal(pre) + " [^" + cls + "]"; + } else { + pattern += "[^" + cls + "]"; + } + } + + return "(" + pattern + ")*"; +} + +static std::unordered_set collect_reachable_rules( + const common_peg_arena & arena, + const common_peg_parser_id & rule +) { + std::unordered_set reachable; + std::unordered_set visited; + + std::function visit = [&](common_peg_parser_id id) { + const auto & parser = arena.get(id); + + std::visit([&](const auto & p) { + using T = std::decay_t; + + if constexpr (std::is_same_v || + std::is_same_v || + std::is_same_v || + std::is_same_v || + std::is_same_v || + std::is_same_v || + std::is_same_v || + std::is_same_v || + std::is_same_v) { + // These parsers do not have any children + } else if constexpr (std::is_same_v) { + for (auto child : p.children) { + visit(child); + } + } else if constexpr (std::is_same_v) { + for (auto child : p.children) { + visit(child); + } + } else if constexpr (std::is_same_v || + std::is_same_v || + std::is_same_v || + std::is_same_v || + std::is_same_v || + std::is_same_v) { + visit(p.child); + } else if constexpr (std::is_same_v) { + if (visited.find(p.name) == visited.end()) { + visited.insert(p.name); + reachable.insert(p.name); + visit(p.child); + } + } else if constexpr (std::is_same_v) { + // Traverse rules so we pick up everything + auto referenced_rule = arena.get_rule(p.name); + visit(referenced_rule); + } else { + static_assert(is_always_false_v); + } + }, parser); + }; + + visit(rule); + return reachable; +} + +// GBNF generation implementation +void common_peg_arena::build_grammar(const common_grammar_builder & builder, bool lazy) const { + // Generate GBNF for a parser + std::function to_gbnf = [&](common_peg_parser_id id) -> std::string { + const auto & parser = parsers_.at(id); + + return std::visit([&](const auto & p) -> std::string { + using T = std::decay_t; + + if constexpr (std::is_same_v || + std::is_same_v || + std::is_same_v) { + return ""; + } else if constexpr (std::is_same_v) { + return gbnf_format_literal(p.literal); + } else if constexpr (std::is_same_v) { + std::string s; + for (const auto & child : p.children) { + if (!s.empty()) { + s += " "; + } + auto child_gbnf = to_gbnf(child); + const auto & child_parser = parsers_.at(child); + if (std::holds_alternative(child_parser) || + std::holds_alternative(child_parser)) { + s += "(" + child_gbnf + ")"; + } else { + s += child_gbnf; + } + } + return s; + } else if constexpr (std::is_same_v) { + std::string s; + for (const auto & child : p.children) { + if (!s.empty()) { + s += " | "; + } + auto child_gbnf = to_gbnf(child); + const auto & child_parser = parsers_.at(child); + if (std::holds_alternative(child_parser)) { + s += "(" + child_gbnf + ")"; + } else { + s += child_gbnf; + } + } + return s; + } else if constexpr (std::is_same_v) { + auto child_gbnf = to_gbnf(p.child); + const auto & child_parser = parsers_.at(p.child); + if (std::holds_alternative(child_parser) || + std::holds_alternative(child_parser)) { + child_gbnf = "(" + child_gbnf + ")"; + } + if (p.min_count == 0 && p.max_count == 1) { + return child_gbnf + "?"; + } + if (p.min_count == 0 && p.max_count == -1) { + return child_gbnf + "*"; + } + if (p.min_count == 1 && p.max_count == -1) { + return child_gbnf + "+"; + } + if (p.max_count == -1) { + return child_gbnf + "{" + std::to_string(p.min_count) + ",}"; + } + if (p.min_count == p.max_count) { + if (p.min_count == 1) { + return child_gbnf; + } + return child_gbnf + "{" + std::to_string(p.min_count) + "}"; + } + return child_gbnf + "{" + std::to_string(p.min_count) + "," + std::to_string(p.max_count) + "}"; + } else if constexpr (std::is_same_v || std::is_same_v) { + return ""; // Lookahead not supported in GBNF + } else if constexpr (std::is_same_v) { + return "."; + } else if constexpr (std::is_same_v) { + return "space"; + } else if constexpr (std::is_same_v) { + std::string result = p.pattern; + if (p.min_count == 0 && p.max_count == 1) { + return result + "?"; + } + if (p.min_count == 0 && p.max_count == -1) { + return result + "*"; + } + if (p.min_count == 1 && p.max_count == -1) { + return result + "+"; + } + if (p.max_count == -1) { + return result + "{" + std::to_string(p.min_count) + ",}"; + } + if (p.min_count == p.max_count) { + if (p.min_count == 1) { + return result; + } + return result + "{" + std::to_string(p.min_count) + "}"; + } + return result + "{" + std::to_string(p.min_count) + "," + std::to_string(p.max_count) + "}"; + } else if constexpr (std::is_same_v) { + return R"(( [^"\\] | "\\" ( ["\\/ bfnrt] | "u" [0-9a-fA-F]{4} ) )*)"; + } else if constexpr (std::is_same_v) { + if (p.delimiters.empty()) { + return ".*"; + } + return gbnf_excluding_pattern(p.delimiters); + } else if constexpr (std::is_same_v) { + if (p.schema) { + if (p.raw && p.schema->contains("type") && p.schema->at("type").is_string() && p.schema->at("type") == "string") { + // TODO: Implement more comprehensive grammar generation for raw strings. + // For now, use the grammar emitted from the underlying parser. + return to_gbnf(p.child); + } + return builder.add_schema(p.name, *p.schema); + } + return to_gbnf(p.child); + } else if constexpr (std::is_same_v) { + return p.name; + } else if constexpr (std::is_same_v) { + // Refs should not exist after flattening, but kept just in case + return p.name; + } else if constexpr (std::is_same_v) { + return to_gbnf(p.child); + } else if constexpr (std::is_same_v) { + return to_gbnf(p.child); + } else { + static_assert(is_always_false_v); + } + }, parser); + }; + + // Collect reachable rules + std::unordered_set reachable_rules; + + if (lazy) { + // Collect rules reachable from trigger rules + for (const auto & [name, id] : rules_) { + const auto & parser = parsers_.at(id); + if (auto rule = std::get_if(&parser)) { + if (rule->trigger) { + // Mark trigger as reachable and visit it + reachable_rules.insert(name); + auto add_rules = collect_reachable_rules(*this, id); + reachable_rules.insert(add_rules.begin(), add_rules.end()); + } + } + } + } else { + // Collect rules reachable from root + reachable_rules = collect_reachable_rules(*this, root_); + } + + // Create GBNF rules for all reachable rules + for (const auto & [name, rule_id] : rules_) { + if (reachable_rules.find(name) == reachable_rules.end()) { + continue; + } + + const auto & parser = parsers_.at(rule_id); + if (auto rule = std::get_if(&parser)) { + builder.add_rule(rule->name, to_gbnf(rule->child)); + } + } + + if (lazy) { + // Generate root rule from trigger rules only + std::vector trigger_names; + for (const auto & [name, rule_id] : rules_) { + const auto & parser = parsers_.at(rule_id); + if (auto rule = std::get_if(&parser)) { + if (rule->trigger) { + trigger_names.push_back(rule->name); + } + } + } + + // Sort for predictable order + std::sort(trigger_names.begin(), trigger_names.end()); + builder.add_rule("root", string_join(trigger_names, " | ")); + } else if (root_ != COMMON_PEG_INVALID_PARSER_ID) { + builder.add_rule("root", to_gbnf(root_)); + } +} + +static nlohmann::json serialize_parser_variant(const common_peg_parser_variant & variant) { + using json = nlohmann::json; + + return std::visit([](const auto & p) -> json { + using T = std::decay_t; + + if constexpr (std::is_same_v) { + return json{{"type", "epsilon"}}; + } else if constexpr (std::is_same_v) { + return json{{"type", "start"}}; + } else if constexpr (std::is_same_v) { + return json{{"type", "end"}}; + } else if constexpr (std::is_same_v) { + return json{{"type", "literal"}, {"literal", p.literal}}; + } else if constexpr (std::is_same_v) { + return json{{"type", "sequence"}, {"children", p.children}}; + } else if constexpr (std::is_same_v) { + return json{{"type", "choice"}, {"children", p.children}}; + } else if constexpr (std::is_same_v) { + return json{ + {"type", "repetition"}, + {"child", p.child}, + {"min_count", p.min_count}, + {"max_count", p.max_count} + }; + } else if constexpr (std::is_same_v) { + return json{{"type", "and"}, {"child", p.child}}; + } else if constexpr (std::is_same_v) { + return json{{"type", "not"}, {"child", p.child}}; + } else if constexpr (std::is_same_v) { + return json{{"type", "any"}}; + } else if constexpr (std::is_same_v) { + return json{{"type", "space"}}; + } else if constexpr (std::is_same_v) { + json ranges = json::array(); + for (const auto & range : p.ranges) { + ranges.push_back({{"start", range.start}, {"end", range.end}}); + } + return json{ + {"type", "chars"}, + {"pattern", p.pattern}, + {"ranges", ranges}, + {"negated", p.negated}, + {"min_count", p.min_count}, + {"max_count", p.max_count} + }; + } else if constexpr (std::is_same_v) { + return json{{"type", "json_string"}}; + } else if constexpr (std::is_same_v) { + return json{{"type", "until"}, {"delimiters", p.delimiters}}; + } else if constexpr (std::is_same_v) { + return json{ + {"type", "schema"}, + {"child", p.child}, + {"name", p.name}, + {"schema", p.schema ? *p.schema : nullptr}, + {"raw", p.raw} + }; + } else if constexpr (std::is_same_v) { + return json{ + {"type", "rule"}, + {"name", p.name}, + {"child", p.child}, + {"trigger", p.trigger} + }; + } else if constexpr (std::is_same_v) { + return json{{"type", "ref"}, {"name", p.name}}; + } else if constexpr (std::is_same_v) { + return json{{"type", "atomic"}, {"child", p.child}}; + } else if constexpr (std::is_same_v) { + return json{ + {"type", "tag"}, + {"child", p.child}, + {"tag", p.tag} + }; + } + }, variant); +} + +nlohmann::json common_peg_arena::to_json() const { + auto parsers = nlohmann::json::array(); + for (const auto & parser : parsers_) { + parsers.push_back(serialize_parser_variant(parser)); + } + return nlohmann::json{ + {"parsers", parsers}, + {"rules", rules_}, + {"root", root_} + }; +} + +static common_peg_parser_variant deserialize_parser_variant(const nlohmann::json & j) { + if (!j.contains("type") || !j["type"].is_string()) { + throw std::runtime_error("Parser variant JSON missing or invalid 'type' field"); + } + + std::string type = j["type"]; + + if (type == "epsilon") { + return common_peg_epsilon_parser{}; + } + if (type == "start") { + return common_peg_start_parser{}; + } + if (type == "end") { + return common_peg_end_parser{}; + } + if (type == "literal") { + if (!j.contains("literal") || !j["literal"].is_string()) { + throw std::runtime_error("literal parser missing or invalid 'literal' field"); + } + return common_peg_literal_parser{j["literal"]}; + } + if (type == "sequence") { + if (!j.contains("children") || !j["children"].is_array()) { + throw std::runtime_error("sequence parser missing or invalid 'children' field"); + } + return common_peg_sequence_parser{j["children"].get>()}; + } + if (type == "choice") { + if (!j.contains("children") || !j["children"].is_array()) { + throw std::runtime_error("choice parser missing or invalid 'children' field"); + } + return common_peg_choice_parser{j["children"].get>()}; + } + if (type == "repetition") { + if (!j.contains("child") || !j.contains("min_count") || !j.contains("max_count")) { + throw std::runtime_error("repetition parser missing required fields"); + } + return common_peg_repetition_parser{ + j["child"].get(), + j["min_count"].get(), + j["max_count"].get() + }; + } + if (type == "and") { + if (!j.contains("child")) { + throw std::runtime_error("and parser missing 'child' field"); + } + return common_peg_and_parser{j["child"].get()}; + } + if (type == "not") { + if (!j.contains("child")) { + throw std::runtime_error("not parser missing 'child' field"); + } + return common_peg_not_parser{j["child"].get()}; + } + if (type == "any") { + return common_peg_any_parser{}; + } + if (type == "space") { + return common_peg_space_parser{}; + } + if (type == "chars") { + if (!j.contains("pattern") || !j.contains("ranges") || !j.contains("negated") || + !j.contains("min_count") || !j.contains("max_count")) { + throw std::runtime_error("chars parser missing required fields"); + } + common_peg_chars_parser parser; + parser.pattern = j["pattern"]; + parser.negated = j["negated"]; + parser.min_count = j["min_count"]; + parser.max_count = j["max_count"]; + for (const auto & range_json : j["ranges"]) { + if (!range_json.contains("start") || !range_json.contains("end")) { + throw std::runtime_error("char_range missing 'start' or 'end' field"); + } + parser.ranges.push_back({ + range_json["start"].get(), + range_json["end"].get() + }); + } + return parser; + } + if (type == "json_string") { + return common_peg_json_string_parser{}; + } + if (type == "until") { + if (!j.contains("delimiters") || !j["delimiters"].is_array()) { + throw std::runtime_error("until parser missing or invalid 'delimiters' field"); + } + return common_peg_until_parser{j["delimiters"].get>()}; + } + if (type == "schema") { + if (!j.contains("child") || !j.contains("name") || !j.contains("schema") || !j.contains("raw")) { + throw std::runtime_error("schema parser missing required fields"); + } + common_peg_schema_parser parser; + parser.child = j["child"].get(); + parser.name = j["name"]; + if (!j["schema"].is_null()) { + parser.schema = std::make_shared(j["schema"]); + } + parser.raw = j["raw"].get(); + return parser; + } + if (type == "rule") { + if (!j.contains("name") || !j.contains("child") || !j.contains("trigger")) { + throw std::runtime_error("rule parser missing required fields"); + } + return common_peg_rule_parser{ + j["name"].get(), + j["child"].get(), + j["trigger"].get() + }; + } + if (type == "ref") { + if (!j.contains("name") || !j["name"].is_string()) { + throw std::runtime_error("ref parser missing or invalid 'name' field"); + } + return common_peg_ref_parser{j["name"]}; + } + if (type == "atomic") { + if (!j.contains("child")) { + throw std::runtime_error("tag parser missing required fields"); + } + return common_peg_atomic_parser{ + j["child"].get(), + }; + } + if (type == "tag") { + if (!j.contains("child") || !j.contains("tag")) { + throw std::runtime_error("tag parser missing required fields"); + } + return common_peg_tag_parser{ + j["child"].get(), + j["tag"].get(), + }; + } + + throw std::runtime_error("Unknown parser type: " + type); +} + +common_peg_arena common_peg_arena::from_json(const nlohmann::json & j) { + if (!j.contains("parsers") || !j["parsers"].is_array()) { + throw std::runtime_error("JSON missing or invalid 'parsers' array"); + } + if (!j.contains("rules") || !j["rules"].is_object()) { + throw std::runtime_error("JSON missing or invalid 'rules' object"); + } + if (!j.contains("root")) { + throw std::runtime_error("JSON missing 'root' field"); + } + + common_peg_arena arena; + + const auto & parsers_json = j["parsers"]; + arena.parsers_.reserve(parsers_json.size()); + for (const auto & parser_json : parsers_json) { + arena.parsers_.push_back(deserialize_parser_variant(parser_json)); + } + + arena.rules_ = j["rules"].get>(); + + for (const auto & [name, id] : arena.rules_) { + if (id >= arena.parsers_.size()) { + throw std::runtime_error("Rule '" + name + "' references invalid parser ID: " + std::to_string(id)); + } + } + + arena.root_ = j["root"].get(); + if (arena.root_ != COMMON_PEG_INVALID_PARSER_ID && arena.root_ >= arena.parsers_.size()) { + throw std::runtime_error("Root references invalid parser ID: " + std::to_string(arena.root_)); + } + + return arena; +} + +std::string common_peg_arena::save() const { + return to_json().dump(); +} + +void common_peg_arena::load(const std::string & data) { + *this = from_json(nlohmann::json::parse(data)); +} + +common_peg_arena build_peg_parser(const std::function & fn) { + common_peg_parser_builder builder; + builder.set_root(fn(builder)); + return builder.build(); +} -- cgit v1.2.3