1#pragma once
  2
  3#include <nlohmann/json_fwd.hpp>
  4
  5#include <memory>
  6#include <unordered_map>
  7#include <string>
  8#include <string_view>
  9#include <functional>
 10#include <vector>
 11#include <variant>
 12
 13struct common_grammar_builder;
 14
 15class common_peg_parser_builder;
 16
 17using common_peg_parser_id = size_t;
 18constexpr common_peg_parser_id COMMON_PEG_INVALID_PARSER_ID = static_cast<common_peg_parser_id>(-1);
 19
 20using common_peg_ast_id = size_t;
 21constexpr common_peg_ast_id COMMON_PEG_INVALID_AST_ID = static_cast<common_peg_ast_id>(-1);
 22
 23// Lightweight wrapper around common_peg_parser_id for convenience
 24class common_peg_parser {
 25    common_peg_parser_id id_;
 26    common_peg_parser_builder & builder_;
 27
 28  public:
 29    common_peg_parser(const common_peg_parser & other) : id_(other.id_), builder_(other.builder_) {}
 30    common_peg_parser(common_peg_parser_id id, common_peg_parser_builder & builder) : id_(id), builder_(builder) {}
 31
 32    common_peg_parser & operator=(const common_peg_parser & other);
 33    common_peg_parser & operator+=(const common_peg_parser & other);
 34    common_peg_parser & operator|=(const common_peg_parser & other);
 35
 36    operator common_peg_parser_id() const { return id_; }
 37    common_peg_parser_id id() const { return id_; }
 38
 39    common_peg_parser_builder & builder() const { return builder_; }
 40
 41    // Creates a sequence
 42    common_peg_parser operator+(const common_peg_parser & other) const;
 43
 44    // Creates a sequence separated by spaces.
 45    common_peg_parser operator<<(const common_peg_parser & other) const;
 46
 47    // Creates a choice
 48    common_peg_parser operator|(const common_peg_parser & other) const;
 49
 50    common_peg_parser operator+(const char * str) const;
 51    common_peg_parser operator+(const std::string & str) const;
 52    common_peg_parser operator<<(const char * str) const;
 53    common_peg_parser operator<<(const std::string & str) const;
 54    common_peg_parser operator|(const char * str) const;
 55    common_peg_parser operator|(const std::string & str) const;
 56};
 57
 58common_peg_parser operator+(const char * str, const common_peg_parser & p);
 59common_peg_parser operator+(const std::string & str, const common_peg_parser & p);
 60common_peg_parser operator<<(const char * str, const common_peg_parser & p);
 61common_peg_parser operator<<(const std::string & str, const common_peg_parser & p);
 62common_peg_parser operator|(const char * str, const common_peg_parser & p);
 63common_peg_parser operator|(const std::string & str, const common_peg_parser & p);
 64
 65enum common_peg_parse_result_type {
 66    COMMON_PEG_PARSE_RESULT_FAIL            = 0,
 67    COMMON_PEG_PARSE_RESULT_SUCCESS         = 1,
 68    COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT = 2,
 69};
 70
 71const char * common_peg_parse_result_type_name(common_peg_parse_result_type type);
 72
 73struct common_peg_ast_node {
 74    common_peg_ast_id id;
 75    std::string rule;
 76    std::string tag;
 77    size_t start;
 78    size_t end;
 79    std::string_view text;
 80    std::vector<common_peg_ast_id> children;
 81
 82    bool is_partial = false;
 83};
 84
 85struct common_peg_parse_result;
 86
 87using common_peg_ast_visitor = std::function<void(const common_peg_ast_node & node)>;
 88
 89class common_peg_ast_arena {
 90    std::vector<common_peg_ast_node> nodes_;
 91  public:
 92    common_peg_ast_id add_node(
 93        const std::string & rule,
 94        const std::string & tag,
 95        size_t start,
 96        size_t end,
 97        std::string_view text,
 98        std::vector<common_peg_ast_id> children,
 99        bool is_partial = false
100    ) {
101        common_peg_ast_id id = nodes_.size();
102        nodes_.push_back({id, rule, tag, start, end, text, std::move(children), is_partial});
103        return id;
104    }
105
106    const common_peg_ast_node & get(common_peg_ast_id id) const { return nodes_.at(id); }
107
108    size_t size() const { return nodes_.size(); }
109
110    void clear() { nodes_.clear(); }
111
112    void visit(common_peg_ast_id id, const common_peg_ast_visitor & visitor) const;
113    void visit(const common_peg_parse_result & result, const common_peg_ast_visitor & visitor) const;
114};
115
116struct common_peg_parse_result {
117    common_peg_parse_result_type type = COMMON_PEG_PARSE_RESULT_FAIL;
118    size_t start = 0;
119    size_t end = 0;
120
121    std::vector<common_peg_ast_id> nodes;
122
123    common_peg_parse_result() = default;
124
125    common_peg_parse_result(common_peg_parse_result_type type, size_t start)
126        : type(type), start(start), end(start) {}
127
128    common_peg_parse_result(common_peg_parse_result_type type, size_t start, size_t end)
129        : type(type), start(start), end(end) {}
130
131    common_peg_parse_result(common_peg_parse_result_type type, size_t start, size_t end, std::vector<common_peg_ast_id> nodes)
132        : type(type), start(start), end(end), nodes(std::move(nodes)) {}
133
134    bool fail() const { return type == COMMON_PEG_PARSE_RESULT_FAIL; }
135    bool need_more_input() const { return type == COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT; }
136    bool success() const { return type == COMMON_PEG_PARSE_RESULT_SUCCESS; }
137};
138
139struct common_peg_parse_context {
140    std::string input;
141    bool is_partial;
142    common_peg_ast_arena ast;
143
144    int parse_depth;
145
146    common_peg_parse_context()
147        : is_partial(false), parse_depth(0) {}
148
149    common_peg_parse_context(const std::string & input)
150        : input(input), is_partial(false), parse_depth(0) {}
151
152    common_peg_parse_context(const std::string & input, bool is_partial)
153        : input(input), is_partial(is_partial), parse_depth(0) {}
154};
155
156class common_peg_arena;
157
158// Parser variants
159struct common_peg_epsilon_parser {};
160
161struct common_peg_start_parser {};
162
163struct common_peg_end_parser {};
164
165struct common_peg_literal_parser {
166    std::string literal;
167};
168
169struct common_peg_sequence_parser {
170    std::vector<common_peg_parser_id> children;
171};
172
173struct common_peg_choice_parser {
174    std::vector<common_peg_parser_id> children;
175};
176
177struct common_peg_repetition_parser {
178    common_peg_parser_id child;
179    int min_count;
180    int max_count;  // -1 for unbounded
181};
182
183struct common_peg_and_parser {
184    common_peg_parser_id child;
185};
186
187struct common_peg_not_parser {
188    common_peg_parser_id child;
189};
190
191struct common_peg_any_parser {};
192
193struct common_peg_space_parser {};
194
195struct common_peg_chars_parser {
196    struct char_range {
197        uint32_t start;
198        uint32_t end;
199        bool contains(uint32_t codepoint) const { return codepoint >= start && codepoint <= end; }
200    };
201
202    std::string pattern;
203    std::vector<char_range> ranges;
204    bool negated;
205    int min_count;
206    int max_count;  // -1 for unbounded
207};
208
209struct common_peg_json_string_parser {};
210
211struct common_peg_until_parser {
212    std::vector<std::string> delimiters;
213};
214
215struct common_peg_schema_parser {
216    common_peg_parser_id child;
217    std::string name;
218    std::shared_ptr<nlohmann::ordered_json> schema;
219
220    // Indicates if the GBNF should accept a raw string that matches the schema.
221    bool raw;
222};
223
224struct common_peg_rule_parser {
225    std::string name;
226    common_peg_parser_id child;
227    bool trigger;
228};
229
230struct common_peg_ref_parser {
231    std::string name;
232};
233
234struct common_peg_atomic_parser {
235    common_peg_parser_id child;
236};
237
238struct common_peg_tag_parser {
239    common_peg_parser_id child;
240    std::string tag;
241};
242
243// Variant holding all parser types
244using common_peg_parser_variant = std::variant<
245    common_peg_epsilon_parser,
246    common_peg_start_parser,
247    common_peg_end_parser,
248    common_peg_literal_parser,
249    common_peg_sequence_parser,
250    common_peg_choice_parser,
251    common_peg_repetition_parser,
252    common_peg_and_parser,
253    common_peg_not_parser,
254    common_peg_any_parser,
255    common_peg_space_parser,
256    common_peg_chars_parser,
257    common_peg_json_string_parser,
258    common_peg_until_parser,
259    common_peg_schema_parser,
260    common_peg_rule_parser,
261    common_peg_ref_parser,
262    common_peg_atomic_parser,
263    common_peg_tag_parser
264>;
265
266class common_peg_arena {
267    std::vector<common_peg_parser_variant> parsers_;
268    std::unordered_map<std::string, common_peg_parser_id> rules_;
269    common_peg_parser_id root_ = COMMON_PEG_INVALID_PARSER_ID;
270
271  public:
272    const common_peg_parser_variant & get(common_peg_parser_id id) const { return parsers_.at(id); }
273    common_peg_parser_variant & get(common_peg_parser_id id) { return parsers_.at(id); }
274
275    size_t size() const { return parsers_.size(); }
276    bool empty() const { return parsers_.empty(); }
277
278    common_peg_parser_id get_rule(const std::string & name) const;
279    bool has_rule(const std::string & name) const { return rules_.find(name) != rules_.end(); }
280
281    common_peg_parser_id root() const { return root_; }
282    void set_root(common_peg_parser_id id) { root_ = id; }
283
284    common_peg_parse_result parse(common_peg_parse_context & ctx, size_t start = 0) const;
285    common_peg_parse_result parse(common_peg_parser_id id, common_peg_parse_context & ctx, size_t start) const;
286
287    void resolve_refs();
288
289    void build_grammar(const common_grammar_builder & builder, bool lazy = false) const;
290
291    std::string dump(common_peg_parser_id id) const;
292
293    nlohmann::json to_json() const;
294    static common_peg_arena from_json(const nlohmann::json & j);
295
296    std::string save() const;
297    void load(const std::string & data);
298
299    friend class common_peg_parser_builder;
300
301  private:
302    common_peg_parser_id add_parser(common_peg_parser_variant parser);
303    void add_rule(const std::string & name, common_peg_parser_id id);
304
305    common_peg_parser_id resolve_ref(common_peg_parser_id id);
306};
307
308class common_peg_parser_builder {
309    common_peg_arena arena_;
310
311    common_peg_parser wrap(common_peg_parser_id id) { return common_peg_parser(id, *this); }
312    common_peg_parser add(const common_peg_parser_variant & p) { return wrap(arena_.add_parser(p)); }
313
314  public:
315    common_peg_parser_builder();
316
317    // Match nothing, always succeed.
318    //   S -> ε
319    common_peg_parser eps() { return add(common_peg_epsilon_parser{}); }
320
321    // Matches the start of the input.
322    //   S -> ^
323    common_peg_parser start() { return add(common_peg_start_parser{}); }
324
325    // Matches the end of the input.
326    //   S -> $
327    common_peg_parser end() { return add(common_peg_end_parser{}); }
328
329    // Matches an exact literal string.
330    //   S -> "hello"
331    common_peg_parser literal(const std::string & literal) { return add(common_peg_literal_parser{literal}); }
332
333    // Matches a sequence of parsers in order, all must succeed.
334    //   S -> A B C
335    common_peg_parser sequence() { return add(common_peg_sequence_parser{}); }
336    common_peg_parser sequence(const std::vector<common_peg_parser_id> & parsers);
337    common_peg_parser sequence(const std::vector<common_peg_parser> & parsers);
338    common_peg_parser sequence(std::initializer_list<common_peg_parser> parsers);
339
340    // Matches the first parser that succeeds from a list of alternatives.
341    //   S -> A | B | C
342    common_peg_parser choice() { return add(common_peg_choice_parser{}); }
343    common_peg_parser choice(const std::vector<common_peg_parser_id> & parsers);
344    common_peg_parser choice(const std::vector<common_peg_parser> & parsers);
345    common_peg_parser choice(std::initializer_list<common_peg_parser> parsers);
346
347    // Matches one or more repetitions of a parser.
348    //   S -> A+
349    common_peg_parser one_or_more(const common_peg_parser & p) { return repeat(p, 1, -1); }
350
351    // Matches zero or more repetitions of a parser, always succeeds.
352    //   S -> A*
353    common_peg_parser zero_or_more(const common_peg_parser & p) { return repeat(p, 0, -1); }
354
355    // Matches zero or one occurrence of a parser, always succeeds.
356    //   S -> A?
357    common_peg_parser optional(const common_peg_parser & p) { return repeat(p, 0, 1); }
358
359    // Positive lookahead: succeeds if child parser succeeds, consumes no input.
360    //   S -> &A
361    common_peg_parser peek(const common_peg_parser & p) { return add(common_peg_and_parser{p}); }
362
363    // Negative lookahead: succeeds if child parser fails, consumes no input.
364    //   S -> !A
365    common_peg_parser negate(const common_peg_parser & p) { return add(common_peg_not_parser{p}); }
366
367    // Matches any single character.
368    //   S -> .
369    common_peg_parser any() { return add(common_peg_any_parser{}); }
370
371    // Matches between min and max repetitions of characters from a character class.
372    //   S -> [a-z]{m,n}
373    //
374    // Use -1 for max to represent unbounded repetition (equivalent to {m,})
375    common_peg_parser chars(const std::string & classes, int min = 1, int max = -1);
376
377    // Creates a lightweight reference to a named rule (resolved during build()).
378    // Use this for forward references in recursive grammars.
379    //   expr_ref -> expr
380    common_peg_parser ref(const std::string & name) { return add(common_peg_ref_parser{name}); }
381
382    // Matches zero or more whitespace characters (space, tab, newline).
383    //   S -> [ \t\n]*
384    common_peg_parser space() { return add(common_peg_space_parser{}); }
385
386    // Matches all characters until a delimiter is found (delimiter not consumed).
387    //   S -> (!delim .)*
388    common_peg_parser until(const std::string & delimiter) { return add(common_peg_until_parser{{delimiter}}); }
389
390    // Matches all characters until one of the delimiters in the list is found (delimiter not consumed).
391    //   S -> (!delim .)*
392    common_peg_parser until_one_of(const std::vector<std::string> & delimiters) { return add(common_peg_until_parser{delimiters}); }
393
394    // Matches everything
395    //   S -> .*
396    common_peg_parser rest() { return until_one_of({}); }
397
398    // Matches between min and max repetitions of a parser (inclusive).
399    //   S -> A{m,n}
400    // Use -1 for max to represent unbounded repetition (equivalent to {m,})
401    common_peg_parser repeat(const common_peg_parser & p, int min, int max) { return add(common_peg_repetition_parser{p, min,max}); }
402
403    // Matches exactly n repetitions of a parser.
404    //   S -> A{n}
405    common_peg_parser repeat(const common_peg_parser & p, int n) { return repeat(p, n, n); }
406
407    // Creates a complete JSON parser supporting objects, arrays, strings, numbers, booleans, and null.
408    //   value -> object | array | string | number | true | false | null
409    common_peg_parser json();
410    common_peg_parser json_object();
411    common_peg_parser json_string();
412    common_peg_parser json_array();
413    common_peg_parser json_number();
414    common_peg_parser json_bool();
415    common_peg_parser json_null();
416
417    // Matches JSON string content without the surrounding quotes.
418    // Useful for extracting content within a JSON string.
419    common_peg_parser json_string_content();
420
421    // Matches a JSON object member with a key and associated parser as the
422    // value.
423    common_peg_parser json_member(const std::string & key, const common_peg_parser & p);
424
425    // Wraps a parser with JSON schema metadata for grammar generation.
426    // Used internally to convert JSON schemas to GBNF grammar rules.
427    common_peg_parser schema(const common_peg_parser & p, const std::string & name, const nlohmann::ordered_json & schema, bool raw = false);
428
429    // Creates a named rule, stores it in the grammar, and returns a ref.
430    // If trigger=true, marks this rule as an entry point for lazy grammar generation.
431    //   auto json = p.rule("json", json_obj | json_arr | ...)
432    common_peg_parser rule(const std::string & name, const common_peg_parser & p, bool trigger = false);
433
434    // Creates a named rule using a builder function, and returns a ref.
435    // If trigger=true, marks this rule as an entry point for lazy grammar generation.
436    //   auto json = p.rule("json", [&]() { return json_object() | json_array() | ... })
437    common_peg_parser rule(const std::string & name, const std::function<common_peg_parser()> & builder, bool trigger = false);
438
439    // Creates a trigger rule. When generating a lazy grammar from the parser,
440    // only trigger rules and descendents are emitted.
441    common_peg_parser trigger_rule(const std::string & name, const common_peg_parser & p) { return rule(name, p, true); }
442    common_peg_parser trigger_rule(const std::string & name, const std::function<common_peg_parser()> & builder) { return rule(name, builder, true); }
443
444    // Creates an atomic parser. Atomic parsers do not create an AST node if
445    // the child results in a partial parse, i.e. NEEDS_MORE_INPUT. This is
446    // intended for situations where partial output is undesirable.
447    common_peg_parser atomic(const common_peg_parser & p) { return add(common_peg_atomic_parser{p}); }
448
449    // Tags create nodes in the generated AST for semantic purposes.
450    // Unlike rules, you can tag multiple nodes with the same tag.
451    common_peg_parser tag(const std::string & tag, const common_peg_parser & p) { return add(common_peg_tag_parser{p.id(), tag}); }
452
453    void set_root(const common_peg_parser & p);
454
455    common_peg_arena build();
456};
457
458// Helper function for building parsers
459common_peg_arena build_peg_parser(const std::function<common_peg_parser(common_peg_parser_builder & builder)> & fn);