1#ifdef NDEBUG
2#undef NDEBUG
3#endif
4
5#include "json-schema-to-grammar.h"
6
7#include "../src/unicode.h"
8#include "../src/llama-grammar.h"
9
10#include <nlohmann/json.hpp>
11
12#include <cassert>
13#include <string>
14#include <vector>
15
16using json = nlohmann::ordered_json;
17
18static llama_grammar * build_grammar(const std::string & grammar_str) {
19 return llama_grammar_init_impl(nullptr, grammar_str.c_str(), "root", false, nullptr, 0, nullptr, 0);
20}
21
22static bool test_build_grammar_fails(const std::string & grammar_str) {
23 fprintf(stderr, "⚫ Testing failure for grammar: %s\n", grammar_str.c_str());
24 bool grammar_fails = false;
25 llama_grammar * grammar = build_grammar(grammar_str);
26 if (grammar != nullptr) {
27 fprintf(stderr, " ❌ Expected build failure, but succeeded\n");
28 } else {
29 grammar_fails = true;
30 fprintf(stdout, " ✅︎\n");
31 }
32 return grammar_fails;
33}
34
// Pairing of a token id with its textual piece, as fed to llama_grammar_accept_token().
struct token_and_piece {
    llama_token token; // token id; 0 is used for plain-text pieces (see parse_tokens())
    std::string piece; // UTF-8 text, or the "<[id]>" rendering of a special token
};
39
40// token() encodes a 32-bit ID as 5 bytes: a 0xff marker followed by the ID in big-endian order.
41static std::string token(llama_token id) {
42 return std::string{
43 static_cast<char>(0xff),
44 static_cast<char>((id >> 24) & 0xff),
45 static_cast<char>((id >> 16) & 0xff),
46 static_cast<char>((id >> 8) & 0xff),
47 static_cast<char>(id & 0xff)
48 };
49}
50
// parse_tokens() splits `input` into pieces: 0xff-prefixed 5-byte token
// encodings (produced by token() above) interleaved with plain UTF-8 text.
static std::vector<token_and_piece> parse_tokens(const std::string & input) {
    std::vector<token_and_piece> result;
    result.reserve(input.size());
    size_t offset = 0;
    while (offset < input.size()) {
        try {
            if (static_cast<unsigned char>(input[offset]) == 0xff) {
                // Token marker: the next 4 bytes hold the id in big-endian order.
                // NOTE: a truncated token throws std::runtime_error, which is NOT
                // caught below and will propagate to the caller.
                if (offset + 5 > input.size()) {
                    throw std::runtime_error("not enough bytes for token id");
                }
                uint32_t val =
                    (static_cast<unsigned char>(input[offset + 1]) << 24) |
                    (static_cast<unsigned char>(input[offset + 2]) << 16) |
                    (static_cast<unsigned char>(input[offset + 3]) << 8) |
                    (static_cast<unsigned char>(input[offset + 4]));
                // Render the piece the way a tokenizer would show a special token.
                auto piece = "<[" + std::to_string(val) + "]>";
                result.push_back({static_cast<llama_token>(val), piece});
                offset += 5;
            } else {
                // Plain text: decode one code point. unicode_cpt_from_utf8 is
                // assumed to advance `offset` past the code point -- TODO confirm
                // against ../src/unicode.h.
                uint32_t cpt = unicode_cpt_from_utf8(input, offset);
                result.push_back({0, unicode_cpt_to_utf8(cpt)});
            }
        } catch (const std::invalid_argument & /*ex*/) {
            // Silently ignore invalid UTF-8 input to avoid leaking the exception beyond llama_tokenize
            ++offset;
            result.push_back({0, unicode_cpt_to_utf8(0xFFFD)}); // replacement character
        }
    }
    return result;
}
82
83static bool match_string(const std::string & input, llama_grammar * grammar) {
84 const auto parsed = parse_tokens(input);
85
86 auto & stacks_cur = llama_grammar_get_stacks(grammar);
87
88 for (const auto & in : parsed) {
89 try {
90 llama_grammar_accept_token(*grammar, in.token, in.piece);
91 } catch (const std::runtime_error & /*e*/) {
92 // normally this shouldn't get hit because of llama_grammar_apply
93 return false;
94 }
95
96 if (stacks_cur.empty()) {
97 // no stacks means that the grammar failed to match at this point
98 return false;
99 }
100 }
101
102 for (const auto & stack : stacks_cur) {
103 if (stack.empty()) {
104 // An empty stack means that the grammar has been completed
105 return true;
106 }
107 }
108
109 return false;
110}
111
112static void test(const std::string & test_desc, const std::string & grammar_str, const std::vector<std::string> & passing_strings, const std::vector<std::string> & failing_strings) {
113 fprintf(stderr, "⚫ Testing %s\n%s\n", test_desc.c_str(), grammar_str.c_str());
114 fflush(stderr);
115
116 auto * grammar = build_grammar(grammar_str);
117
118 // Save the original grammar stacks so that we can reset after every new string we want to test
119 const llama_grammar_stacks stacks_org = llama_grammar_get_stacks(grammar); // copy
120
121 llama_grammar_stacks & stacks_cur = llama_grammar_get_stacks(grammar);
122
123 fprintf(stderr, " 🔵 Valid strings:\n");
124
125 // Passing strings
126 for (const auto & test_string : passing_strings) {
127 fprintf(stderr, " \"%s\" ", test_string.c_str());
128 fflush(stderr);
129
130 bool matched = match_string(test_string, grammar);
131
132 if (!matched) {
133 fprintf(stderr, "❌ (failed to match)\n");
134
135 // DEBUG: Write strings to files so that we can analyze more easily with gbnf-validator program to see exactly where things failed.
136 // DEBUG: Write the grammar_str to test-grammar-integration.grammar.gbnf
137 FILE* grammar_file = fopen("test-grammar-integration.grammar.gbnf", "w");
138 if (grammar_file) {
139 fprintf(grammar_file, "%s", grammar_str.c_str());
140 fclose(grammar_file);
141 }
142
143 // DEBUG: Write the test string to test-grammar-integration.string.txt
144 FILE* string_file = fopen("test-grammar-integration.string.txt", "w");
145 if (string_file) {
146 fprintf(string_file, "%s", test_string.c_str());
147 fclose(string_file);
148 }
149
150 fprintf(stderr, "\n NOTE: Debug grammar file generated. To analyze this failure in detail, run the following command: ./llama-gbnf-validator test-grammar-integration.grammar.gbnf test-grammar-integration.string.txt\n\n");
151 } else {
152 fprintf(stdout, "✅︎\n");
153 }
154
155 assert(matched);
156
157 // Reset the grammar stacks
158 stacks_cur = stacks_org;
159 }
160
161 fprintf(stderr, " 🟠 Invalid strings:\n");
162
163 // Failing strings
164 for (const auto & test_string : failing_strings) {
165 fprintf(stderr, " \"%s\" ", test_string.c_str());
166 fflush(stderr);
167
168 bool matched = match_string(test_string, grammar);
169
170 if (matched) {
171 fprintf(stderr, "❌ (incorrectly matched)\n");
172 } else {
173 fprintf(stdout, "✅︎\n");
174 }
175 assert(!matched);
176
177 // Reset the grammar stacks
178 stacks_cur = stacks_org;
179 }
180
181 // Clean up allocated memory
182 llama_grammar_free_impl(grammar);
183}
184static void test_grammar(const std::string & test_desc, const std::string & grammar_str, const std::vector<std::string> & passing_strings, const std::vector<std::string> & failing_strings) {
185 test(test_desc + ". Grammar: " + grammar_str, grammar_str, passing_strings, failing_strings);
186}
187static void test_schema(const std::string & test_desc, const std::string & schema_str, const std::vector<std::string> & passing_strings, const std::vector<std::string> & failing_strings) {
188 test(test_desc + ". Schema: " + schema_str, json_schema_to_grammar(json::parse(schema_str), true), passing_strings, failing_strings);
189}
190
191static void test_simple_grammar() {
192 test_schema(
193 "min 0",
194 R"""({
195 "type": "integer",
196 "minimum": 0
197 })""",
198 // Passing strings
199 {
200 "0",
201 "10",
202 "12",
203 "10000",
204 },
205 // Failing strings
206 {
207 "-1",
208 "-10",
209 "-10000",
210 "-100000000000000000000000000000000",
211 "100000000000000000000000000000000",
212 "00",
213 "01",
214 "-0",
215 }
216 );
217 test_schema(
218 "min 2",
219 // Schema
220 R"""({
221 "type": "integer",
222 "minimum": 2
223 })""",
224 // Passing strings
225 {
226 "2",
227 "3",
228 "4",
229 "10",
230 "20",
231 "1234567890000000",
232 },
233 // Failing strings
234 {
235 "0",
236 "1",
237 "-1",
238 "-100",
239 "0",
240 "1",
241 "01",
242 "02",
243 "12345678900000000",
244 }
245 );
246 test_schema(
247 "min 456",
248 R"""({
249 "type": "integer",
250 "minimum": 456
251 })""",
252 // Passing strings
253 {
254 "456",
255 "4560",
256 "457",
257 "460",
258 "500",
259 },
260 // Failing strings
261 {
262 "455",
263 "356",
264 "50",
265 "050",
266 "-1",
267 "-456",
268 }
269 );
270 test_schema(
271 "min -123",
272 R"""({
273 "type": "integer",
274 "minimum": -123
275 })""",
276 // Passing strings
277 {
278 "-123",
279 "-122",
280 "-11",
281 "-1",
282 "0",
283 "1",
284 "123",
285 "1234",
286 "2345",
287 },
288 // Failing strings
289 {
290 "-1234",
291 "-124",
292 }
293 );
294
295 test_schema(
296 "max 9999",
297 // Schema
298 R"""({
299 "type": "integer",
300 "maximum": 9999
301 })""",
302 // Passing strings
303 {
304 "-99999",
305 "0",
306 "9999",
307 },
308 // Failing strings
309 {
310 "10000",
311 "99991",
312 }
313 );
314 test_schema(
315 "max -9999",
316 // Schema
317 R"""({
318 "type": "integer",
319 "maximum": -9999
320 })""",
321 // Passing strings
322 {
323 "-10000",
324 "-9999",
325 },
326 // Failing strings
327 {
328 "-9998",
329 "0",
330 "9999",
331 }
332 );
333 test_schema(
334 "min 5 max 30",
335 // Schema
336 R"""({
337 "type": "integer",
338 "minimum": 5,
339 "maximum": 30
340 })""",
341 // Passing strings
342 {
343 "5",
344 "10",
345 "30",
346 },
347 // Failing strings
348 {
349 "05",
350 "4",
351 "-1",
352 "31",
353 "123",
354 "0123",
355 }
356 );
357 test_schema(
358 "min 1 max 900719925474091",
359 // Schema
360 R"""({
361 "type": "integer",
362 "exclusiveMinimum": 0,
363 "maximum": 900719925474091
364 })""",
365 // Passing strings
366 {
367 "1",
368 "2",
369 "10",
370 "900719925474090",
371 "900719925474091",
372 },
373 // Failing strings
374 {
375 "0",
376 "01",
377 "900719925474092",
378 "9007199254740910",
379 }
380 );
381 test_schema(
382 "min -1 max 1",
383 R"""({
384 "type": "integer",
385 "minimum": -1,
386 "maximum": 1
387 })""",
388 // Passing strings
389 {
390 "-1",
391 "0",
392 "1",
393 },
394 // Failing strings
395 {
396 "-11",
397 "-10",
398 "-2",
399 "2",
400 "10",
401 "11",
402 }
403 );
404 test_schema(
405 "min -123 max 42",
406 R"""({
407 "type": "integer",
408 "minimum": -123,
409 "maximum": 42
410 })""",
411 // Passing strings
412 {
413 "-123",
414 "-122",
415 "-13",
416 "-11",
417 "-2",
418 "-1",
419 "0",
420 "1",
421 "5",
422 "10",
423 "39",
424 "40",
425 "42",
426 },
427 // Failing strings
428 {
429 "-0123",
430 "-124",
431 "-1123",
432 "-200",
433 "43",
434 "123",
435 "0123",
436 }
437 );
438 test_schema(
439 "exclusive min / max",
440 // Schema
441 R"""({
442 "type": "integer",
443 "exclusiveMinimum": 0,
444 "exclusiveMaximum": 10000
445 })""",
446 // Passing strings
447 {
448 "1",
449 "9999",
450 },
451 // Failing strings
452 {
453 "0",
454 "01",
455 "10000",
456 "99999",
457 }
458 );
459
460 // Test case for a simple grammar
461 test_grammar(
462 "simple grammar",
463 R"""(
464 root ::= expr
465 expr ::= term ("+" term)*
466 term ::= number
467 number ::= [0-9]+)""",
468 // Passing strings
469 {
470 "42",
471 "1+2+3+4+5",
472 "123+456",
473 },
474 // Failing strings
475 {
476 "+",
477 "/ 3",
478 "1+2+3+4+5+",
479 "12a45",
480 }
481 );
482
483 // Test case for a simple grammar with tokens
484 test_grammar(
485 "simple grammar with tokens",
486 R"""(
487 root ::= <[10]> content <[11]>
488 content ::= (!<[11]>)*)""",
489 // Passing strings
490 {
491 token(10) + "hello world" + token(11),
492 token(10) + "text with " + token(12) + " other tokens " + token(13) + " mixed in" + token(11),
493 token(10) + token(11),
494 token(10) + token(12) + token(13) + token(14) + token(15) + token(11),
495 token(10) + "a" + token(11),
496 },
497 // Failing strings
498 {
499 token(10) + "missing end token",
500 token(10),
501 "missing start token" + token(11),
502 token(10) + token(11) + token(11), // double end token
503 token(11) + "wrong order" + token(10),
504 }
505 );
506}
507
// Exercises a larger arithmetic-expression grammar (with whitespace and
// function calls) and a multi-section token grammar (reasoning/content/tool-call).
static void test_complex_grammar() {
    // Test case for a more complex grammar, with both failure strings and success strings
    test_grammar(
        "medium complexity grammar",
        // Grammar
        R"""(
            root ::= expression
            expression ::= term ws (("+"|"-") ws term)*
            term ::= factor ws (("*"|"/") ws factor)*
            factor ::= number | variable | "(" expression ")" | function-call
            number ::= [0-9]+
            variable ::= [a-zA-Z_][a-zA-Z0-9_]*
            function-call ::= variable ws "(" (expression ("," ws expression)*)? ")"
            ws ::= [ \t\n\r]?)""",
        // Passing strings
        {
            "42",
            "1*2*3*4*5",
            "x",
            "x+10",
            "x1+y2",
            "(a+b)*(c-d)",
            "func()",
            "func(x,y+2)",
            "a*(b+c)-d/e",
            "f(g(x),h(y,z))",
            "x + 10",
            "x1 + y2",
            "(a + b) * (c - d)",
            "func()",
            "func(x, y + 2)",
            "a * (b + c) - d / e",
            "f(g(x), h(y, z))",
            "123+456",
            "123*456*789-123/456+789*123",
            "123+456*789-123/456+789*123-456/789+123*456-789/123+456*789-123/456+789*123-456"
        },
        // Failing strings
        {
            "+",
            "/ 3x",
            "x + + y",
            "a * / b",
            "func(,)",
            "func(x y)",
            "(a + b",
            "x + y)",
            "a + b * (c - d",
            "42 +",
            "x +",
            "x + 10 +",
            "(a + b) * (c - d",
            "func(",
            "func(x, y + 2",
            "a * (b + c) - d /",
            "f(g(x), h(y, z)",
            "123+456*789-123/456+789*123-456/789+123*456-789/123+456*789-123/456+789*123-456/",
        }
    );

    // Test case for a more complex grammar with tokens
    test_grammar(
        "complex grammar with tokens",
        R"""(
            root ::= reasoning+ content tool-call*
            reasoning ::= <[10]> (!<[11]>)* <[11]>
            content ::= <[20]> (!<[21]>)* <[21]>
            tool-call ::= <[12]> name <[13]> args <[14]>
            name ::= (!<[13]>)+
            args ::= (!<[14]>)*)""",
        // Passing strings
        {
            token(10) + "I am thinking" + token(11) + token(20) + "hello world!" + token(21) + token(12) + "search" + token(13) + "query=test" + token(14),
            token(10) + "reasoning 1" + token(11) + token(10) + "reasoning 2" + token(11) + token(20) + token(21) + token(12) + "tool" + token(13) + token(14),
            token(10) + token(11) + token(20) + "content" + token(21),
            token(10) + "think" + token(12) + " nested" + token(11) + token(20) + token(10) + "more content" + token(21) + token(12) + "fn" + token(13) + "x=1,y=2" + token(14) + token(12) + "fn2" + token(13) + token(14),
            token(10) + "reasoning" + token(11) + token(10) + "more" + token(11) + token(10) + "even more" + token(11) + token(20) + "text" + token(21) + token(12) + "a" + token(13) + "b" + token(14) + token(12) + "c" + token(13) + "d" + token(14),
        },
        // Failing strings
        {
            token(20) + "content only" + token(21),
            token(10) + "no closing reasoning",
            token(10) + token(11) + token(20) + "no closing content",
            token(10) + token(11) + token(20) + token(21) + token(12) + "incomplete tool",
            token(10) + token(11) + token(11) + token(20) + token(21),
        }
    );
}
596
// Exercises the "." any-character metachar, including multi-byte UTF-8
// characters counting as a single character each.
static void test_special_chars() {
    // A collection of tests to exercise special characters such as "."
    test_grammar(
        "special characters",
        // Grammar
        R"""(
            root ::= ... "abc" ...
            )""",
        // Passing strings
        {
            "abcabcabc",
            "aaaabcccc",
            // NOTE: Also ensures that multi-byte characters still count as a single character
            "🔵🟠✅abc❌🟠🔵"
        },
        // Failing strings
        {
            "aaabcccc",
            "aaaaabcccc",
            "aaaabccc",
            "aaaabccccc",
            "🔵🟠✅❌abc❌✅🟠🔵",
            "🔵🟠abc🟠🔵"
        }
    );
}
623
// Exercises the *, +, ? quantifiers and the {n}, {n,}, {0,n}, {m,n}
// repetition forms.
static void test_quantifiers() {
    // A collection of tests to exercise * + and ? quantifiers

    test_grammar(
        "* quantifier",
        // Grammar
        R"""(root ::= "a"*)""",
        // Passing strings
        {
            "",
            "a",
            "aaaaa",
            "aaaaaaaaaaaaaaaaaa",
            "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
        },
        // Failing strings
        {
            "b",
            "ab",
            "aab",
            "ba",
            "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaab"
        }
    );
    test_grammar(
        "+ quantifier",
        // Grammar
        R"""(root ::= "a"+)""",
        // Passing strings
        {
            "a",
            "aaaaa",
            "aaaaaaaaaaaaaaaaaa",
            "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
        },
        // Failing strings
        {
            "",
            "b",
            "ab",
            "aab",
            "ba",
            "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaab"
        }
    );
    test_grammar(
        "? quantifier",
        // Grammar
        R"""(root ::= "a"?)""",
        // Passing strings
        {
            "",
            "a"
        },
        // Failing strings
        {
            "b",
            "ab",
            "aa",
            "ba",
        }
    );
    test_grammar(
        "mixed quantifiers",
        // Grammar
        R"""(
            root ::= cons+ vowel* cons? (vowel cons)*
            vowel ::= [aeiouy]
            cons ::= [bcdfghjklmnpqrstvwxyz]
            )""",
        // Passing strings
        {
            "yes",
            "no",
            "noyes",
            "crwth",
            "four",
            "bryyyy",
        },
        // Failing strings
        {
            "yess",
            "yesno",
            "forty",
            "catyyy",
        }
    );
    test_grammar(
        "simple exact repetition",
        // Grammar
        R"""(
            root ::= [ab]{4}
            )""",
        // Passing strings
        {
            "aaaa",
            "bbbb",
            "abab",
        },
        // Failing strings
        {
            "a",
            "b",
            "aaaaa",
        }
    );
    test_grammar(
        "simple min repetition",
        // Grammar
        R"""(
            root ::= [ab]{4,}
            )""",
        // Passing strings
        {
            "aaaa",
            "aaaaab",
            "bbbb",
            "ababab",
        },
        // Failing strings
        {
            "",
            "aba",
        }
    );
    test_grammar(
        "simple max repetition",
        // Grammar
        R"""(
            root ::= [ab]{0,4}
            )""",
        // Passing strings
        {
            "",
            "a",
            "aa",
            "aaa",
            "aaab",
        },
        // Failing strings
        {
            "aaaaa",
        }
    );
    test_grammar(
        "min / max repetition",
        // Grammar
        R"""(
            root ::= ("0x" [A-F0-9]{2} " "?){3,5}
            )""",
        // Passing strings
        {
            "0xFF 0x12 0xAB",
            "0xFF 0x12 0xAB 0x00 0x00",
        },
        // Failing strings
        {
            "",
            "0xFF",
            "0xFF 0x12",
            "0xFF 0x12 0xAB 0x00 0x00 0x00",
        }
    );
}
788
789static void test_failure_missing_root() {
790 fprintf(stderr, "⚫ Testing missing root node:\n");
791 // Test case for a grammar that is missing a root rule
792 const std::string grammar_str = R"""(
793 rot ::= expr
794 expr ::= term ("+" term)*
795 term ::= number
796 number ::= [0-9]+)""";
797
798 llama_grammar_parser parsed_grammar;
799 parsed_grammar.parse(grammar_str.c_str());
800
801 // Ensure we parsed correctly
802 assert(!parsed_grammar.rules.empty());
803
804 // Ensure we do NOT have a root node
805 assert(parsed_grammar.symbol_ids.find("root") == parsed_grammar.symbol_ids.end());
806 fprintf(stderr, " ✅︎ Passed\n");
807}
808
809static void test_failure_missing_reference() {
810 fprintf(stderr, "⚫ Testing missing reference node:\n");
811
812 // Test case for a grammar that is missing a referenced rule
813 const std::string grammar_str =
814 R"""(root ::= expr
815 expr ::= term ("+" term)*
816 term ::= numero
817 number ::= [0-9]+)""";
818
819 fprintf(stderr, " Expected error: ");
820
821 llama_grammar_parser parsed_grammar;
822 parsed_grammar.parse(grammar_str.c_str());
823
824 // Ensure we did NOT parsed correctly
825 assert(parsed_grammar.rules.empty());
826
827 fprintf(stderr, " End of expected error.\n");
828 fprintf(stderr, " ✅︎ Passed\n");
829}
830
831static void test_failure_left_recursion() {
832 fprintf(stderr, "⚫ Testing left recursion detection:\n");
833
834 // Test simple left recursion detection
835 const std::string simple_str = R"""(root ::= "a" | root "a")""";
836 assert(test_build_grammar_fails(simple_str));
837
838 // Test more complicated left recursion detection
839 const std::string medium_str = R"""(
840 root ::= asdf
841 asdf ::= "a" | asdf "a"
842 )""";
843 assert(test_build_grammar_fails(medium_str));
844
845 // Test even more complicated left recursion detection
846 const std::string hard_str = R"""(
847 root ::= asdf
848 asdf ::= "a" | foo "b"
849 foo ::= "c" | asdf "d" | "e")""";
850 assert(test_build_grammar_fails(hard_str));
851
852 // Test yet even more complicated left recursion detection
853 const std::string hardest_str = R"""(
854 root ::= asdf
855 asdf ::= "a" | foo "b"
856 foo ::= "c" | empty asdf "d" | "e"
857 empty ::= "blah" | )""";
858 assert(test_build_grammar_fails(hardest_str));
859
860 fprintf(stderr, " ✅︎ Passed\n");
861}
862
// Exercises json_schema_to_grammar() across many schema features: empty
// schema, formats, string length bounds, const/enum, patterns, arrays with
// item bounds, and object property/required/additionalProperties handling.
static void test_json_schema() {
    // Note that this is similar to the regular grammar tests,
    // but we convert each json schema to a grammar before parsing.
    // Otherwise, this test structure is the same.

    test_schema(
        "empty schema (object)",
        // Schema
        R"""(
            {}
        )""",
        // Passing strings
        {
            R"""({})""",
            R"""({"foo": "bar"})""",
        },
        // Failing strings
        {
            "",
            "[]",
            "null",
            R"""("")""",
            "true",
        }
    );

    test_schema(
        "exotic formats (list)",
        // Schema
        R"""({
            "items": [
                { "format": "date" },
                { "format": "uuid" },
                { "format": "time" },
                { "format": "date-time" }
            ]
        })""",
        // Passing strings
        {
            // "{}", // NOTE: This string passes for this schema on https://www.jsonschemavalidator.net/ -- should it?
            // "[]", // NOTE: This string passes for this schema on https://www.jsonschemavalidator.net/ -- should it?
            R"""(["2012-04-23", "12345678-1234-1234-1234-1234567890ab", "18:25:43.511Z", "2012-04-23T18:25:43.511Z"])""",
            //R"""(["2012-04-23","12345678-1234-1234-1234-1234567890ab"])""", // NOTE: This string passes for this schema on https://www.jsonschemavalidator.net/ -- should it?
            //R"""({"foo": "bar"})""", // NOTE: This string passes for this schema on https://www.jsonschemavalidator.net/ -- should it?
        },
        // Failing strings
        {
            R"""(["foo", "bar"])""",
            R"""(["12345678-1234-1234-1234-1234567890ab"])""",
        }
    );

    test_schema(
        "string",
        // Schema
        R"""({
            "type": "string"
        })""",
        // Passing strings
        {
            R"""("foo")""",
            R"""("bar")""",
            R"""("")""",
        },
        // Failing strings
        {
            R"""({})""",
            R"""("foo": "bar")""",
        }
    );

    test_schema(
        "string w/ min length 1",
        // Schema
        R"""({
            "type": "string",
            "minLength": 1
        })""",
        // Passing strings
        {
            R"""("foo")""",
            R"""("bar")""",
        },
        // Failing strings
        {
            R"""("")""",
            R"""({})""",
            R"""("foo": "bar")""",
        }
    );

    test_schema(
        "string w/ min length 3",
        // Schema
        R"""({
            "type": "string",
            "minLength": 3
        })""",
        // Passing strings
        {
            R"""("foo")""",
            R"""("bar")""",
            R"""("foobar")""",
        },
        // Failing strings
        {
            R"""("")""",
            R"""("f")""",
            R"""("fo")""",
        }
    );

    test_schema(
        "string w/ max length",
        // Schema
        R"""({
            "type": "string",
            "maxLength": 3
        })""",
        // Passing strings
        {
            R"""("foo")""",
            R"""("bar")""",
            R"""("")""",
            R"""("f")""",
            R"""("fo")""",
        },
        // Failing strings
        {
            R"""("foobar")""",
        }
    );

    test_schema(
        "string w/ min & max length",
        // Schema
        R"""({
            "type": "string",
            "minLength": 1,
            "maxLength": 4
        })""",
        // Passing strings
        {
            R"""("foo")""",
            R"""("bar")""",
            R"""("f")""",
            R"""("barf")""",
        },
        // Failing strings
        {
            R"""("")""",
            R"""("barfo")""",
            R"""("foobar")""",
        }
    );

    test_schema(
        "boolean",
        // Schema
        R"""({
            "type": "boolean"
        })""",
        // Passing strings
        {
            "true",
            "false",
        },
        // Failing strings
        {
            R"""("")""",
            R"""("true")""",
            R"""(True)""",
            R"""(FALSE)""",
        }
    );

    test_schema(
        "integer",
        // Schema
        R"""({
            "type": "integer"
        })""",
        // Passing strings
        {
            R"""(0)""",
            R"""(12345)""",
            R"""(1234567890123456)""",
        },
        // Failing strings
        {
            R"""()""",
            R"""(01)""",
            R"""(007)""",
            R"""(12345678901234567  )""",
        }
    );

    test_schema(
        "string const",
        // Schema
        R"""({
            "const": "foo"
        })""",
        // Passing strings
        {
            R"""("foo")""",
        },
        // Failing strings
        {
            R"""(foo)""",
            R"""("bar")""",
        }
    );

    test_schema(
        "non-string const",
        // Schema
        R"""({
            "const": true
        })""",
        // Passing strings
        {
            R"""(true)""",
        },
        // Failing strings
        {
            R"""()""",
            R"""(foo)""",
            R"""("true")""",
        }
    );

    test_schema(
        "non-string const",
        // Schema
        R"""({
            "enum": ["red", "amber", "green", null, 42, ["foo"]]
        })""",
        // Passing strings
        {
            R"""("red")""",
            R"""(null)""",
            R"""(42)""",
            R"""(["foo"])""",
        },
        // Failing strings
        {
            R"""()""",
            R"""(420)""",
            R"""(true)""",
            R"""(foo)""",
        }
    );

    test_schema(
        "simple pattern",
        // Schema
        R"""({
            "pattern": "^[a-zA-Z0-9_-]*$"
        })""",
        // Passing strings
        {
            R"""("")""",
            R"""("He_llo-12")""",
        },
        // Failing strings
        {
            R"""("!")""",
            R"""("Hello World")""",
        }
    );

    test_schema(
        "pattern with escapes",
        // Schema
        R"""({
            "pattern": "^a\\^\\$\\.\\[\\]\\(\\)\\|\\{\\}\\*\\+\\?b$"
        })""",
        // Passing strings
        {
            R"""("a^$.[]()|{}*+?b")""",
        },
        // Failing strings
        {
            R"""("ab")""",
        }
    );

    test_schema(
        "",
        // Schema
        R"""(
            {
                "type": ["array", "null"],
                "items": { "type": "string" }
            }
        )""",
        // Passing strings
        {
            "null",
            "[]",
            "[\"123\"]",
            "[\"foo\", \"bar\"]",
        },
        // Failing strings
        {
            "",
            "[123]",
            "\"foo\"",
            "[\"foo\", 42]",
        }
    );

    test_schema(
        "min+max items",
        // Schema
        R"""({
            "items": {
                "type": ["number", "integer"]
            },
            "minItems": 3,
            "maxItems": 5
        })""",
        // Passing strings
        {
            R"""([1, 2, 3])""",
            R"""([1, 2, 3, 4])""",
            R"""([1, 2, 3, 4, 5])""",
        },
        // Failing strings
        {
            R"""([1, 2])""",
            R"""([1, 2, 3, 4, 5, 6])""",
            R"""(1)""",
        }
    );

    // Properties (from: https://json-schema.org/understanding-json-schema/reference/object#properties)
    test_schema(
        "object properties",
        // Schema
        R"""({
            "type": "object",
            "properties": {
                "number": { "type": "number" },
                "street_name": { "type": "string" },
                "street_type": { "enum": ["Street", "Avenue", "Boulevard"] }
            }
        })""",
        // Passing strings
        {
            R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type":"Avenue"})""",
            // "By default, leaving out properties is valid"
            R"""({ "street_name": "Pennsylvania" })""",
            R"""({ "number": 1600, "street_name": "Pennsylvania" })""",
            // "By extension, even an empty object is valid"
            R"""({})""",
            R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type": "Avenue" })""",
        },
        // Failing strings
        {
            // Change datatype from number to string
            R"""({ "number": "1600", "street_name": "Pennsylvania", "street_type":"Avenue"})""",
            // Reorder properties
            R"""({ "street_name": "Pennsylvania", "number": 1600 })""",
            // Reorder properties
            R"""({ "number": "1600", "street_name": "Pennsylvania", "street_type":"Avenue"})""",
            // "Additional properties default to false for generation, even though the spec says true.
            R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type":"Avenue", "direction":"NW"})""",

        }
    );

    test_schema(
        "additional properties can't override other properties",
        R"""({
            "properties": {
                "a": {"type": "integer"},
                "b": {"type": "integer"}
            },
            "additionalProperties": true
        })""",
        // Passing strings
        {
            R"""({"a": 42})""",
            R"""({"c": ""})""",
            R"""({"a": 42, "c": ""})""",
            R"""({"a_": ""})""",
        },
        // Failing strings
        {
            R"""()""",
            R"""({"a": ""})""",
            R"""({"a": "", "b": ""})""",
        }
    );

    // Properties (from: https://json-schema.org/understanding-json-schema/reference/object#properties)
    test_schema(
        "object properties, additionalProperties: true",
        // Schema
        R"""({
            "type": "object",
            "properties": {
                "number": { "type": "number" },
                "street_name": { "type": "string" },
                "street_type": { "enum": ["Street", "Avenue", "Boulevard"] }
            },
            "additionalProperties": true
        })""",
        // Passing strings
        {
            // "By extension, even an empty object is valid"
            R"""({})""",
            R"""({"number":1600,"street_name":"Pennsylvania","street_type":"Avenue"})""",
            // "By default, leaving out properties is valid"
            R"""({ "street_name": "Pennsylvania" })""",
            R"""({ "number": 1600, "street_name": "Pennsylvania" })""",
            // "By default, providing additional properties is valid"
            R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type":"Avenue", "direction":"NW"})""",
            R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type": "Avenue" })""",
        },
        // Failing strings
        {
            // Change datatype from number to string
            R"""({ "number": "1600", "street_name": "Pennsylvania", "street_type":"Avenue"})""",
            // Reorder properties
            R"""({ "street_name": "Pennsylvania", "number": 1600, "street_type":"Avenue"})""",
        }
    );

    // Additional properties: false
    test_schema(
        "required + optional props each in original order",
        // Schema
        R"""({
            "type": "object",
            "properties": {
                "number": { "type": "number" },
                "street_name": { "type": "string" },
                "street_type": { "enum": ["Street", "Avenue", "Boulevard"] }
            },
            "additionalProperties": false
        })""",
        // Passing strings
        {
            R"""({ "street_name": "Pennsylvania" })""",
            R"""({ "number": 1600, "street_type":"Avenue"})""",
            R"""({ "number": 1600, "street_name": "Pennsylvania" })""",
            R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type":"Avenue"})""",
            // Spaces are permitted around enum values
            R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type": "Avenue" })""",
        },
        // Failing strings
        {
            // Reorder properties
            R"""({ "street_type": "Avenue", "number": 1600 })""",
            // Add "direction"
            R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type": "Avenue", "direction": "NW" })""",
        }
    );

    test_schema(
        "required + optional props each in original order",
        // Schema
        R"""({
            "properties": {
                "b": {"type": "string"},
                "a": {"type": "string"},
                "d": {"type": "string"},
                "c": {"type": "string"}
            },
            "required": ["a", "b"],
            "additionalProperties": false
        })""",
        // Passing strings
        {
            R"""({"b": "foo", "a": "bar"})""",
            R"""({"b":"foo","a":"bar","d":"qux"})""",
            R"""({"b":"foo", "a":"bar", "d":"qux", "c":"baz"})""",
        },
        // Failing strings
        {
            R"""({"a": "foo", "b": "bar"})""",
            R"""({"b": "bar"})""",
            R"""({"a": "foo", "c": "baz"})""",
            R"""({"a":"foo", "b":"bar", "c":"baz", "d":"qux"})""",
        }
    );

    // NOTE: Example from https://json-schema.org/learn/getting-started-step-by-step#define-required-properties
    test_schema(
        "required props",
        // Schema
        R"""({
            "$schema": "https://json-schema.org/draft/2020-12/schema",
            "$id": "https://example.com/product.schema.json",
            "title": "Product",
            "description": "A product from Acme's catalog",
            "type": "object",
            "properties": {
                "productId": {
                    "description": "The unique identifier for a product",
                    "type": "integer"
                },
                "productName": {
                    "description": "Name of the product",
                    "type": "string"
                },
                "price": {
                    "description": "The price of the product",
                    "type": "number",
                    "exclusiveMinimum": 0
                },
                "tags": {
                    "description": "Tags for the product",
                    "type": "array",
                    "items": {
                        "type": "string"
                    },
                    "minItems": 1,
                    "uniqueItems": true
                },
                "dimensions": {
                    "type": "object",
                    "properties": {
                        "length": {
                            "type": "number"
                        },
                        "width": {
                            "type": "number"
                        },
                        "height": {
                            "type": "number"
                        }
                    },
                    "required": [ "length", "width", "height" ]
                }
            },
            "required": [ "productId", "productName", "price" ]
        })""",
        // Passing strings
        {
            R"""({"productId": 1, "productName": "A green door", "price": 12.50})""",
            R"""({"productId": 1, "productName": "A green door", "price": 12.50, "tags": ["home", "green"]})""",
            R"""({"productId": 1, "productName": "A green door", "price": 12.50, "tags": ["home", "green"], "dimensions": {"length": 785, "width": 250.5, "height": -0.359}})""",
        },
        // Failing strings
        {
            R"""({})""", // Missing all required properties
            R"""({"productName": "A green door", "price": 12.50, "productId": 1})""", // Out of order properties
            // TODO: The following line should fail, but currently it passes. `exclusiveMinimum` is not supported, as it would likely be too difficult to implement.
            // Perhaps special checks for minimum and maximum values of 0 could be added (since that's relatively easy to do with grammars), but anything else would likely be too complex.
            // R"""({"productId": 1, "productName": "A green door", "price": -12.50})""",
            R"""({"productId": 1, "productName": "A green door"})""", // Missing required property (price)
            R"""({"productName": "A green door", "price": 12.50})""", // Missing required property (productId)
            R"""({"productId": 1, "productName": "A green door", "price": 12.50, "tags": []})""", // tags is empty, but minItems is 1
            R"""({"productId": 1, "productName": "A green door", "price": 12.50, "dimensions": {"length": 785, "width": 250.5, "height": -0.359}, "tags": ["home", "green"]})""", // Tags and dimensions are out of order
            // TODO: The following line should fail, but currently it passes. `uniqueItems` is not supported, as it would likely be too difficult to implement.
            // R"""({"productId": 1, "productName": "A green door", "price": 12.50, "tags": ["home", "green", "home"]})""",
        }
    );
}
1426
// Entry point: runs every test group in order. Any failure aborts the
// program via assert() inside the helpers, so reaching the final message
// means all tests passed.
int main() {
    fprintf(stdout, "Running grammar integration tests...\n");
    test_simple_grammar();
    test_complex_grammar();
    test_special_chars();
    test_quantifiers();
    test_failure_missing_root();
    test_failure_missing_reference();
    test_failure_left_recursion();
    test_json_schema();
    fprintf(stdout, "All tests passed.\n");
    return 0;
}