diff options
Diffstat (limited to 'llama.cpp/tests/peg-parser')
| -rw-r--r-- | llama.cpp/tests/peg-parser/simple-tokenize.cpp | 37 | ||||
| -rw-r--r-- | llama.cpp/tests/peg-parser/simple-tokenize.h | 6 | ||||
| -rw-r--r-- | llama.cpp/tests/peg-parser/test-basic.cpp | 454 | ||||
| -rw-r--r-- | llama.cpp/tests/peg-parser/test-gbnf-generation.cpp | 250 | ||||
| -rw-r--r-- | llama.cpp/tests/peg-parser/test-json-parser.cpp | 109 | ||||
| -rw-r--r-- | llama.cpp/tests/peg-parser/test-json-serialization.cpp | 28 | ||||
| -rw-r--r-- | llama.cpp/tests/peg-parser/test-unicode.cpp | 449 | ||||
| -rw-r--r-- | llama.cpp/tests/peg-parser/tests.h | 24 |
8 files changed, 1357 insertions, 0 deletions
diff --git a/llama.cpp/tests/peg-parser/simple-tokenize.cpp b/llama.cpp/tests/peg-parser/simple-tokenize.cpp new file mode 100644 index 0000000..9abfa04 --- /dev/null +++ b/llama.cpp/tests/peg-parser/simple-tokenize.cpp @@ -0,0 +1,37 @@ +#include "simple-tokenize.h" + +std::vector<std::string> simple_tokenize(const std::string & input) { + std::vector<std::string> result; + std::string current; + + for (size_t i = 0; i < input.size(); i++) { + switch (input[i]) { + case ' ': + case '\n': + case '\t': + case '{': + case '}': + case ',': + case '[': + case '"': + case ']': + case '.': + case '<': + case '>': + case '=': + case '/': + if (!current.empty()) { + result.push_back(current); + current.clear(); + } + default:; + } + current += input[i]; + } + + if (!current.empty()) { + result.push_back(current); + } + + return result; +} diff --git a/llama.cpp/tests/peg-parser/simple-tokenize.h b/llama.cpp/tests/peg-parser/simple-tokenize.h new file mode 100644 index 0000000..1772432 --- /dev/null +++ b/llama.cpp/tests/peg-parser/simple-tokenize.h @@ -0,0 +1,6 @@ +#pragma once + +#include <string> +#include <vector> + +std::vector<std::string> simple_tokenize(const std::string &); diff --git a/llama.cpp/tests/peg-parser/test-basic.cpp b/llama.cpp/tests/peg-parser/test-basic.cpp new file mode 100644 index 0000000..1bda6f2 --- /dev/null +++ b/llama.cpp/tests/peg-parser/test-basic.cpp @@ -0,0 +1,454 @@ +#include "tests.h" + +void test_basic(testing & t) { + t.test("chars", [](testing & t) { + // Test common escape sequences - newline + t.test("escape_sequence_newline", [](testing &t) { + auto common_chat_combinator_parser = build_peg_parser([](common_peg_parser_builder & p) { return p.chars("[\\n\\t\\\\]"); }); + + common_peg_parse_context ctx; + common_peg_parse_result result; + + ctx = common_peg_parse_context("\n"); + result = common_chat_combinator_parser.parse(ctx); + t.assert_equal("escape_sequence_newline", true, result.success()); + }); + + // Test common escape 
sequences - tab + t.test("escape_sequence_tab", [](testing &t) { + auto common_chat_combinator_parser = build_peg_parser([](common_peg_parser_builder & p) { return p.chars("[\\n\\t\\\\]"); }); + + common_peg_parse_context ctx; + common_peg_parse_result result; + + ctx = common_peg_parse_context("\t"); + result = common_chat_combinator_parser.parse(ctx); + t.assert_equal("escape_sequence_tab", true, result.success()); + }); + + // Test common escape sequences - backslash + t.test("escape_sequence_backslash", [](testing &t) { + auto common_chat_combinator_parser = build_peg_parser([](common_peg_parser_builder & p) { return p.chars("[\\n\\t\\\\]"); }); + + common_peg_parse_context ctx; + common_peg_parse_result result; + + ctx = common_peg_parse_context("\\"); + result = common_chat_combinator_parser.parse(ctx); + t.assert_equal("escape_sequence_backslash", true, result.success()); + }); + + // Test common escape sequences - space (should fail) + t.test("escape_sequence_space_fail", [](testing &t) { + auto common_chat_combinator_parser = build_peg_parser([](common_peg_parser_builder & p) { return p.chars("[\\n\\t\\\\]"); }); + + common_peg_parse_context ctx; + common_peg_parse_result result; + + ctx = common_peg_parse_context(" "); + result = common_chat_combinator_parser.parse(ctx); + t.assert_equal("escape_sequence_space_fail", true, result.fail()); + }); + + // Test escaped dash - 'a' should succeed + t.test("escaped_dash_a", [](testing &t) { + auto common_chat_combinator_parser = build_peg_parser([](common_peg_parser_builder & p) { return p.chars("[a\\-z]"); }); + + common_peg_parse_context ctx; + common_peg_parse_result result; + + ctx = common_peg_parse_context("a"); + result = common_chat_combinator_parser.parse(ctx); + t.assert_equal("escaped_dash_a", true, result.success()); + }); + + // Test escaped dash - '-' should succeed (literal dash) + t.test("escaped_dash_literal", [](testing &t) { + auto common_chat_combinator_parser = 
build_peg_parser([](common_peg_parser_builder & p) { return p.chars("[a\\-z]"); }); + + common_peg_parse_context ctx; + common_peg_parse_result result; + + ctx = common_peg_parse_context("-"); + result = common_chat_combinator_parser.parse(ctx); + t.assert_equal("escaped_dash_literal", true, result.success()); + }); + + // Test escaped dash - 'z' should succeed + t.test("escaped_dash_z", [](testing &t) { + auto common_chat_combinator_parser = build_peg_parser([](common_peg_parser_builder & p) { return p.chars("[a\\-z]"); }); + + common_peg_parse_context ctx; + common_peg_parse_result result; + + ctx = common_peg_parse_context("z"); + result = common_chat_combinator_parser.parse(ctx); + t.assert_equal("escaped_dash_z", true, result.success()); + }); + + // Test escaped dash - 'b' should NOT match (since \- is literal dash, not range) + t.test("escaped_dash_b_fail", [](testing &t) { + auto common_chat_combinator_parser = build_peg_parser([](common_peg_parser_builder & p) { return p.chars("[a\\-z]"); }); + + common_peg_parse_context ctx; + common_peg_parse_result result; + + ctx = common_peg_parse_context("b"); + result = common_chat_combinator_parser.parse(ctx); + t.assert_equal("escaped_dash_b_fail", true, result.fail()); + }); + }); + + + t.test("optional", [](testing & t) { + // Full match with optional part present + t.test("optional_present", [](testing &t) { + auto parser = build_peg_parser([](common_peg_parser_builder & p) { + return p.literal("hello") + p.optional(p.literal(" world")); + }); + + auto ctx = common_peg_parse_context("hello world"); + auto result = parser.parse(ctx); + t.assert_equal("optional_present", true, result.success()); + t.assert_equal("optional_present_end", 11u, result.end); + }); + + // Full match with optional part absent + t.test("optional_absent", [](testing &t) { + auto parser = build_peg_parser([](common_peg_parser_builder & p) { + return p.literal("hello") + p.optional(p.literal(" world")); + }); + + auto ctx = 
common_peg_parse_context("hello", false); + auto result = parser.parse(ctx); + t.assert_equal("optional_absent", true, result.success()); + t.assert_equal("optional_absent_end", 5u, result.end); + }); + + // Partial match - waiting for more input to determine if optional matches + t.test("partial_match_need_more", [](testing &t) { + auto parser = build_peg_parser([](common_peg_parser_builder & p) { + return p.literal("hello") + p.optional(p.literal(" world")); + }); + + auto ctx = common_peg_parse_context("hello ", true); + auto result = parser.parse(ctx); + t.assert_equal("partial_match_need_more", true, result.need_more_input()); + }); + }); + + t.test("partial parsing", [](testing & t) { + // Literals - Basic Success + t.test("literal_success", [&](testing & t) { + auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.literal("hello"); }); + + common_peg_parse_context ctx; + common_peg_parse_result result; + + ctx = common_peg_parse_context("hello"); + result = parser.parse(ctx); + t.assert_equal("literal_success", true, result.success()); + }); + + // Char Classes - Basic Lowercase Success + t.test("char_class_lowercase_success", [&](testing & t) { + auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.chars("a-z"); }); + + common_peg_parse_context ctx; + common_peg_parse_result result; + + ctx = common_peg_parse_context("a"); + result = parser.parse(ctx); + t.assert_equal("char_class_lowercase_success", true, result.success()); + }); + + // Char Classes - Uppercase Fail + t.test("char_class_uppercase_fail", [&](testing & t) { + auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.chars("a-z"); }); + + common_peg_parse_context ctx; + common_peg_parse_result result; + + ctx = common_peg_parse_context("A"); + result = parser.parse(ctx); + t.assert_equal("char_class_uppercase_fail", true, result.fail()); + }); + + // Char Classes with Dash - Lowercase Success + 
t.test("char_class_with_dash_lowercase", [&](testing & t) { + auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.chars("a-z-"); }); + + common_peg_parse_context ctx; + common_peg_parse_result result; + + ctx = common_peg_parse_context("f"); + result = parser.parse(ctx); + t.assert_equal("char_class_with_dash_lowercase", true, result.success()); + }); + + // Char Classes with Dash - Literal Dash Success + t.test("char_class_with_dash_literal_dash", [&](testing & t) { + auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.chars("a-z-"); }); + + common_peg_parse_context ctx; + common_peg_parse_result result; + + ctx = common_peg_parse_context("-"); + result = parser.parse(ctx); + t.assert_equal("char_class_with_dash_literal_dash", true, result.success()); + }); + + // Char Classes with Dash - Uppercase Fail + t.test("char_class_with_dash_uppercase_fail", [&](testing & t) { + auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.chars("a-z-"); }); + + common_peg_parse_context ctx; + common_peg_parse_result result; + + ctx = common_peg_parse_context("A"); + result = parser.parse(ctx); + t.assert_equal("char_class_with_dash_uppercase_fail", true, result.fail()); + }); + + // Sequences - Partial Match 1 + t.test("sequence_partial_match_1", [&](testing & t) { + auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.literal("<think>") + p.literal("</think>"); }); + + auto ctx = common_peg_parse_context("<thi", true); + auto result = parser.parse(ctx); + t.assert_equal("sequence_partial_match_1", true, result.need_more_input()); + }); + + // Sequences - Partial Match 2 + t.test("sequence_partial_match_2", [&](testing & t) { + auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.literal("begin") + p.literal("end"); }); + + auto ctx = common_peg_parse_context("begin", true); + auto result = parser.parse(ctx); + t.assert_equal("sequence_partial_match_2", true, 
result.need_more_input()); + }); + + // Sequences - Partial Match 3 + t.test("sequence_partial_match_3", [&](testing & t) { + auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.literal("<think>") + p.literal("</think>"); }); + + auto ctx = common_peg_parse_context("<think></", true); + auto result = parser.parse(ctx); + t.assert_equal("sequence_partial_match_3", true, result.need_more_input()); + }); + + // Sequences - Full Match + t.test("sequence_full_match", [&](testing & t) { + auto common_chat_combinator_parser = build_peg_parser([](common_peg_parser_builder & p) { return p.literal("hello") + p.literal("world"); }); + + auto ctx = common_peg_parse_context("helloworld", false); + auto result = common_chat_combinator_parser.parse(ctx); + t.assert_equal("sequence_full_match", true, result.success()); + }); + + // Sequences - No Match + t.test("sequence_no_match", [&](testing & t) { + auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.literal("<think>") + p.literal("</think>"); }); + + auto ctx = common_peg_parse_context("<think>I am common_chat_combinator_parser", true); + auto result = parser.parse(ctx); + t.assert_equal("sequence_no_match", true, result.fail()); + }); + + // Choices - Partial Match 1 + t.test("choices_partial_match_1", [&](testing & t) { + auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.literal("option1") | p.literal("option2"); }); + + auto ctx = common_peg_parse_context("opt", true); + auto result = parser.parse(ctx); + t.assert_equal("choices_partial_match_1", true, result.need_more_input()); + }); + + // Choices - Partial Match 2 + t.test("choices_partial_match_2", [&](testing & t) { + auto parser = + build_peg_parser([](common_peg_parser_builder & p) { return p.literal("choice_a") | p.literal("choice_b"); }); + + auto ctx = common_peg_parse_context("choice", true); + auto result = parser.parse(ctx); + t.assert_equal("choices_partial_match_2", true, 
result.need_more_input()); + }); + + // Choices - Full Match 1 + t.test("choices_full_match_1", [&](testing & t) { + auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.literal("first") | p.literal("second"); }); + + auto ctx = common_peg_parse_context("first", false); + auto result = parser.parse(ctx); + t.assert_equal("choices_full_match_1", true, result.success()); + }); + + // Choices - Full Match 2 + t.test("choices_full_match_2", [&](testing & t) { + auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.literal("alpha") | p.literal("beta"); }); + + auto ctx = common_peg_parse_context("beta", false); + auto result = parser.parse(ctx); + t.assert_equal("choices_full_match_2", true, result.success()); + }); + + // Choices - No Match + t.test("choices_no_match", [&](testing & t) { + auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.literal("good") | p.literal("better"); }); + + auto ctx = common_peg_parse_context("best", false); + auto result = parser.parse(ctx); + t.assert_equal("choices_no_match", true, result.fail()); + }); + + // Zero or More - Partial Match 1 + t.test("zero_or_more_partial_match_1", [&](testing & t) { + auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.zero_or_more(p.literal("ab")); }); + + auto ctx = common_peg_parse_context("a", true); + auto result = parser.parse(ctx); + t.assert_equal("zero_or_more_partial_match_1", true, result.need_more_input()); + }); + + // Zero or More - Partial Match 2 + t.test("zero_or_more_partial_match_2", [&](testing & t) { + auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.zero_or_more(p.literal("xy")); }); + + auto ctx = common_peg_parse_context("xyx", true); + auto result = parser.parse(ctx); + t.assert_equal("zero_or_more_partial_match_2", true, result.need_more_input()); + }); + + // Zero or More - Full Match + t.test("zero_or_more_full_match", [&](testing & t) { + auto parser = 
build_peg_parser([](common_peg_parser_builder & p) { return p.zero_or_more(p.literal("test")); }); + + auto ctx = common_peg_parse_context("test", false); + auto result = parser.parse(ctx); + t.assert_equal("zero_or_more_full_match", true, result.success()); + }); + + // One or More - Partial Match 1 + t.test("one_or_more_partial_match_1", [&](testing & t) { + auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.one_or_more(p.literal("repeat")); }); + + auto ctx = common_peg_parse_context("rep", true); + auto result = parser.parse(ctx); + t.assert_equal("one_or_more_partial_match_1", true, result.need_more_input()); + }); + + // One or More - Partial Match 2 + t.test("one_or_more_partial_match_2", [&](testing & t) { + auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.one_or_more(p.literal("ab")); }); + + auto ctx = common_peg_parse_context("aba", true); + auto result = parser.parse(ctx); + t.assert_equal("one_or_more_partial_match_2", true, result.need_more_input()); + }); + + // One or More - Full Match + t.test("one_or_more_full_match", [&](testing & t) { + auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.one_or_more(p.literal("single")); }); + + auto ctx = common_peg_parse_context("single", false); + auto result = parser.parse(ctx); + t.assert_equal("one_or_more_full_match", true, result.success()); + }); + + // One or More - No Match + t.test("one_or_more_no_match", [&](testing & t) { + auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.one_or_more(p.literal("()")); }); + + auto ctx = common_peg_parse_context("success", false); + auto result = parser.parse(ctx); + t.assert_equal("one_or_more_no_match", true, result.fail()); + }); + }); + + + t.test("recursive rules", [](testing &t) { + // Test simple number + t.test("simple_number", [](testing &t) { + auto value_parser = build_peg_parser([](common_peg_parser_builder & p) { + p.rule("number", p.chars("0-9")); + 
p.rule("list", p.literal("[") + p.ref("value") + p.literal("]")); + return p.rule("value", p.ref("number") | p.ref("list")); + }); + + common_peg_parse_context ctx("1", false); + auto result = value_parser.parse(ctx); + + t.assert_equal("result_is_success", true, result.success()); + }); + + // Test simple list + t.test("simple_list", [](testing &t) { + auto value_parser = build_peg_parser([](common_peg_parser_builder & p) { + p.rule("number", p.chars("0-9")); + p.rule("list", p.literal("[") + p.ref("value") + p.literal("]")); + return p.rule("value", p.ref("number") | p.ref("list")); + }); + + common_peg_parse_context ctx("[1]", false); + auto result = value_parser.parse(ctx); + + t.assert_equal("result_is_success", true, result.success()); + }); + + // Test nested list + t.test("nested_list", [](testing &t) { + auto value_parser = build_peg_parser([](common_peg_parser_builder & p) { + p.rule("number", p.chars("0-9")); + p.rule("list", p.literal("[") + p.ref("value") + p.literal("]")); + return p.rule("value", p.ref("number") | p.ref("list")); + }); + + common_peg_parse_context ctx("[[2]]", false); + auto result = value_parser.parse(ctx); + + t.assert_equal("result_is_success", true, result.success()); + }); + + // Test deeply nested list + t.test("deeply_nested_list", [](testing &t) { + auto value_parser = build_peg_parser([](common_peg_parser_builder & p) { + p.rule("number", p.chars("0-9")); + p.rule("list", p.literal("[") + p.ref("value") + p.literal("]")); + return p.rule("value", p.ref("number") | p.ref("list")); + }); + + common_peg_parse_context ctx("[[[3]]]", false); + auto result = value_parser.parse(ctx); + + t.assert_equal("result_is_success", true, result.success()); + }); + + // Test need_more_input match + t.test("need_more_input_match", [](testing &t) { + auto value_parser = build_peg_parser([](common_peg_parser_builder & p) { + p.rule("number", p.chars("0-9")); + p.rule("list", p.literal("[") + p.ref("value") + p.literal("]")); + return 
p.rule("value", p.ref("number") | p.ref("list")); + }); + + common_peg_parse_context ctx("[[", true); + auto result = value_parser.parse(ctx); + + t.assert_equal("result_is_need_more_input", true, result.need_more_input()); + }); + + // Test no match + t.test("no_match", [](testing &t) { + auto value_parser = build_peg_parser([](common_peg_parser_builder & p) { + p.rule("number", p.chars("0-9")); + p.rule("list", p.literal("[") + p.ref("value") + p.literal("]")); + return p.rule("value", p.ref("number") | p.ref("list")); + }); + + common_peg_parse_context ctx("[a]", false); + auto result = value_parser.parse(ctx); + + t.assert_equal("result_is_fail", true, result.fail()); + }); + }); +} diff --git a/llama.cpp/tests/peg-parser/test-gbnf-generation.cpp b/llama.cpp/tests/peg-parser/test-gbnf-generation.cpp new file mode 100644 index 0000000..68857a5 --- /dev/null +++ b/llama.cpp/tests/peg-parser/test-gbnf-generation.cpp @@ -0,0 +1,250 @@ +#include "tests.h" + +#include "json-schema-to-grammar.h" + +#include <regex> + +static std::string trim_leading_space(const std::string & s) { + static const std::regex leading_ws_re = std::regex(R"((^|\n)\s+)"); + return std::regex_replace(s, leading_ws_re, "$1"); +} + +static void assert_gbnf_equal(testing & t, const std::string & expected, const std::string & actual) { + t.assert_equal("gbnf are equal", trim_leading_space(expected), trim_leading_space(actual)); +} + +void test_gbnf_generation(testing &t) { + t.test("literal grammar generation", [](testing &t) { + auto parser = build_peg_parser([](common_peg_parser_builder & p) { + return p.literal("hello"); + }); + + auto gbnf = build_grammar([&](const common_grammar_builder & builder) { + parser.build_grammar(builder); + }); + + assert_gbnf_equal(t, R"""( + root ::= "hello" + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""", gbnf); + }); + + t.test("char class grammar", [](testing &t) { + auto parser = build_peg_parser([](common_peg_parser_builder & p) { + return p.chars("[a-z]", 
1, 1); + }); + + auto gbnf = build_grammar([&](const common_grammar_builder & builder) { + parser.build_grammar(builder); + }); + + assert_gbnf_equal(t, R"""( + root ::= [a-z] + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""", gbnf); + }); + + t.test("sequence grammar", [](testing &t) { + auto parser = build_peg_parser([](common_peg_parser_builder & p) { + return p.literal("hello") + p.literal(" ") + p.literal("world"); + }); + + auto gbnf = build_grammar([&](const common_grammar_builder & builder) { + parser.build_grammar(builder); + }); + + assert_gbnf_equal(t, R"""( + root ::= "hello" " " "world" + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""", gbnf); + }); + + t.test("choice grammar", [](testing &t) { + auto parser = build_peg_parser([](common_peg_parser_builder & p) { + return p.literal("cat") | p.literal("dog"); + }); + + auto gbnf = build_grammar([&](const common_grammar_builder & builder) { + parser.build_grammar(builder); + }); + + assert_gbnf_equal(t, R"""( + root ::= "cat" | "dog" + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""", gbnf); + }); + + t.test("one_or_more grammar", [](testing &t) { + auto parser = build_peg_parser([](common_peg_parser_builder & p) { + return p.one_or_more(p.literal("a")); + }); + + auto gbnf = build_grammar([&](const common_grammar_builder & builder) { + parser.build_grammar(builder); + }); + + assert_gbnf_equal(t, R"""( + root ::= "a"+ + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""", gbnf); + }); + + t.test("zero_or_more grammar", [](testing &t) { + auto parser = build_peg_parser([](common_peg_parser_builder & p) { + return p.zero_or_more(p.literal("a")); + }); + + auto gbnf = build_grammar([&](const common_grammar_builder & builder) { + parser.build_grammar(builder); + }); + + assert_gbnf_equal(t, R"""( + root ::= "a"* + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""", gbnf); + }); + + t.test("optional grammar", [](testing &t) { + auto parser = build_peg_parser([](common_peg_parser_builder & p) { + return p.literal("hello") 
+ p.optional(p.literal(" world")); + }); + + auto gbnf = build_grammar([&](const common_grammar_builder & builder) { + parser.build_grammar(builder); + }); + + assert_gbnf_equal(t, R"""( + root ::= "hello" " world"? + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""", gbnf); + }); + + t.test("until grammar", [](testing &t) { + auto parser = build_peg_parser([](common_peg_parser_builder & p) { + return p.until("</tag>"); + }); + + auto gbnf = build_grammar([&](const common_grammar_builder & builder) { + parser.build_grammar(builder); + }); + + assert_gbnf_equal(t, R"""( + root ::= ([^<] | "<" [^/] | "</" [^t] | "</t" [^a] | "</ta" [^g] | "</tag" [^>])* + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""", gbnf); + }); + + t.test("complex expressions with parentheses", [](testing &t) { + auto parser = build_peg_parser([](common_peg_parser_builder & p) { + return p.one_or_more(p.literal("a") | p.literal("b")); + }); + + auto gbnf = build_grammar([&](const common_grammar_builder & builder) { + parser.build_grammar(builder); + }); + + assert_gbnf_equal(t, R"""( + root ::= ("a" | "b")+ + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""", gbnf); + }); + + t.test("rule references", [](testing &t) { + auto parser = build_peg_parser([](common_peg_parser_builder & p) { + auto digit = p.rule("digit", p.chars("[0-9]", 1, 1)); + return p.one_or_more(digit); + }); + + auto gbnf = build_grammar([&](const common_grammar_builder & builder) { + parser.build_grammar(builder); + }); + + assert_gbnf_equal(t, R"""( + digit ::= [0-9] + root ::= digit+ + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""", gbnf); + }); + + t.test("escaping in literals", [](testing &t) { + auto parser = build_peg_parser([](common_peg_parser_builder & p) { + return p.literal("hello\nworld\n!"); + }); + + auto gbnf = build_grammar([&](const common_grammar_builder & builder) { + parser.build_grammar(builder); + }); + + assert_gbnf_equal(t, R"""( + root ::= "hello\nworld\n!" 
+ space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""", gbnf); + }); + + t.test("operator<< (whitespace insertion)", [](testing &t) { + auto parser = build_peg_parser([](common_peg_parser_builder & p) { + return p.literal("hello") << p.literal("world"); + }); + + auto gbnf = build_grammar([&](const common_grammar_builder & builder) { + parser.build_grammar(builder); + }); + + assert_gbnf_equal(t, R"""( + root ::= "hello" space "world" + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""", gbnf); + }); + + t.test("emit only reachable rules", [](testing &t) { + auto parser = build_peg_parser([](common_peg_parser_builder & p) { + p.rule("orphan", p.literal("orphan")); + return p.literal("hello") + p.rule("child", p.literal(" world")); + }); + + auto gbnf = build_grammar([&](const common_grammar_builder & builder) { + parser.build_grammar(builder); + }); + + assert_gbnf_equal(t, R"""( + child ::= " world" + root ::= "hello" child + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""", gbnf); + }); + + t.test("emit only trigger rules (and references)", [](testing &t) { + auto parser = build_peg_parser([](common_peg_parser_builder & p) { + auto rule1 = p.rule("rule-1", p.literal("a") + p.ref("rule-2")); + p.rule("rule-2", p.literal("b") + p.ref("rule-3"), true); + p.rule("rule-3", p.literal("c") + p.ref("rule-4")); + p.rule("rule-4", p.literal("d"), true); + return rule1; + }); + + auto gbnf = build_grammar([&](const common_grammar_builder & builder) { + parser.build_grammar(builder); + }); + + assert_gbnf_equal(t, R"""( + root ::= rule-1 + rule-1 ::= "a" rule-2 + rule-2 ::= "b" rule-3 + rule-3 ::= "c" rule-4 + rule-4 ::= "d" + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""", gbnf); + + auto gbnf_lazy = build_grammar([&](const common_grammar_builder & builder) { + parser.build_grammar(builder, true); + }); + + assert_gbnf_equal(t, R"""( + root ::= rule-2 | rule-4 + rule-2 ::= "b" rule-3 + rule-3 ::= "c" rule-4 + rule-4 ::= "d" + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""", gbnf_lazy); 
+ }); +} diff --git a/llama.cpp/tests/peg-parser/test-json-parser.cpp b/llama.cpp/tests/peg-parser/test-json-parser.cpp new file mode 100644 index 0000000..48351cd --- /dev/null +++ b/llama.cpp/tests/peg-parser/test-json-parser.cpp @@ -0,0 +1,109 @@ +#include "tests.h" + +void test_json_parser(testing &t) { + // Test parsing a simple JSON object + t.test("simple JSON object parsing", [](testing &t) { + auto json = build_peg_parser([](common_peg_parser_builder & p) { return p.json(); }); + + std::string input = R"({"name": "test", "value": 42, "flag": true})"; + common_peg_parse_context ctx(input); + + auto result = json.parse(ctx); + + t.assert_equal("result_is_success", true, result.success()); + t.assert_equal("result_end", input.size(), result.end); + }); + + // Test parsing a JSON array with mixed types + t.test("JSON array with mixed types", [](testing &t) { + auto json = build_peg_parser([](common_peg_parser_builder & p) { return p.json(); }); + + std::string input = R"([1, "hello", true, null, 3.14])"; + common_peg_parse_context ctx(input); + + auto result = json.parse(ctx); + + t.assert_equal("result_is_success", true, result.success()); + t.assert_equal("result_end", input.size(), result.end); + }); + + // Test parsing nested JSON with objects and arrays + t.test("nested JSON with objects and arrays", [](testing &t) { + auto json = build_peg_parser([](common_peg_parser_builder & p) { return p.json(); }); + + std::string input = + R"({"users": [{"id": 1, "name": "Alice"}, {"id": 2, "name": "Bob"}], "count": 2, "metadata": {"version": "1.0", "tags": ["admin", "user"]}})"; + common_peg_parse_context ctx(input); + + auto result = json.parse(ctx); + + t.assert_equal("result_is_success", true, result.success()); + t.assert_equal("result_end", input.size(), result.end); + }); + + // Test need_more_input() parsing - incomplete object + t.test("need_more_input() parsing - incomplete object", [](testing &t) { + auto json = 
build_peg_parser([](common_peg_parser_builder & p) { return p.json(); }); + + std::string input = R"({"name": "test", "value": )"; + common_peg_parse_context ctx(input, true); + + auto result = json.parse(ctx); + + t.assert_equal("result_is_need_more_input", true, result.need_more_input()); + }); + + // Test need_more_input() parsing - incomplete array + t.test("need_more_input() parsing - incomplete array", [](testing &t) { + auto json = build_peg_parser([](common_peg_parser_builder & p) { return p.json(); }); + + std::string input = R"([1, 2, 3, )"; + common_peg_parse_context ctx(input, true); + + auto result = json.parse(ctx); + + t.assert_equal("result_is_need_more_input", true, result.need_more_input()); + }); + + // Test need_more_input() parsing - incomplete nested structure + t.test("need_more_input() parsing - incomplete nested structure", [](testing &t) { + auto json = build_peg_parser([](common_peg_parser_builder & p) { return p.json(); }); + + std::string input = R"({"data": {"nested": )"; + common_peg_parse_context ctx(input, true); + + auto result = json.parse(ctx); + + t.assert_equal("result_is_need_more_input", true, result.need_more_input()); + }); + + t.test("object member", [](testing &t) { + auto parser = build_peg_parser([](common_peg_parser_builder & p) { + return p.json_member("name", "\"" + p.chars("[a-z]") + "\""); + }); + + t.test("success", [&](testing &t) { + std::string input = R"("name": "bob")"; + common_peg_parse_context ctx(input, false); + + auto result = parser.parse(ctx); + t.assert_true("success", result.success()); + }); + + t.test("partial", [&](testing &t) { + std::string input = R"("name": "bo)"; + common_peg_parse_context ctx(input, true); + + auto result = parser.parse(ctx); + t.assert_true("need more input", result.need_more_input()); + }); + + t.test("failed", [&](testing &t) { + std::string input = R"([])"; + common_peg_parse_context ctx(input, false); + + auto result = parser.parse(ctx); + t.assert_true("fail", 
result.fail()); + }); + }); +} diff --git a/llama.cpp/tests/peg-parser/test-json-serialization.cpp b/llama.cpp/tests/peg-parser/test-json-serialization.cpp new file mode 100644 index 0000000..a858010 --- /dev/null +++ b/llama.cpp/tests/peg-parser/test-json-serialization.cpp @@ -0,0 +1,28 @@ +#include "tests.h" + +void test_json_serialization(testing &t) { + auto original = build_peg_parser([](common_peg_parser_builder & p) { + return "<tool_call>" + p.json() + "</tool_call>"; + }); + + auto json_serialized = original.to_json().dump(); + + t.test("compare before/after", [&](testing &t) { + auto deserialized = common_peg_arena::from_json(nlohmann::json::parse(json_serialized)); + + // Test complex JSON + std::string input = R"({"name": "test", "values": [1, 2, 3], "nested": {"a": true}})"; + common_peg_parse_context ctx1(input); + common_peg_parse_context ctx2(input); + + auto result1 = original.parse(ctx1); + auto result2 = deserialized.parse(ctx2); + + t.assert_equal("both_succeed", result1.success(), result2.success()); + t.assert_equal("same_end_pos", result1.end, result2.end); + }); + + t.bench("deserialize", [&]() { + auto deserialized = common_peg_arena::from_json(nlohmann::json::parse(json_serialized)); + }, 100); +} diff --git a/llama.cpp/tests/peg-parser/test-unicode.cpp b/llama.cpp/tests/peg-parser/test-unicode.cpp new file mode 100644 index 0000000..19d9b9e --- /dev/null +++ b/llama.cpp/tests/peg-parser/test-unicode.cpp @@ -0,0 +1,449 @@ +#include "tests.h" + +#include "peg-parser.h" + +#include <string> +#include <sstream> +#include <iomanip> +#include <cctype> + +static void assert_result_equal(testing & t, common_peg_parse_result_type expected, common_peg_parse_result_type actual) { + t.assert_equal(common_peg_parse_result_type_name(expected), common_peg_parse_result_type_name(actual)); +} + +static std::string hex_dump(const std::string& str) { + std::ostringstream oss; + for (unsigned char c : str) { + if (std::isprint(c)) { + oss << c; + } else { + 
oss << "\\x" << std::hex << std::setw(2) << std::setfill('0') << static_cast<int>(c); + } + } + return oss.str(); +} + +void test_unicode(testing &t) { + struct test_case { + std::string input; + std::string expected_text; + common_peg_parse_result_type expected_result; + }; + + t.test("any", [](testing &t) { + std::vector<test_case> test_cases { + // Valid UTF-8 sequences + {"Hello", "Hello", COMMON_PEG_PARSE_RESULT_SUCCESS}, + {std::string("Caf\xC3\xA9"), std::string("Caf\xC3\xA9"), COMMON_PEG_PARSE_RESULT_SUCCESS}, + {std::string("\xE4\xBD\xA0\xE5\xA5\xBD"), std::string("\xE4\xBD\xA0\xE5\xA5\xBD"), COMMON_PEG_PARSE_RESULT_SUCCESS}, + {std::string("\xF0\x9F\x9A\x80"), std::string("\xF0\x9F\x9A\x80"), COMMON_PEG_PARSE_RESULT_SUCCESS}, + + // Incomplete UTF-8 sequences (partial bytes at end) + {std::string("Caf\xC3"), "Caf", COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT}, + {std::string("\xE4\xBD"), "", COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT}, + {std::string("\xF0\x9F\x9A"), "", COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT}, + + // Invalid/malformed UTF-8 sequences + {std::string("\xFF\xFE"), "", COMMON_PEG_PARSE_RESULT_FAIL}, + {std::string("Hello\x80World"), "Hello", COMMON_PEG_PARSE_RESULT_FAIL}, + {std::string("\xC3\x28"), "", COMMON_PEG_PARSE_RESULT_FAIL}, + }; + + auto parser = build_peg_parser([](common_peg_parser_builder& p) { + return p.sequence({p.one_or_more(p.any()), p.end()}); + }); + + for (size_t i = 0; i < test_cases.size(); i++) { + const auto & tc = test_cases[i]; + std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input); + + t.test(test_name, [&](testing &t) { + common_peg_parse_context ctx(tc.input, true); + auto result = parser.parse(ctx); + + // Assert result type matches + assert_result_equal(t, tc.expected_result, result.type); + + // Assert matched text if success or need_more_input + if (result.success() || result.need_more_input()) { + std::string matched = tc.input.substr(result.start, result.end - result.start); + 
t.assert_equal(tc.expected_text, matched); + } + }); + } + }); + + t.test("char classes", [](testing &t) { + t.test("unicode range U+4E00-U+9FFF (CJK)", [](testing &t) { + std::vector<test_case> test_cases { + // Within range - CJK Unified Ideographs + {std::string("\xE4\xB8\x80"), std::string("\xE4\xB8\x80"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+4E00 + {std::string("\xE4\xBD\xA0"), std::string("\xE4\xBD\xA0"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+4F60 + {std::string("\xE5\xA5\xBD"), std::string("\xE5\xA5\xBD"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+597D + {std::string("\xE9\xBF\xBF"), std::string("\xE9\xBF\xBF"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+9FFF + + // Outside range - should fail + {"a", "", COMMON_PEG_PARSE_RESULT_FAIL}, // ASCII + {std::string("\xE4\xB7\xBF"), "", COMMON_PEG_PARSE_RESULT_FAIL}, // U+4DFF (before range) + {std::string("\xEA\x80\x80"), "", COMMON_PEG_PARSE_RESULT_FAIL}, // U+A000 (after range) + + // Incomplete sequences in range + {std::string("\xE4\xB8"), "", COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT}, // Incomplete U+4E00 + {std::string("\xE5\xA5"), "", COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT}, // Incomplete U+597D + }; + + auto parser = build_peg_parser([](common_peg_parser_builder& p) { + return p.sequence({p.chars(R"([\u4E00-\u9FFF])"), p.end()}); + }); + + for (size_t i = 0; i < test_cases.size(); i++) { + const auto & tc = test_cases[i]; + std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input); + + t.test(test_name, [&](testing &t) { + common_peg_parse_context ctx(tc.input, true); + auto result = parser.parse(ctx); + + // Assert result type matches + assert_result_equal(t, tc.expected_result, result.type); + + // Assert matched text if success or need_more_input + if (result.success() || result.need_more_input()) { + std::string matched = tc.input.substr(result.start, result.end - result.start); + t.assert_equal(tc.expected_text, matched); + } + }); + } + }); + + t.test("unicode range U+1F600-U+1F64F 
(emoticons)", [](testing &t) { + std::vector<test_case> test_cases { + // Within range - Emoticons (all 4-byte UTF-8) + {std::string("\xF0\x9F\x98\x80"), std::string("\xF0\x9F\x98\x80"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+1F600 + {std::string("\xF0\x9F\x98\x81"), std::string("\xF0\x9F\x98\x81"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+1F601 + {std::string("\xF0\x9F\x99\x8F"), std::string("\xF0\x9F\x99\x8F"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+1F64F + + // Outside range + {std::string("\xF0\x9F\x97\xBF"), "", COMMON_PEG_PARSE_RESULT_FAIL}, // U+1F5FF (before range) + {std::string("\xF0\x9F\x99\x90"), "", COMMON_PEG_PARSE_RESULT_FAIL}, // U+1F650 (after range) + {std::string("\xF0\x9F\x9A\x80"), "", COMMON_PEG_PARSE_RESULT_FAIL}, // U+1F680 (outside range) + + // Incomplete sequences + {std::string("\xF0\x9F\x98"), "", COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT}, // Incomplete emoji + {std::string("\xF0\x9F"), "", COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT}, // Very incomplete + }; + + auto parser = build_peg_parser([](common_peg_parser_builder& p) { + return p.sequence({p.chars(R"([\U0001F600-\U0001F64F])"), p.end()}); + }); + + for (size_t i = 0; i < test_cases.size(); i++) { + const auto & tc = test_cases[i]; + std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input); + + t.test(test_name, [&](testing &t) { + common_peg_parse_context ctx(tc.input, true); + auto result = parser.parse(ctx); + + // Assert result type matches + assert_result_equal(t, tc.expected_result, result.type); + + // Assert matched text if success or need_more_input + if (result.success() || result.need_more_input()) { + std::string matched = tc.input.substr(result.start, result.end - result.start); + t.assert_equal(tc.expected_text, matched); + } + }); + } + }); + + t.test("mixed unicode ranges", [](testing &t) { + std::vector<test_case> test_cases { + // Match CJK + {std::string("\xE4\xB8\x80"), std::string("\xE4\xB8\x80"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+4E00 + 
{std::string("\xE4\xBD\xA0"), std::string("\xE4\xBD\xA0"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+4F60 + + // Match emoticons + {std::string("\xF0\x9F\x98\x80"), std::string("\xF0\x9F\x98\x80"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+1F600 + + // Match ASCII digits + {"5", "5", COMMON_PEG_PARSE_RESULT_SUCCESS}, + + // Don't match outside any range + {"a", "", COMMON_PEG_PARSE_RESULT_FAIL}, + {std::string("\xF0\x9F\x9A\x80"), "", COMMON_PEG_PARSE_RESULT_FAIL}, // U+1F680 + + // Incomplete + {std::string("\xE4\xB8"), "", COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT}, + {std::string("\xF0\x9F\x98"), "", COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT}, + }; + + auto parser = build_peg_parser([](common_peg_parser_builder& p) { + return p.sequence({p.chars(R"([\u4E00-\u9FFF\U0001F600-\U0001F64F0-9])"), p.end()}); + }); + + for (size_t i = 0; i < test_cases.size(); i++) { + const auto & tc = test_cases[i]; + std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input); + + t.test(test_name, [&](testing &t) { + common_peg_parse_context ctx(tc.input, true); + auto result = parser.parse(ctx); + + // Assert result type matches + assert_result_equal(t, tc.expected_result, result.type); + + // Assert matched text if success or need_more_input + if (result.success() || result.need_more_input()) { + std::string matched = tc.input.substr(result.start, result.end - result.start); + t.assert_equal(tc.expected_text, matched); + } + }); + } + }); + }); + + t.test("until parser", [](testing &t) { + t.test("ASCII delimiter with Unicode content", [](testing &t) { + std::vector<test_case> test_cases { + // CJK characters before delimiter + {std::string("\xE4\xBD\xA0\xE5\xA5\xBD</tag>"), std::string("\xE4\xBD\xA0\xE5\xA5\xBD"), COMMON_PEG_PARSE_RESULT_SUCCESS}, + + // Emoji before delimiter + {std::string("\xF0\x9F\x98\x80</tag>"), std::string("\xF0\x9F\x98\x80"), COMMON_PEG_PARSE_RESULT_SUCCESS}, + + // Mixed content + {std::string("Hello \xE4\xB8\x96\xE7\x95\x8C!</tag>"), 
std::string("Hello \xE4\xB8\x96\xE7\x95\x8C!"), COMMON_PEG_PARSE_RESULT_SUCCESS}, + }; + + auto parser = build_peg_parser([](common_peg_parser_builder& p) { + return p.until("</tag>"); + }); + + for (size_t i = 0; i < test_cases.size(); i++) { + const auto & tc = test_cases[i]; + std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input); + + t.test(test_name, [&](testing &t) { + common_peg_parse_context ctx(tc.input, false); + auto result = parser.parse(ctx); + + assert_result_equal(t, tc.expected_result, result.type); + + if (result.success()) { + std::string matched = tc.input.substr(result.start, result.end - result.start); + t.assert_equal(tc.expected_text, matched); + } + }); + } + }); + + t.test("incomplete UTF-8 at end", [](testing &t) { + std::vector<test_case> test_cases { + // Incomplete emoji at end, no delimiter + {std::string("content\xF0\x9F\x98"), std::string("content"), COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT}, + + // Incomplete CJK at end, no delimiter + {std::string("hello\xE4\xB8"), std::string("hello"), COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT}, + + // Complete content, no delimiter (should consume all valid UTF-8) + {std::string("\xE4\xBD\xA0\xE5\xA5\xBD"), std::string("\xE4\xBD\xA0\xE5\xA5\xBD"), COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT}, + }; + + auto parser = build_peg_parser([](common_peg_parser_builder& p) { + return p.until("</tag>"); + }); + + for (size_t i = 0; i < test_cases.size(); i++) { + const auto & tc = test_cases[i]; + std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input); + + t.test(test_name, [&](testing &t) { + common_peg_parse_context ctx(tc.input, true); + auto result = parser.parse(ctx); + + assert_result_equal(t, tc.expected_result, result.type); + + if (result.success() || result.need_more_input()) { + std::string matched = tc.input.substr(result.start, result.end - result.start); + t.assert_equal(tc.expected_text, matched); + } + }); + } + }); + + t.test("malformed UTF-8", 
[](testing &t) { + std::vector<test_case> test_cases { + // Invalid UTF-8 bytes + {std::string("Hello\xFF\xFE"), "", COMMON_PEG_PARSE_RESULT_FAIL}, + + // Continuation byte without lead byte + {std::string("Hello\x80World"), "", COMMON_PEG_PARSE_RESULT_FAIL}, + + // Invalid continuation byte + {std::string("\xC3\x28"), "", COMMON_PEG_PARSE_RESULT_FAIL}, + }; + + auto parser = build_peg_parser([](common_peg_parser_builder& p) { + return p.until("</tag>"); + }); + + for (size_t i = 0; i < test_cases.size(); i++) { + const auto & tc = test_cases[i]; + std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input); + + t.test(test_name, [&](testing &t) { + common_peg_parse_context ctx(tc.input, false); + auto result = parser.parse(ctx); + + assert_result_equal(t, tc.expected_result, result.type); + }); + } + }); + }); + + t.test("json_string parser", [](testing &t) { + t.test("valid UTF-8 characters", [](testing &t) { + std::vector<test_case> test_cases { + // ASCII only + {"Hello World\"", "Hello World", COMMON_PEG_PARSE_RESULT_SUCCESS}, + + // 2-byte UTF-8 (accented characters) + {std::string("Caf\xC3\xA9\""), std::string("Caf\xC3\xA9"), COMMON_PEG_PARSE_RESULT_SUCCESS}, + + // 3-byte UTF-8 (CJK) + {std::string("\xE4\xBD\xA0\xE5\xA5\xBD\""), std::string("\xE4\xBD\xA0\xE5\xA5\xBD"), COMMON_PEG_PARSE_RESULT_SUCCESS}, + + // 4-byte UTF-8 (emoji) + {std::string("\xF0\x9F\x98\x80\""), std::string("\xF0\x9F\x98\x80"), COMMON_PEG_PARSE_RESULT_SUCCESS}, + + // Mixed content + {std::string("Hello \xE4\xB8\x96\xE7\x95\x8C!\""), std::string("Hello \xE4\xB8\x96\xE7\x95\x8C!"), COMMON_PEG_PARSE_RESULT_SUCCESS}, + }; + + for (size_t i = 0; i < test_cases.size(); i++) { + const auto & tc = test_cases[i]; + std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input); + + t.test(test_name, [&](testing &t) { + auto parser = build_peg_parser([](common_peg_parser_builder& p) { + return p.sequence({p.json_string_content(), p.literal("\"")}); + }); + + 
common_peg_parse_context ctx(tc.input, false); + auto result = parser.parse(ctx); + + assert_result_equal(t, tc.expected_result, result.type); + + if (result.success()) { + std::string matched = tc.input.substr(result.start, result.end - result.start - 1); // -1 to exclude closing quote + t.assert_equal(tc.expected_text, matched); + } + }); + } + }); + + t.test("incomplete UTF-8", [](testing &t) { + std::vector<test_case> test_cases { + // Incomplete 2-byte sequence + {std::string("Caf\xC3"), std::string("Caf"), COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT}, + + // Incomplete 3-byte sequence + {std::string("Hello\xE4\xB8"), std::string("Hello"), COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT}, + + // Incomplete 4-byte sequence + {std::string("Text\xF0\x9F\x98"), std::string("Text"), COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT}, + + // Incomplete at very start + {std::string("\xE4\xBD"), std::string(""), COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT}, + }; + + for (size_t i = 0; i < test_cases.size(); i++) { + const auto & tc = test_cases[i]; + std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input); + + t.test(test_name, [&](testing &t) { + auto parser = build_peg_parser([](common_peg_parser_builder& p) { + return p.json_string_content(); + }); + + common_peg_parse_context ctx(tc.input, true); + auto result = parser.parse(ctx); + + assert_result_equal(t, tc.expected_result, result.type); + + if (result.need_more_input()) { + std::string matched = tc.input.substr(result.start, result.end - result.start); + t.assert_equal(tc.expected_text, matched); + } + }); + } + }); + + t.test("malformed UTF-8", [](testing &t) { + std::vector<test_case> test_cases { + // Invalid UTF-8 bytes + {std::string("Hello\xFF\xFE"), "", COMMON_PEG_PARSE_RESULT_FAIL}, + + // Continuation byte without lead byte + {std::string("Hello\x80World"), "", COMMON_PEG_PARSE_RESULT_FAIL}, + + // Invalid continuation byte + {std::string("\xC3\x28"), "", COMMON_PEG_PARSE_RESULT_FAIL}, + + // Overlong 
encoding (security issue) + {std::string("\xC0\x80"), "", COMMON_PEG_PARSE_RESULT_FAIL}, + }; + + for (size_t i = 0; i < test_cases.size(); i++) { + const auto & tc = test_cases[i]; + std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input); + + t.test(test_name, [&](testing &t) { + auto parser = build_peg_parser([](common_peg_parser_builder& p) { + return p.json_string_content(); + }); + + common_peg_parse_context ctx(tc.input, false); + auto result = parser.parse(ctx); + + assert_result_equal(t, tc.expected_result, result.type); + }); + } + }); + + t.test("escape sequences with UTF-8", [](testing &t) { + std::vector<test_case> test_cases { + // Unicode escape sequence + {"Hello\\u0041\"", "Hello\\u0041", COMMON_PEG_PARSE_RESULT_SUCCESS}, + + // Mix of UTF-8 and escape sequences + {std::string("\xE4\xBD\xA0\\n\xE5\xA5\xBD\""), std::string("\xE4\xBD\xA0\\n\xE5\xA5\xBD"), COMMON_PEG_PARSE_RESULT_SUCCESS}, + + // Escaped quote in UTF-8 string + {std::string("\xE4\xBD\xA0\\\"\xE5\xA5\xBD\""), std::string("\xE4\xBD\xA0\\\"\xE5\xA5\xBD"), COMMON_PEG_PARSE_RESULT_SUCCESS}, + }; + + for (size_t i = 0; i < test_cases.size(); i++) { + const auto & tc = test_cases[i]; + std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input); + + t.test(test_name, [&](testing &t) { + auto parser = build_peg_parser([](common_peg_parser_builder& p) { + return p.sequence({p.json_string_content(), p.literal("\"")}); + }); + + common_peg_parse_context ctx(tc.input, false); + auto result = parser.parse(ctx); + + assert_result_equal(t, tc.expected_result, result.type); + + if (result.success()) { + std::string matched = tc.input.substr(result.start, result.end - result.start - 1); // -1 to exclude closing quote + t.assert_equal(tc.expected_text, matched); + } + }); + } + }); + }); +} diff --git a/llama.cpp/tests/peg-parser/tests.h b/llama.cpp/tests/peg-parser/tests.h new file mode 100644 index 0000000..4d3f4e9 --- /dev/null +++ 
b/llama.cpp/tests/peg-parser/tests.h @@ -0,0 +1,24 @@ +#pragma once + +// Common includes for all test files +#include <nlohmann/json.hpp> +#include <string> +#include <vector> + +#include "../testing.h" +#include "peg-parser.h" +#include "chat-peg-parser.h" +#include "simple-tokenize.h" + +struct bench_tool_call { + std::string id; + std::string name; + nlohmann::ordered_json args; +}; + +// Test function declarations +void test_basic(testing &t); +void test_json_parser(testing &t); +void test_gbnf_generation(testing &t); +void test_unicode(testing &t); +void test_json_serialization(testing &t); |
