1#include "tests.h"
  2
  3#include "peg-parser.h"
  4
  5#include <string>
  6#include <sstream>
  7#include <iomanip>
  8#include <cctype>
  9
 10static void assert_result_equal(testing & t, common_peg_parse_result_type expected, common_peg_parse_result_type actual) {
 11    t.assert_equal(common_peg_parse_result_type_name(expected), common_peg_parse_result_type_name(actual));
 12}
 13
 14static std::string hex_dump(const std::string& str) {
 15    std::ostringstream oss;
 16    for (unsigned char c : str) {
 17        if (std::isprint(c)) {
 18            oss << c;
 19        } else {
 20            oss << "\\x" << std::hex << std::setw(2) << std::setfill('0') << static_cast<int>(c);
 21        }
 22    }
 23    return oss.str();
 24}
 25
 26void test_unicode(testing &t) {
 27    struct test_case {
 28        std::string input;
 29        std::string expected_text;
 30        common_peg_parse_result_type expected_result;
 31    };
 32
 33    t.test("any", [](testing &t) {
 34        std::vector<test_case> test_cases {
 35            // Valid UTF-8 sequences
 36            {"Hello", "Hello", COMMON_PEG_PARSE_RESULT_SUCCESS},
 37            {std::string("Caf\xC3\xA9"), std::string("Caf\xC3\xA9"), COMMON_PEG_PARSE_RESULT_SUCCESS},
 38            {std::string("\xE4\xBD\xA0\xE5\xA5\xBD"), std::string("\xE4\xBD\xA0\xE5\xA5\xBD"), COMMON_PEG_PARSE_RESULT_SUCCESS},
 39            {std::string("\xF0\x9F\x9A\x80"), std::string("\xF0\x9F\x9A\x80"), COMMON_PEG_PARSE_RESULT_SUCCESS},
 40
 41            // Incomplete UTF-8 sequences (partial bytes at end)
 42            {std::string("Caf\xC3"), "Caf", COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
 43            {std::string("\xE4\xBD"), "", COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
 44            {std::string("\xF0\x9F\x9A"), "", COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
 45
 46            // Invalid/malformed UTF-8 sequences
 47            {std::string("\xFF\xFE"), "", COMMON_PEG_PARSE_RESULT_FAIL},
 48            {std::string("Hello\x80World"), "Hello", COMMON_PEG_PARSE_RESULT_FAIL},
 49            {std::string("\xC3\x28"), "", COMMON_PEG_PARSE_RESULT_FAIL},
 50        };
 51
 52        auto parser = build_peg_parser([](common_peg_parser_builder& p) {
 53            return p.sequence({p.one_or_more(p.any()), p.end()});
 54        });
 55
 56        for (size_t i = 0; i < test_cases.size(); i++) {
 57            const auto & tc = test_cases[i];
 58            std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input);
 59
 60            t.test(test_name, [&](testing &t) {
 61                common_peg_parse_context ctx(tc.input, true);
 62                auto result = parser.parse(ctx);
 63
 64                // Assert result type matches
 65                assert_result_equal(t, tc.expected_result, result.type);
 66
 67                // Assert matched text if success or need_more_input
 68                if (result.success() || result.need_more_input()) {
 69                    std::string matched = tc.input.substr(result.start, result.end - result.start);
 70                    t.assert_equal(tc.expected_text, matched);
 71                }
 72            });
 73        }
 74    });
 75
 76    t.test("char classes", [](testing &t) {
 77        t.test("unicode range U+4E00-U+9FFF (CJK)", [](testing &t) {
 78            std::vector<test_case> test_cases {
 79                // Within range - CJK Unified Ideographs
 80                {std::string("\xE4\xB8\x80"), std::string("\xE4\xB8\x80"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+4E00
 81                {std::string("\xE4\xBD\xA0"), std::string("\xE4\xBD\xA0"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+4F60
 82                {std::string("\xE5\xA5\xBD"), std::string("\xE5\xA5\xBD"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+597D
 83                {std::string("\xE9\xBF\xBF"), std::string("\xE9\xBF\xBF"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+9FFF
 84
 85                // Outside range - should fail
 86                {"a", "", COMMON_PEG_PARSE_RESULT_FAIL},                                                     // ASCII
 87                {std::string("\xE4\xB7\xBF"), "", COMMON_PEG_PARSE_RESULT_FAIL},                            // U+4DFF (before range)
 88                {std::string("\xEA\x80\x80"), "", COMMON_PEG_PARSE_RESULT_FAIL},                            // U+A000 (after range)
 89
 90                // Incomplete sequences in range
 91                {std::string("\xE4\xB8"), "", COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},                     // Incomplete U+4E00
 92                {std::string("\xE5\xA5"), "", COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},                     // Incomplete U+597D
 93            };
 94
 95            auto parser = build_peg_parser([](common_peg_parser_builder& p) {
 96                return p.sequence({p.chars(R"([\u4E00-\u9FFF])"), p.end()});
 97            });
 98
 99            for (size_t i = 0; i < test_cases.size(); i++) {
100                const auto & tc = test_cases[i];
101                std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input);
102
103                t.test(test_name, [&](testing &t) {
104                    common_peg_parse_context ctx(tc.input, true);
105                    auto result = parser.parse(ctx);
106
107                    // Assert result type matches
108                    assert_result_equal(t, tc.expected_result, result.type);
109
110                    // Assert matched text if success or need_more_input
111                    if (result.success() || result.need_more_input()) {
112                        std::string matched = tc.input.substr(result.start, result.end - result.start);
113                        t.assert_equal(tc.expected_text, matched);
114                    }
115                });
116            }
117        });
118
119        t.test("unicode range U+1F600-U+1F64F (emoticons)", [](testing &t) {
120            std::vector<test_case> test_cases {
121                // Within range - Emoticons (all 4-byte UTF-8)
122                {std::string("\xF0\x9F\x98\x80"), std::string("\xF0\x9F\x98\x80"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+1F600
123                {std::string("\xF0\x9F\x98\x81"), std::string("\xF0\x9F\x98\x81"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+1F601
124                {std::string("\xF0\x9F\x99\x8F"), std::string("\xF0\x9F\x99\x8F"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+1F64F
125
126                // Outside range
127                {std::string("\xF0\x9F\x97\xBF"), "", COMMON_PEG_PARSE_RESULT_FAIL}, // U+1F5FF (before range)
128                {std::string("\xF0\x9F\x99\x90"), "", COMMON_PEG_PARSE_RESULT_FAIL}, // U+1F650 (after range)
129                {std::string("\xF0\x9F\x9A\x80"), "", COMMON_PEG_PARSE_RESULT_FAIL}, // U+1F680 (outside range)
130
131                // Incomplete sequences
132                {std::string("\xF0\x9F\x98"), "", COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT}, // Incomplete emoji
133                {std::string("\xF0\x9F"), "", COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},     // Very incomplete
134            };
135
136            auto parser = build_peg_parser([](common_peg_parser_builder& p) {
137                return p.sequence({p.chars(R"([\U0001F600-\U0001F64F])"), p.end()});
138            });
139
140            for (size_t i = 0; i < test_cases.size(); i++) {
141                const auto & tc = test_cases[i];
142                std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input);
143
144                t.test(test_name, [&](testing &t) {
145                    common_peg_parse_context ctx(tc.input, true);
146                    auto result = parser.parse(ctx);
147
148                    // Assert result type matches
149                    assert_result_equal(t, tc.expected_result, result.type);
150
151                    // Assert matched text if success or need_more_input
152                    if (result.success() || result.need_more_input()) {
153                        std::string matched = tc.input.substr(result.start, result.end - result.start);
154                        t.assert_equal(tc.expected_text, matched);
155                    }
156                });
157            }
158        });
159
160        t.test("mixed unicode ranges", [](testing &t) {
161            std::vector<test_case> test_cases {
162                // Match CJK
163                {std::string("\xE4\xB8\x80"), std::string("\xE4\xB8\x80"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+4E00
164                {std::string("\xE4\xBD\xA0"), std::string("\xE4\xBD\xA0"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+4F60
165
166                // Match emoticons
167                {std::string("\xF0\x9F\x98\x80"), std::string("\xF0\x9F\x98\x80"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+1F600
168
169                // Match ASCII digits
170                {"5", "5", COMMON_PEG_PARSE_RESULT_SUCCESS},
171
172                // Don't match outside any range
173                {"a", "", COMMON_PEG_PARSE_RESULT_FAIL},
174                {std::string("\xF0\x9F\x9A\x80"), "", COMMON_PEG_PARSE_RESULT_FAIL}, // U+1F680
175
176                // Incomplete
177                {std::string("\xE4\xB8"), "", COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
178                {std::string("\xF0\x9F\x98"), "", COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
179            };
180
181            auto parser = build_peg_parser([](common_peg_parser_builder& p) {
182                return p.sequence({p.chars(R"([\u4E00-\u9FFF\U0001F600-\U0001F64F0-9])"), p.end()});
183            });
184
185            for (size_t i = 0; i < test_cases.size(); i++) {
186                const auto & tc = test_cases[i];
187                std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input);
188
189                t.test(test_name, [&](testing &t) {
190                    common_peg_parse_context ctx(tc.input, true);
191                    auto result = parser.parse(ctx);
192
193                    // Assert result type matches
194                    assert_result_equal(t, tc.expected_result, result.type);
195
196                    // Assert matched text if success or need_more_input
197                    if (result.success() || result.need_more_input()) {
198                        std::string matched = tc.input.substr(result.start, result.end - result.start);
199                        t.assert_equal(tc.expected_text, matched);
200                    }
201                });
202            }
203        });
204    });
205
206    t.test("until parser", [](testing &t) {
207        t.test("ASCII delimiter with Unicode content", [](testing &t) {
208            std::vector<test_case> test_cases {
209                // CJK characters before delimiter
210                {std::string("\xE4\xBD\xA0\xE5\xA5\xBD</tag>"), std::string("\xE4\xBD\xA0\xE5\xA5\xBD"), COMMON_PEG_PARSE_RESULT_SUCCESS},
211
212                // Emoji before delimiter
213                {std::string("\xF0\x9F\x98\x80</tag>"), std::string("\xF0\x9F\x98\x80"), COMMON_PEG_PARSE_RESULT_SUCCESS},
214
215                // Mixed content
216                {std::string("Hello \xE4\xB8\x96\xE7\x95\x8C!</tag>"), std::string("Hello \xE4\xB8\x96\xE7\x95\x8C!"), COMMON_PEG_PARSE_RESULT_SUCCESS},
217            };
218
219            auto parser = build_peg_parser([](common_peg_parser_builder& p) {
220                return p.until("</tag>");
221            });
222
223            for (size_t i = 0; i < test_cases.size(); i++) {
224                const auto & tc = test_cases[i];
225                std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input);
226
227                t.test(test_name, [&](testing &t) {
228                    common_peg_parse_context ctx(tc.input, false);
229                    auto result = parser.parse(ctx);
230
231                    assert_result_equal(t, tc.expected_result, result.type);
232
233                    if (result.success()) {
234                        std::string matched = tc.input.substr(result.start, result.end - result.start);
235                        t.assert_equal(tc.expected_text, matched);
236                    }
237                });
238            }
239        });
240
241        t.test("incomplete UTF-8 at end", [](testing &t) {
242            std::vector<test_case> test_cases {
243                // Incomplete emoji at end, no delimiter
244                {std::string("content\xF0\x9F\x98"), std::string("content"), COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
245
246                // Incomplete CJK at end, no delimiter
247                {std::string("hello\xE4\xB8"), std::string("hello"), COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
248
249                // Complete content, no delimiter (should consume all valid UTF-8)
250                {std::string("\xE4\xBD\xA0\xE5\xA5\xBD"), std::string("\xE4\xBD\xA0\xE5\xA5\xBD"), COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
251            };
252
253            auto parser = build_peg_parser([](common_peg_parser_builder& p) {
254                return p.until("</tag>");
255            });
256
257            for (size_t i = 0; i < test_cases.size(); i++) {
258                const auto & tc = test_cases[i];
259                std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input);
260
261                t.test(test_name, [&](testing &t) {
262                    common_peg_parse_context ctx(tc.input, true);
263                    auto result = parser.parse(ctx);
264
265                    assert_result_equal(t, tc.expected_result, result.type);
266
267                    if (result.success() || result.need_more_input()) {
268                        std::string matched = tc.input.substr(result.start, result.end - result.start);
269                        t.assert_equal(tc.expected_text, matched);
270                    }
271                });
272            }
273        });
274
275        t.test("malformed UTF-8", [](testing &t) {
276            std::vector<test_case> test_cases {
277                // Invalid UTF-8 bytes
278                {std::string("Hello\xFF\xFE"), "", COMMON_PEG_PARSE_RESULT_FAIL},
279
280                // Continuation byte without lead byte
281                {std::string("Hello\x80World"), "", COMMON_PEG_PARSE_RESULT_FAIL},
282
283                // Invalid continuation byte
284                {std::string("\xC3\x28"), "", COMMON_PEG_PARSE_RESULT_FAIL},
285            };
286
287            auto parser = build_peg_parser([](common_peg_parser_builder& p) {
288                return p.until("</tag>");
289            });
290
291            for (size_t i = 0; i < test_cases.size(); i++) {
292                const auto & tc = test_cases[i];
293                std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input);
294
295                t.test(test_name, [&](testing &t) {
296                    common_peg_parse_context ctx(tc.input, false);
297                    auto result = parser.parse(ctx);
298
299                    assert_result_equal(t, tc.expected_result, result.type);
300                });
301            }
302        });
303    });
304
305    t.test("json_string parser", [](testing &t) {
306        t.test("valid UTF-8 characters", [](testing &t) {
307            std::vector<test_case> test_cases {
308                // ASCII only
309                {"Hello World\"", "Hello World", COMMON_PEG_PARSE_RESULT_SUCCESS},
310
311                // 2-byte UTF-8 (accented characters)
312                {std::string("Caf\xC3\xA9\""), std::string("Caf\xC3\xA9"), COMMON_PEG_PARSE_RESULT_SUCCESS},
313
314                // 3-byte UTF-8 (CJK)
315                {std::string("\xE4\xBD\xA0\xE5\xA5\xBD\""), std::string("\xE4\xBD\xA0\xE5\xA5\xBD"), COMMON_PEG_PARSE_RESULT_SUCCESS},
316
317                // 4-byte UTF-8 (emoji)
318                {std::string("\xF0\x9F\x98\x80\""), std::string("\xF0\x9F\x98\x80"), COMMON_PEG_PARSE_RESULT_SUCCESS},
319
320                // Mixed content
321                {std::string("Hello \xE4\xB8\x96\xE7\x95\x8C!\""), std::string("Hello \xE4\xB8\x96\xE7\x95\x8C!"), COMMON_PEG_PARSE_RESULT_SUCCESS},
322            };
323
324            for (size_t i = 0; i < test_cases.size(); i++) {
325                const auto & tc = test_cases[i];
326                std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input);
327
328                t.test(test_name, [&](testing &t) {
329                    auto parser = build_peg_parser([](common_peg_parser_builder& p) {
330                        return p.sequence({p.json_string_content(), p.literal("\"")});
331                    });
332
333                    common_peg_parse_context ctx(tc.input, false);
334                    auto result = parser.parse(ctx);
335
336                    assert_result_equal(t, tc.expected_result, result.type);
337
338                    if (result.success()) {
339                        std::string matched = tc.input.substr(result.start, result.end - result.start - 1);  // -1 to exclude closing quote
340                        t.assert_equal(tc.expected_text, matched);
341                    }
342                });
343            }
344        });
345
346        t.test("incomplete UTF-8", [](testing &t) {
347            std::vector<test_case> test_cases {
348                // Incomplete 2-byte sequence
349                {std::string("Caf\xC3"), std::string("Caf"), COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
350
351                // Incomplete 3-byte sequence
352                {std::string("Hello\xE4\xB8"), std::string("Hello"), COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
353
354                // Incomplete 4-byte sequence
355                {std::string("Text\xF0\x9F\x98"), std::string("Text"), COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
356
357                // Incomplete at very start
358                {std::string("\xE4\xBD"), std::string(""), COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
359            };
360
361            for (size_t i = 0; i < test_cases.size(); i++) {
362                const auto & tc = test_cases[i];
363                std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input);
364
365                t.test(test_name, [&](testing &t) {
366                    auto parser = build_peg_parser([](common_peg_parser_builder& p) {
367                        return p.json_string_content();
368                    });
369
370                    common_peg_parse_context ctx(tc.input, true);
371                    auto result = parser.parse(ctx);
372
373                    assert_result_equal(t, tc.expected_result, result.type);
374
375                    if (result.need_more_input()) {
376                        std::string matched = tc.input.substr(result.start, result.end - result.start);
377                        t.assert_equal(tc.expected_text, matched);
378                    }
379                });
380            }
381        });
382
383        t.test("malformed UTF-8", [](testing &t) {
384            std::vector<test_case> test_cases {
385                // Invalid UTF-8 bytes
386                {std::string("Hello\xFF\xFE"), "", COMMON_PEG_PARSE_RESULT_FAIL},
387
388                // Continuation byte without lead byte
389                {std::string("Hello\x80World"), "", COMMON_PEG_PARSE_RESULT_FAIL},
390
391                // Invalid continuation byte
392                {std::string("\xC3\x28"), "", COMMON_PEG_PARSE_RESULT_FAIL},
393
394                // Overlong encoding (security issue)
395                {std::string("\xC0\x80"), "", COMMON_PEG_PARSE_RESULT_FAIL},
396            };
397
398            for (size_t i = 0; i < test_cases.size(); i++) {
399                const auto & tc = test_cases[i];
400                std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input);
401
402                t.test(test_name, [&](testing &t) {
403                    auto parser = build_peg_parser([](common_peg_parser_builder& p) {
404                        return p.json_string_content();
405                    });
406
407                    common_peg_parse_context ctx(tc.input, false);
408                    auto result = parser.parse(ctx);
409
410                    assert_result_equal(t, tc.expected_result, result.type);
411                });
412            }
413        });
414
415        t.test("escape sequences with UTF-8", [](testing &t) {
416            std::vector<test_case> test_cases {
417                // Unicode escape sequence
418                {"Hello\\u0041\"", "Hello\\u0041", COMMON_PEG_PARSE_RESULT_SUCCESS},
419
420                // Mix of UTF-8 and escape sequences
421                {std::string("\xE4\xBD\xA0\\n\xE5\xA5\xBD\""), std::string("\xE4\xBD\xA0\\n\xE5\xA5\xBD"), COMMON_PEG_PARSE_RESULT_SUCCESS},
422
423                // Escaped quote in UTF-8 string
424                {std::string("\xE4\xBD\xA0\\\"\xE5\xA5\xBD\""), std::string("\xE4\xBD\xA0\\\"\xE5\xA5\xBD"), COMMON_PEG_PARSE_RESULT_SUCCESS},
425            };
426
427            for (size_t i = 0; i < test_cases.size(); i++) {
428                const auto & tc = test_cases[i];
429                std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input);
430
431                t.test(test_name, [&](testing &t) {
432                    auto parser = build_peg_parser([](common_peg_parser_builder& p) {
433                        return p.sequence({p.json_string_content(), p.literal("\"")});
434                    });
435
436                    common_peg_parse_context ctx(tc.input, false);
437                    auto result = parser.parse(ctx);
438
439                    assert_result_equal(t, tc.expected_result, result.type);
440
441                    if (result.success()) {
442                        std::string matched = tc.input.substr(result.start, result.end - result.start - 1);  // -1 to exclude closing quote
443                        t.assert_equal(tc.expected_text, matched);
444                    }
445                });
446            }
447        });
448    });
449}