1#include "tests.h"
2
3#include "peg-parser.h"
4
5#include <string>
6#include <sstream>
7#include <iomanip>
8#include <cctype>
9
10static void assert_result_equal(testing & t, common_peg_parse_result_type expected, common_peg_parse_result_type actual) {
11 t.assert_equal(common_peg_parse_result_type_name(expected), common_peg_parse_result_type_name(actual));
12}
13
// Renders a byte string for use in test names: printable bytes are copied
// as-is, every other byte becomes a two-digit lowercase "\xNN" escape.
static std::string hex_dump(const std::string& str) {
    std::ostringstream out;
    // Base and fill character persist on the stream; they only influence the
    // integer insertions below, never the plain character insertions.
    out << std::hex << std::setfill('0');

    for (const unsigned char byte : str) {
        if (!std::isprint(byte)) {
            // The field width resets after each insertion, so set it per escape.
            out << "\\x" << std::setw(2) << static_cast<int>(byte);
            continue;
        }
        out << byte;
    }

    return out.str();
}
25
26void test_unicode(testing &t) {
27 struct test_case {
28 std::string input;
29 std::string expected_text;
30 common_peg_parse_result_type expected_result;
31 };
32
33 t.test("any", [](testing &t) {
34 std::vector<test_case> test_cases {
35 // Valid UTF-8 sequences
36 {"Hello", "Hello", COMMON_PEG_PARSE_RESULT_SUCCESS},
37 {std::string("Caf\xC3\xA9"), std::string("Caf\xC3\xA9"), COMMON_PEG_PARSE_RESULT_SUCCESS},
38 {std::string("\xE4\xBD\xA0\xE5\xA5\xBD"), std::string("\xE4\xBD\xA0\xE5\xA5\xBD"), COMMON_PEG_PARSE_RESULT_SUCCESS},
39 {std::string("\xF0\x9F\x9A\x80"), std::string("\xF0\x9F\x9A\x80"), COMMON_PEG_PARSE_RESULT_SUCCESS},
40
41 // Incomplete UTF-8 sequences (partial bytes at end)
42 {std::string("Caf\xC3"), "Caf", COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
43 {std::string("\xE4\xBD"), "", COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
44 {std::string("\xF0\x9F\x9A"), "", COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
45
46 // Invalid/malformed UTF-8 sequences
47 {std::string("\xFF\xFE"), "", COMMON_PEG_PARSE_RESULT_FAIL},
48 {std::string("Hello\x80World"), "Hello", COMMON_PEG_PARSE_RESULT_FAIL},
49 {std::string("\xC3\x28"), "", COMMON_PEG_PARSE_RESULT_FAIL},
50 };
51
52 auto parser = build_peg_parser([](common_peg_parser_builder& p) {
53 return p.sequence({p.one_or_more(p.any()), p.end()});
54 });
55
56 for (size_t i = 0; i < test_cases.size(); i++) {
57 const auto & tc = test_cases[i];
58 std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input);
59
60 t.test(test_name, [&](testing &t) {
61 common_peg_parse_context ctx(tc.input, true);
62 auto result = parser.parse(ctx);
63
64 // Assert result type matches
65 assert_result_equal(t, tc.expected_result, result.type);
66
67 // Assert matched text if success or need_more_input
68 if (result.success() || result.need_more_input()) {
69 std::string matched = tc.input.substr(result.start, result.end - result.start);
70 t.assert_equal(tc.expected_text, matched);
71 }
72 });
73 }
74 });
75
76 t.test("char classes", [](testing &t) {
77 t.test("unicode range U+4E00-U+9FFF (CJK)", [](testing &t) {
78 std::vector<test_case> test_cases {
79 // Within range - CJK Unified Ideographs
80 {std::string("\xE4\xB8\x80"), std::string("\xE4\xB8\x80"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+4E00
81 {std::string("\xE4\xBD\xA0"), std::string("\xE4\xBD\xA0"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+4F60
82 {std::string("\xE5\xA5\xBD"), std::string("\xE5\xA5\xBD"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+597D
83 {std::string("\xE9\xBF\xBF"), std::string("\xE9\xBF\xBF"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+9FFF
84
85 // Outside range - should fail
86 {"a", "", COMMON_PEG_PARSE_RESULT_FAIL}, // ASCII
87 {std::string("\xE4\xB7\xBF"), "", COMMON_PEG_PARSE_RESULT_FAIL}, // U+4DFF (before range)
88 {std::string("\xEA\x80\x80"), "", COMMON_PEG_PARSE_RESULT_FAIL}, // U+A000 (after range)
89
90 // Incomplete sequences in range
91 {std::string("\xE4\xB8"), "", COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT}, // Incomplete U+4E00
92 {std::string("\xE5\xA5"), "", COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT}, // Incomplete U+597D
93 };
94
95 auto parser = build_peg_parser([](common_peg_parser_builder& p) {
96 return p.sequence({p.chars(R"([\u4E00-\u9FFF])"), p.end()});
97 });
98
99 for (size_t i = 0; i < test_cases.size(); i++) {
100 const auto & tc = test_cases[i];
101 std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input);
102
103 t.test(test_name, [&](testing &t) {
104 common_peg_parse_context ctx(tc.input, true);
105 auto result = parser.parse(ctx);
106
107 // Assert result type matches
108 assert_result_equal(t, tc.expected_result, result.type);
109
110 // Assert matched text if success or need_more_input
111 if (result.success() || result.need_more_input()) {
112 std::string matched = tc.input.substr(result.start, result.end - result.start);
113 t.assert_equal(tc.expected_text, matched);
114 }
115 });
116 }
117 });
118
119 t.test("unicode range U+1F600-U+1F64F (emoticons)", [](testing &t) {
120 std::vector<test_case> test_cases {
121 // Within range - Emoticons (all 4-byte UTF-8)
122 {std::string("\xF0\x9F\x98\x80"), std::string("\xF0\x9F\x98\x80"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+1F600
123 {std::string("\xF0\x9F\x98\x81"), std::string("\xF0\x9F\x98\x81"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+1F601
124 {std::string("\xF0\x9F\x99\x8F"), std::string("\xF0\x9F\x99\x8F"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+1F64F
125
126 // Outside range
127 {std::string("\xF0\x9F\x97\xBF"), "", COMMON_PEG_PARSE_RESULT_FAIL}, // U+1F5FF (before range)
128 {std::string("\xF0\x9F\x99\x90"), "", COMMON_PEG_PARSE_RESULT_FAIL}, // U+1F650 (after range)
129 {std::string("\xF0\x9F\x9A\x80"), "", COMMON_PEG_PARSE_RESULT_FAIL}, // U+1F680 (outside range)
130
131 // Incomplete sequences
132 {std::string("\xF0\x9F\x98"), "", COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT}, // Incomplete emoji
133 {std::string("\xF0\x9F"), "", COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT}, // Very incomplete
134 };
135
136 auto parser = build_peg_parser([](common_peg_parser_builder& p) {
137 return p.sequence({p.chars(R"([\U0001F600-\U0001F64F])"), p.end()});
138 });
139
140 for (size_t i = 0; i < test_cases.size(); i++) {
141 const auto & tc = test_cases[i];
142 std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input);
143
144 t.test(test_name, [&](testing &t) {
145 common_peg_parse_context ctx(tc.input, true);
146 auto result = parser.parse(ctx);
147
148 // Assert result type matches
149 assert_result_equal(t, tc.expected_result, result.type);
150
151 // Assert matched text if success or need_more_input
152 if (result.success() || result.need_more_input()) {
153 std::string matched = tc.input.substr(result.start, result.end - result.start);
154 t.assert_equal(tc.expected_text, matched);
155 }
156 });
157 }
158 });
159
160 t.test("mixed unicode ranges", [](testing &t) {
161 std::vector<test_case> test_cases {
162 // Match CJK
163 {std::string("\xE4\xB8\x80"), std::string("\xE4\xB8\x80"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+4E00
164 {std::string("\xE4\xBD\xA0"), std::string("\xE4\xBD\xA0"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+4F60
165
166 // Match emoticons
167 {std::string("\xF0\x9F\x98\x80"), std::string("\xF0\x9F\x98\x80"), COMMON_PEG_PARSE_RESULT_SUCCESS}, // U+1F600
168
169 // Match ASCII digits
170 {"5", "5", COMMON_PEG_PARSE_RESULT_SUCCESS},
171
172 // Don't match outside any range
173 {"a", "", COMMON_PEG_PARSE_RESULT_FAIL},
174 {std::string("\xF0\x9F\x9A\x80"), "", COMMON_PEG_PARSE_RESULT_FAIL}, // U+1F680
175
176 // Incomplete
177 {std::string("\xE4\xB8"), "", COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
178 {std::string("\xF0\x9F\x98"), "", COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
179 };
180
181 auto parser = build_peg_parser([](common_peg_parser_builder& p) {
182 return p.sequence({p.chars(R"([\u4E00-\u9FFF\U0001F600-\U0001F64F0-9])"), p.end()});
183 });
184
185 for (size_t i = 0; i < test_cases.size(); i++) {
186 const auto & tc = test_cases[i];
187 std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input);
188
189 t.test(test_name, [&](testing &t) {
190 common_peg_parse_context ctx(tc.input, true);
191 auto result = parser.parse(ctx);
192
193 // Assert result type matches
194 assert_result_equal(t, tc.expected_result, result.type);
195
196 // Assert matched text if success or need_more_input
197 if (result.success() || result.need_more_input()) {
198 std::string matched = tc.input.substr(result.start, result.end - result.start);
199 t.assert_equal(tc.expected_text, matched);
200 }
201 });
202 }
203 });
204 });
205
206 t.test("until parser", [](testing &t) {
207 t.test("ASCII delimiter with Unicode content", [](testing &t) {
208 std::vector<test_case> test_cases {
209 // CJK characters before delimiter
210 {std::string("\xE4\xBD\xA0\xE5\xA5\xBD</tag>"), std::string("\xE4\xBD\xA0\xE5\xA5\xBD"), COMMON_PEG_PARSE_RESULT_SUCCESS},
211
212 // Emoji before delimiter
213 {std::string("\xF0\x9F\x98\x80</tag>"), std::string("\xF0\x9F\x98\x80"), COMMON_PEG_PARSE_RESULT_SUCCESS},
214
215 // Mixed content
216 {std::string("Hello \xE4\xB8\x96\xE7\x95\x8C!</tag>"), std::string("Hello \xE4\xB8\x96\xE7\x95\x8C!"), COMMON_PEG_PARSE_RESULT_SUCCESS},
217 };
218
219 auto parser = build_peg_parser([](common_peg_parser_builder& p) {
220 return p.until("</tag>");
221 });
222
223 for (size_t i = 0; i < test_cases.size(); i++) {
224 const auto & tc = test_cases[i];
225 std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input);
226
227 t.test(test_name, [&](testing &t) {
228 common_peg_parse_context ctx(tc.input, false);
229 auto result = parser.parse(ctx);
230
231 assert_result_equal(t, tc.expected_result, result.type);
232
233 if (result.success()) {
234 std::string matched = tc.input.substr(result.start, result.end - result.start);
235 t.assert_equal(tc.expected_text, matched);
236 }
237 });
238 }
239 });
240
241 t.test("incomplete UTF-8 at end", [](testing &t) {
242 std::vector<test_case> test_cases {
243 // Incomplete emoji at end, no delimiter
244 {std::string("content\xF0\x9F\x98"), std::string("content"), COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
245
246 // Incomplete CJK at end, no delimiter
247 {std::string("hello\xE4\xB8"), std::string("hello"), COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
248
249 // Complete content, no delimiter (should consume all valid UTF-8)
250 {std::string("\xE4\xBD\xA0\xE5\xA5\xBD"), std::string("\xE4\xBD\xA0\xE5\xA5\xBD"), COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
251 };
252
253 auto parser = build_peg_parser([](common_peg_parser_builder& p) {
254 return p.until("</tag>");
255 });
256
257 for (size_t i = 0; i < test_cases.size(); i++) {
258 const auto & tc = test_cases[i];
259 std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input);
260
261 t.test(test_name, [&](testing &t) {
262 common_peg_parse_context ctx(tc.input, true);
263 auto result = parser.parse(ctx);
264
265 assert_result_equal(t, tc.expected_result, result.type);
266
267 if (result.success() || result.need_more_input()) {
268 std::string matched = tc.input.substr(result.start, result.end - result.start);
269 t.assert_equal(tc.expected_text, matched);
270 }
271 });
272 }
273 });
274
275 t.test("malformed UTF-8", [](testing &t) {
276 std::vector<test_case> test_cases {
277 // Invalid UTF-8 bytes
278 {std::string("Hello\xFF\xFE"), "", COMMON_PEG_PARSE_RESULT_FAIL},
279
280 // Continuation byte without lead byte
281 {std::string("Hello\x80World"), "", COMMON_PEG_PARSE_RESULT_FAIL},
282
283 // Invalid continuation byte
284 {std::string("\xC3\x28"), "", COMMON_PEG_PARSE_RESULT_FAIL},
285 };
286
287 auto parser = build_peg_parser([](common_peg_parser_builder& p) {
288 return p.until("</tag>");
289 });
290
291 for (size_t i = 0; i < test_cases.size(); i++) {
292 const auto & tc = test_cases[i];
293 std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input);
294
295 t.test(test_name, [&](testing &t) {
296 common_peg_parse_context ctx(tc.input, false);
297 auto result = parser.parse(ctx);
298
299 assert_result_equal(t, tc.expected_result, result.type);
300 });
301 }
302 });
303 });
304
305 t.test("json_string parser", [](testing &t) {
306 t.test("valid UTF-8 characters", [](testing &t) {
307 std::vector<test_case> test_cases {
308 // ASCII only
309 {"Hello World\"", "Hello World", COMMON_PEG_PARSE_RESULT_SUCCESS},
310
311 // 2-byte UTF-8 (accented characters)
312 {std::string("Caf\xC3\xA9\""), std::string("Caf\xC3\xA9"), COMMON_PEG_PARSE_RESULT_SUCCESS},
313
314 // 3-byte UTF-8 (CJK)
315 {std::string("\xE4\xBD\xA0\xE5\xA5\xBD\""), std::string("\xE4\xBD\xA0\xE5\xA5\xBD"), COMMON_PEG_PARSE_RESULT_SUCCESS},
316
317 // 4-byte UTF-8 (emoji)
318 {std::string("\xF0\x9F\x98\x80\""), std::string("\xF0\x9F\x98\x80"), COMMON_PEG_PARSE_RESULT_SUCCESS},
319
320 // Mixed content
321 {std::string("Hello \xE4\xB8\x96\xE7\x95\x8C!\""), std::string("Hello \xE4\xB8\x96\xE7\x95\x8C!"), COMMON_PEG_PARSE_RESULT_SUCCESS},
322 };
323
324 for (size_t i = 0; i < test_cases.size(); i++) {
325 const auto & tc = test_cases[i];
326 std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input);
327
328 t.test(test_name, [&](testing &t) {
329 auto parser = build_peg_parser([](common_peg_parser_builder& p) {
330 return p.sequence({p.json_string_content(), p.literal("\"")});
331 });
332
333 common_peg_parse_context ctx(tc.input, false);
334 auto result = parser.parse(ctx);
335
336 assert_result_equal(t, tc.expected_result, result.type);
337
338 if (result.success()) {
339 std::string matched = tc.input.substr(result.start, result.end - result.start - 1); // -1 to exclude closing quote
340 t.assert_equal(tc.expected_text, matched);
341 }
342 });
343 }
344 });
345
346 t.test("incomplete UTF-8", [](testing &t) {
347 std::vector<test_case> test_cases {
348 // Incomplete 2-byte sequence
349 {std::string("Caf\xC3"), std::string("Caf"), COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
350
351 // Incomplete 3-byte sequence
352 {std::string("Hello\xE4\xB8"), std::string("Hello"), COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
353
354 // Incomplete 4-byte sequence
355 {std::string("Text\xF0\x9F\x98"), std::string("Text"), COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
356
357 // Incomplete at very start
358 {std::string("\xE4\xBD"), std::string(""), COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT},
359 };
360
361 for (size_t i = 0; i < test_cases.size(); i++) {
362 const auto & tc = test_cases[i];
363 std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input);
364
365 t.test(test_name, [&](testing &t) {
366 auto parser = build_peg_parser([](common_peg_parser_builder& p) {
367 return p.json_string_content();
368 });
369
370 common_peg_parse_context ctx(tc.input, true);
371 auto result = parser.parse(ctx);
372
373 assert_result_equal(t, tc.expected_result, result.type);
374
375 if (result.need_more_input()) {
376 std::string matched = tc.input.substr(result.start, result.end - result.start);
377 t.assert_equal(tc.expected_text, matched);
378 }
379 });
380 }
381 });
382
383 t.test("malformed UTF-8", [](testing &t) {
384 std::vector<test_case> test_cases {
385 // Invalid UTF-8 bytes
386 {std::string("Hello\xFF\xFE"), "", COMMON_PEG_PARSE_RESULT_FAIL},
387
388 // Continuation byte without lead byte
389 {std::string("Hello\x80World"), "", COMMON_PEG_PARSE_RESULT_FAIL},
390
391 // Invalid continuation byte
392 {std::string("\xC3\x28"), "", COMMON_PEG_PARSE_RESULT_FAIL},
393
394 // Overlong encoding (security issue)
395 {std::string("\xC0\x80"), "", COMMON_PEG_PARSE_RESULT_FAIL},
396 };
397
398 for (size_t i = 0; i < test_cases.size(); i++) {
399 const auto & tc = test_cases[i];
400 std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input);
401
402 t.test(test_name, [&](testing &t) {
403 auto parser = build_peg_parser([](common_peg_parser_builder& p) {
404 return p.json_string_content();
405 });
406
407 common_peg_parse_context ctx(tc.input, false);
408 auto result = parser.parse(ctx);
409
410 assert_result_equal(t, tc.expected_result, result.type);
411 });
412 }
413 });
414
415 t.test("escape sequences with UTF-8", [](testing &t) {
416 std::vector<test_case> test_cases {
417 // Unicode escape sequence
418 {"Hello\\u0041\"", "Hello\\u0041", COMMON_PEG_PARSE_RESULT_SUCCESS},
419
420 // Mix of UTF-8 and escape sequences
421 {std::string("\xE4\xBD\xA0\\n\xE5\xA5\xBD\""), std::string("\xE4\xBD\xA0\\n\xE5\xA5\xBD"), COMMON_PEG_PARSE_RESULT_SUCCESS},
422
423 // Escaped quote in UTF-8 string
424 {std::string("\xE4\xBD\xA0\\\"\xE5\xA5\xBD\""), std::string("\xE4\xBD\xA0\\\"\xE5\xA5\xBD"), COMMON_PEG_PARSE_RESULT_SUCCESS},
425 };
426
427 for (size_t i = 0; i < test_cases.size(); i++) {
428 const auto & tc = test_cases[i];
429 std::string test_name = "case " + std::to_string(i) + ": " + hex_dump(tc.input);
430
431 t.test(test_name, [&](testing &t) {
432 auto parser = build_peg_parser([](common_peg_parser_builder& p) {
433 return p.sequence({p.json_string_content(), p.literal("\"")});
434 });
435
436 common_peg_parse_context ctx(tc.input, false);
437 auto result = parser.parse(ctx);
438
439 assert_result_equal(t, tc.expected_result, result.type);
440
441 if (result.success()) {
442 std::string matched = tc.input.substr(result.start, result.end - result.start - 1); // -1 to exclude closing quote
443 t.assert_equal(tc.expected_text, matched);
444 }
445 });
446 }
447 });
448 });
449}