1#include "unicode.h"
2
3// implementation adopted from src/unicode.cpp
4
// Returns the number of bytes a UTF-8 sequence is expected to occupy, judged
// solely from its first byte. No validation is performed:
//   0x00-0xBF -> 1  (ASCII, and also stray continuation bytes 0x80-0xBF)
//   0xC0-0xDF -> 2
//   0xE0-0xEF -> 3
//   0xF0-0xFF -> 4  (includes 0xF8-0xFF, which never start a valid sequence)
size_t utf8_sequence_length(unsigned char first_byte) {
    if (first_byte < 0xC0) {
        return 1;
    }
    if (first_byte < 0xE0) {
        return 2;
    }
    if (first_byte < 0xF0) {
        return 3;
    }
    return 4;
}
10
11utf8_parse_result parse_utf8_codepoint(std::string_view input, size_t offset) {
12 if (offset >= input.size()) {
13 return utf8_parse_result(utf8_parse_result::INCOMPLETE);
14 }
15
16 // ASCII fast path
17 if (!(input[offset] & 0x80)) {
18 return utf8_parse_result(utf8_parse_result::SUCCESS, input[offset], 1);
19 }
20
21 // Invalid: continuation byte as first byte
22 if (!(input[offset] & 0x40)) {
23 return utf8_parse_result(utf8_parse_result::INVALID);
24 }
25
26 // 2-byte sequence
27 if (!(input[offset] & 0x20)) {
28 if (offset + 1 >= input.size()) {
29 return utf8_parse_result(utf8_parse_result::INCOMPLETE);
30 }
31 if ((input[offset + 1] & 0xc0) != 0x80) {
32 return utf8_parse_result(utf8_parse_result::INVALID);
33 }
34 auto result = ((input[offset] & 0x1f) << 6) | (input[offset + 1] & 0x3f);
35 return utf8_parse_result(utf8_parse_result::SUCCESS, result, 2);
36 }
37
38 // 3-byte sequence
39 if (!(input[offset] & 0x10)) {
40 if (offset + 2 >= input.size()) {
41 return utf8_parse_result(utf8_parse_result::INCOMPLETE);
42 }
43 if ((input[offset + 1] & 0xc0) != 0x80 || (input[offset + 2] & 0xc0) != 0x80) {
44 return utf8_parse_result(utf8_parse_result::INVALID);
45 }
46 auto result = ((input[offset] & 0x0f) << 12) | ((input[offset + 1] & 0x3f) << 6) | (input[offset + 2] & 0x3f);
47 return utf8_parse_result(utf8_parse_result::SUCCESS, result, 3);
48 }
49
50 // 4-byte sequence
51 if (!(input[offset] & 0x08)) {
52 if (offset + 3 >= input.size()) {
53 return utf8_parse_result(utf8_parse_result::INCOMPLETE);
54 }
55 if ((input[offset + 1] & 0xc0) != 0x80 || (input[offset + 2] & 0xc0) != 0x80 || (input[offset + 3] & 0xc0) != 0x80) {
56 return utf8_parse_result(utf8_parse_result::INVALID);
57 }
58 auto result = ((input[offset] & 0x07) << 18) | ((input[offset + 1] & 0x3f) << 12) | ((input[offset + 2] & 0x3f) << 6) | (input[offset + 3] & 0x3f);
59 return utf8_parse_result(utf8_parse_result::SUCCESS, result, 4);
60 }
61
62 // Invalid first byte
63 return utf8_parse_result(utf8_parse_result::INVALID);
64}