diff options
| author | Mitja Felicijan <mitja.felicijan@gmail.com> | 2026-02-12 20:57:17 +0100 |
|---|---|---|
| committer | Mitja Felicijan <mitja.felicijan@gmail.com> | 2026-02-12 20:57:17 +0100 |
| commit | b333b06772c89d96aacb5490d6a219fba7c09cc6 (patch) | |
| tree | 211df60083a5946baa2ed61d33d8121b7e251b06 /llama.cpp/common/unicode.cpp | |
| download | llmnpc-b333b06772c89d96aacb5490d6a219fba7c09cc6.tar.gz | |
Engage!
Diffstat (limited to 'llama.cpp/common/unicode.cpp')
| -rw-r--r-- | llama.cpp/common/unicode.cpp | 64 |
1 files changed, 64 insertions, 0 deletions
diff --git a/llama.cpp/common/unicode.cpp b/llama.cpp/common/unicode.cpp new file mode 100644 index 0000000..56ab0f4 --- /dev/null +++ b/llama.cpp/common/unicode.cpp @@ -0,0 +1,64 @@ +#include "unicode.h" + +// implementation adopted from src/unicode.cpp + +size_t utf8_sequence_length(unsigned char first_byte) { + const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 }; + uint8_t highbits = static_cast<uint8_t>(first_byte) >> 4; + return lookup[highbits]; +} + +utf8_parse_result parse_utf8_codepoint(std::string_view input, size_t offset) { + if (offset >= input.size()) { + return utf8_parse_result(utf8_parse_result::INCOMPLETE); + } + + // ASCII fast path + if (!(input[offset] & 0x80)) { + return utf8_parse_result(utf8_parse_result::SUCCESS, input[offset], 1); + } + + // Invalid: continuation byte as first byte + if (!(input[offset] & 0x40)) { + return utf8_parse_result(utf8_parse_result::INVALID); + } + + // 2-byte sequence + if (!(input[offset] & 0x20)) { + if (offset + 1 >= input.size()) { + return utf8_parse_result(utf8_parse_result::INCOMPLETE); + } + if ((input[offset + 1] & 0xc0) != 0x80) { + return utf8_parse_result(utf8_parse_result::INVALID); + } + auto result = ((input[offset] & 0x1f) << 6) | (input[offset + 1] & 0x3f); + return utf8_parse_result(utf8_parse_result::SUCCESS, result, 2); + } + + // 3-byte sequence + if (!(input[offset] & 0x10)) { + if (offset + 2 >= input.size()) { + return utf8_parse_result(utf8_parse_result::INCOMPLETE); + } + if ((input[offset + 1] & 0xc0) != 0x80 || (input[offset + 2] & 0xc0) != 0x80) { + return utf8_parse_result(utf8_parse_result::INVALID); + } + auto result = ((input[offset] & 0x0f) << 12) | ((input[offset + 1] & 0x3f) << 6) | (input[offset + 2] & 0x3f); + return utf8_parse_result(utf8_parse_result::SUCCESS, result, 3); + } + + // 4-byte sequence + if (!(input[offset] & 0x08)) { + if (offset + 3 >= input.size()) { + return utf8_parse_result(utf8_parse_result::INCOMPLETE); + } + if ((input[offset + 1] & 0xc0) != 0x80 || (input[offset + 2] & 0xc0) != 0x80 || (input[offset + 3] & 0xc0) != 0x80) { + return utf8_parse_result(utf8_parse_result::INVALID); + } + auto result = ((input[offset] & 0x07) << 18) | ((input[offset + 1] & 0x3f) << 12) | ((input[offset + 2] & 0x3f) << 6) | (input[offset + 3] & 0x3f); + return utf8_parse_result(utf8_parse_result::SUCCESS, result, 4); + } + + // Invalid first byte + return utf8_parse_result(utf8_parse_result::INVALID); +} |
