summary refs log tree commit diff
path: root/llama.cpp/common/unicode.cpp
diff options
context:
space:
mode:
author Mitja Felicijan <mitja.felicijan@gmail.com> 2026-02-12 20:57:17 +0100
committer Mitja Felicijan <mitja.felicijan@gmail.com> 2026-02-12 20:57:17 +0100
commit b333b06772c89d96aacb5490d6a219fba7c09cc6 (patch)
tree 211df60083a5946baa2ed61d33d8121b7e251b06 /llama.cpp/common/unicode.cpp
download llmnpc-b333b06772c89d96aacb5490d6a219fba7c09cc6.tar.gz
Engage!
Diffstat (limited to 'llama.cpp/common/unicode.cpp')
-rw-r--r-- llama.cpp/common/unicode.cpp 64
1 files changed, 64 insertions, 0 deletions
diff --git a/llama.cpp/common/unicode.cpp b/llama.cpp/common/unicode.cpp
new file mode 100644
index 0000000..56ab0f4
--- /dev/null
+++ b/llama.cpp/common/unicode.cpp
@@ -0,0 +1,64 @@
+#include "unicode.h"
+
+// implementation adopted from src/unicode.cpp
+
+// Reports how many bytes a UTF-8 sequence occupies, judged only from the top
+// four bits of its first byte:
+//   0x0-0xB -> 1,  0xC-0xD -> 2,  0xE -> 3,  0xF -> 4
+// No validation is performed: continuation bytes (0x80-0xBF) report 1 and the
+// bytes 0xF8-0xFF report 4.
+size_t utf8_sequence_length(unsigned char first_byte) {
+    const unsigned high_nibble = first_byte >> 4;
+
+    if (high_nibble < 0xC) {
+        return 1;
+    }
+    if (high_nibble < 0xE) {
+        return 2;
+    }
+    return high_nibble == 0xE ? 3 : 4;
+}
+
+// Decodes the UTF-8 sequence that starts at byte `offset` of `input`.
+//
+// Returns:
+//   - SUCCESS, with the decoded code point and the number of bytes consumed
+//     (1-4), when a complete, well-formed sequence starts at `offset`;
+//   - INCOMPLETE when `offset` is at or past the end of `input`, or when the
+//     input ends before the sequence does and every trailing byte that is
+//     present is a continuation byte;
+//   - INVALID when the bytes cannot form a well-formed sequence: a
+//     continuation byte or 0xF8-0xFF in lead position, a non-continuation byte
+//     where a continuation byte is required, an overlong encoding, a UTF-16
+//     surrogate (U+D800-U+DFFF), or a value above U+10FFFF.
+utf8_parse_result parse_utf8_codepoint(std::string_view input, size_t offset) {
+    if (offset >= input.size()) {
+        return utf8_parse_result(utf8_parse_result::INCOMPLETE);
+    }
+
+    // Work on unsigned bytes so the range comparisons below do not depend on
+    // whether plain `char` is signed.
+    const unsigned char lead = static_cast<unsigned char>(input[offset]);
+
+    // ASCII fast path
+    if (lead < 0x80) {
+        return utf8_parse_result(utf8_parse_result::SUCCESS, lead, 1);
+    }
+
+    // Classify the lead byte: total sequence length, the payload bits it
+    // carries, and the smallest code point that legitimately needs that many
+    // bytes (anything smaller is an overlong encoding).
+    size_t   length        = 0;
+    uint32_t codepoint     = 0;
+    uint32_t min_codepoint = 0;
+    if (lead < 0xc0) {
+        // Invalid: continuation byte as first byte
+        return utf8_parse_result(utf8_parse_result::INVALID);
+    } else if (lead < 0xe0) {
+        length        = 2;
+        codepoint     = lead & 0x1f;
+        min_codepoint = 0x80;
+    } else if (lead < 0xf0) {
+        length        = 3;
+        codepoint     = lead & 0x0f;
+        min_codepoint = 0x800;
+    } else if (lead < 0xf8) {
+        length        = 4;
+        codepoint     = lead & 0x07;
+        min_codepoint = 0x10000;
+    } else {
+        // Invalid first byte
+        return utf8_parse_result(utf8_parse_result::INVALID);
+    }
+
+    // Check the trailing bytes that are present BEFORE reporting truncation:
+    // a non-continuation byte makes the sequence invalid no matter how much
+    // more input arrives, so it must not be reported as INCOMPLETE.
+    const size_t available = input.size() - offset;
+    for (size_t i = 1; i < length && i < available; ++i) {
+        const unsigned char trail = static_cast<unsigned char>(input[offset + i]);
+        if ((trail & 0xc0) != 0x80) {
+            return utf8_parse_result(utf8_parse_result::INVALID);
+        }
+        codepoint = (codepoint << 6) | (trail & 0x3f);
+    }
+    if (available < length) {
+        return utf8_parse_result(utf8_parse_result::INCOMPLETE);
+    }
+
+    // Reject values that the bit layout can express but that are not
+    // well-formed UTF-8.
+    const bool overlong     = codepoint < min_codepoint;
+    const bool surrogate    = codepoint >= 0xd800 && codepoint <= 0xdfff;
+    const bool out_of_range = codepoint > 0x10ffff;
+    if (overlong || surrogate || out_of_range) {
+        return utf8_parse_result(utf8_parse_result::INVALID);
+    }
+
+    return utf8_parse_result(utf8_parse_result::SUCCESS, codepoint, length);
+}