summaryrefslogtreecommitdiff
path: root/llama.cpp/common/unicode.h
diff options
context:
space:
mode:
Diffstat (limited to 'llama.cpp/common/unicode.h')
-rw-r--r--llama.cpp/common/unicode.h22
1 files changed, 22 insertions, 0 deletions
diff --git a/llama.cpp/common/unicode.h b/llama.cpp/common/unicode.h
new file mode 100644
index 0000000..9d9e8e1
--- /dev/null
+++ b/llama.cpp/common/unicode.h
@@ -0,0 +1,22 @@
+#pragma once
+
+#include <cstdint>
+#include <string_view>
+
+// UTF-8 parsing utilities for streaming-aware unicode support
+
+struct utf8_parse_result {
+ uint32_t codepoint; // Decoded codepoint (only valid if status == SUCCESS); the constructor defaults it to 0
+ size_t bytes_consumed; // How many bytes this codepoint uses (1-4); the constructor defaults it to 0
+ enum status { SUCCESS, INCOMPLETE, INVALID } status; // Outcome of the parse attempt
+
+ utf8_parse_result(enum status s, uint32_t cp = 0, size_t bytes = 0) // Not explicit: a bare status value converts implicitly
+ : codepoint(cp), bytes_consumed(bytes), status(s) {}
+};
+
+// Determine the expected length, in bytes, of a UTF-8 sequence from its first (lead) byte
+// Returns 0 for invalid first bytes (declaration only; the definition is not in this header)
+size_t utf8_sequence_length(unsigned char first_byte);
+
+// Parse a single UTF-8 codepoint from input; offset is presumably the byte index in input to start at (TODO confirm)
+utf8_parse_result parse_utf8_codepoint(std::string_view input, size_t offset);