// Provenance: new file llama.cpp/common/unicode.h, introduced by commit
// b333b06772c89d96aacb5490d6a219fba7c09cc6 ("Engage!", Mitja Felicijan,
// Thu, 12 Feb 2026 20:57:17 +0100).

#pragma once

#include <cstddef>      // size_t
#include <cstdint>      // uint32_t
#include <string_view>  // std::string_view

// UTF-8 parsing utilities for streaming-aware unicode support

// Result of attempting to decode one UTF-8 codepoint.
//
// `status` tells the caller whether `codepoint` may be used; the exact
// conditions that yield INCOMPLETE vs. INVALID are decided by the parser
// implementation, which is not part of this header.
struct utf8_parse_result {
    uint32_t codepoint;       // Decoded codepoint (only valid if status == SUCCESS)
    size_t   bytes_consumed;  // How many bytes this codepoint uses (1-4)
    enum status { SUCCESS, INCOMPLETE, INVALID } status;

    // Builds a result from a status; `cp` and `bytes` default to 0 so that a
    // non-success result can be created from the status alone.
    // Intentionally left implicit (not `explicit`): making it explicit would
    // break any caller that relies on converting a bare status value.
    utf8_parse_result(enum status s, uint32_t cp = 0, size_t bytes = 0)
        : codepoint(cp), bytes_consumed(bytes), status(s) {}
};

// Determine the expected length of a UTF-8 sequence from its first byte.
// Returns 0 for invalid first bytes.
size_t utf8_sequence_length(unsigned char first_byte);

// Parse a single UTF-8 codepoint from `input`.
// NOTE(review): `offset` is presumably the byte index in `input` at which
// decoding starts -- confirm against the implementation.
utf8_parse_result parse_utf8_codepoint(std::string_view input, size_t offset);