llmnpc - llama.cpp/src/unicode.h

Path: llmnpc / llama.cpp / src / unicode.h (raw)
  1#pragma once
  2
  3#include <cstdint>
  4#include <string>
  5#include <vector>
  6
  7// TODO: reimplement this structure in endian-independent way
  8struct unicode_cpt_flags {
  9    enum {
 10        UNDEFINED       = 0x0001,
 11        NUMBER          = 0x0002,  // regex: \p{N}
 12        LETTER          = 0x0004,  // regex: \p{L}
 13        SEPARATOR       = 0x0008,  // regex: \p{Z}
 14        ACCENT_MARK     = 0x0010,  // regex: \p{M}
 15        PUNCTUATION     = 0x0020,  // regex: \p{P}
 16        SYMBOL          = 0x0040,  // regex: \p{S}
 17        CONTROL         = 0x0080,  // regex: \p{C}
 18        MASK_CATEGORIES = 0x00FF,
 19        WHITESPACE      = 0x0100,
 20        LOWERCASE       = 0x0200,
 21        UPPERCASE       = 0x0400,
 22        NFD             = 0x0800,
 23    };
 24
 25    // codepoint type
 26    uint16_t is_undefined   : 1;
 27    uint16_t is_number      : 1;  // regex: \p{N}
 28    uint16_t is_letter      : 1;  // regex: \p{L}
 29    uint16_t is_separator   : 1;  // regex: \p{Z}
 30    uint16_t is_accent_mark : 1;  // regex: \p{M}
 31    uint16_t is_punctuation : 1;  // regex: \p{P}
 32    uint16_t is_symbol      : 1;  // regex: \p{S}
 33    uint16_t is_control     : 1;  // regex: \p{C}
 34    // helper flags
 35    uint16_t is_whitespace  : 1;  // regex: \s
 36    uint16_t is_lowercase   : 1;
 37    uint16_t is_uppercase   : 1;
 38    uint16_t is_nfd         : 1;
 39
 40    // decode from uint16
 41    inline unicode_cpt_flags(const uint16_t flags = 0) {
 42#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
 43        *reinterpret_cast<uint16_t*>(this) = flags;
 44#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
 45        is_undefined   = (flags & UNDEFINED)   ? 1 : 0;
 46        is_number      = (flags & NUMBER)      ? 1 : 0;
 47        is_letter      = (flags & LETTER)      ? 1 : 0;
 48        is_separator   = (flags & SEPARATOR)   ? 1 : 0;
 49        is_accent_mark = (flags & ACCENT_MARK) ? 1 : 0;
 50        is_punctuation = (flags & PUNCTUATION) ? 1 : 0;
 51        is_symbol      = (flags & SYMBOL)      ? 1 : 0;
 52        is_control     = (flags & CONTROL)     ? 1 : 0;
 53        is_whitespace  = (flags & WHITESPACE)  ? 1 : 0;
 54        is_lowercase   = (flags & LOWERCASE)   ? 1 : 0;
 55        is_uppercase   = (flags & UPPERCASE)   ? 1 : 0;
 56        is_nfd         = (flags & NFD)         ? 1 : 0;
 57#else
 58#error Unexpected or undefined __BYTE_ORDER__
 59#endif
 60    }
 61
 62    inline uint16_t as_uint() const {
 63#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
 64        return *reinterpret_cast<const uint16_t*>(this);
 65#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
 66        uint16_t result =
 67              is_undefined   * UNDEFINED
 68            + is_number      * NUMBER
 69            + is_letter      * LETTER
 70            + is_separator   * SEPARATOR
 71            + is_accent_mark * ACCENT_MARK
 72            + is_punctuation * PUNCTUATION
 73            + is_symbol      * SYMBOL
 74            + is_control     * CONTROL
 75            + is_whitespace  * WHITESPACE
 76            + is_lowercase   * LOWERCASE
 77            + is_uppercase   * UPPERCASE
 78            + is_nfd         * NFD
 79            ;
 80
 81        return result;
 82#else
 83#error Unexpected or undefined __BYTE_ORDER__
 84#endif
 85    }
 86
 87    inline uint16_t category_flag() const {
 88        return this->as_uint() & MASK_CATEGORIES;
 89    }
 90};
 91
 92size_t unicode_len_utf8(char src);
 93
 94std::string unicode_cpt_to_utf8  (uint32_t cpt);
 95uint32_t    unicode_cpt_from_utf8(const std::string & utf8, size_t & offset);
 96
 97std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8);
 98
 99std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & cpts);
100
101unicode_cpt_flags unicode_cpt_flags_from_cpt (uint32_t cpt);
102unicode_cpt_flags unicode_cpt_flags_from_utf8(const std::string & utf8);
103
104std::string unicode_byte_to_utf8(uint8_t byte);
105uint8_t     unicode_utf8_to_byte(const std::string & utf8);
106
107uint32_t unicode_tolower(uint32_t cpt);
108
109bool unicode_cpt_is_han(uint32_t cpt);
110
111std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs);