diff options
| author | Mitja Felicijan <mitja.felicijan@gmail.com> | 2026-02-12 20:57:17 +0100 |
|---|---|---|
| committer | Mitja Felicijan <mitja.felicijan@gmail.com> | 2026-02-12 20:57:17 +0100 |
| commit | b333b06772c89d96aacb5490d6a219fba7c09cc6 (patch) | |
| tree | 211df60083a5946baa2ed61d33d8121b7e251b06 /llama.cpp/scripts/gen-unicode-data.py | |
| download | llmnpc-b333b06772c89d96aacb5490d6a219fba7c09cc6.tar.gz | |
Engage!
Diffstat (limited to 'llama.cpp/scripts/gen-unicode-data.py')
| -rw-r--r-- | llama.cpp/scripts/gen-unicode-data.py | 196 |
1 files changed, 196 insertions, 0 deletions
diff --git a/llama.cpp/scripts/gen-unicode-data.py b/llama.cpp/scripts/gen-unicode-data.py new file mode 100644 index 0000000..2d9bde0 --- /dev/null +++ b/llama.cpp/scripts/gen-unicode-data.py | |||
| @@ -0,0 +1,196 @@ | |||
| 1 | from __future__ import annotations | ||
| 2 | |||
| 3 | import array | ||
| 4 | import unicodedata | ||
| 5 | import requests | ||
| 6 | |||
| 7 | |||
| 8 | MAX_CODEPOINTS = 0x110000 | ||
| 9 | |||
| 10 | UNICODE_DATA_URL = "https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt" | ||
| 11 | |||
| 12 | |||
| 13 | # see https://www.unicode.org/L2/L1999/UnicodeData.html | ||
def unicode_data_iter():
    """Download UnicodeData.txt and yield one record per codepoint.

    Each yielded item is a tuple
    ``(cpt, cpt_lower, cpt_upper, categ, bidir)`` where ``cpt_lower`` and
    ``cpt_upper`` are the simple case mappings (0 when the field is empty),
    ``categ`` is the two-letter general category and ``bidir`` is the
    bidirectional class.

    Ranges that the file encodes as a ``<..., First>`` / ``<..., Last>`` pair
    are expanded so that every codepoint in the range is yielded.

    Raises:
        requests.HTTPError: if the download does not succeed.
        AssertionError: if a record fails one of the sanity checks.
    """
    # A timeout keeps the generator script from hanging forever on a stalled
    # connection (requests waits indefinitely by default).
    res = requests.get(UNICODE_DATA_URL, timeout=60)
    res.raise_for_status()
    data = res.content.decode("utf-8")

    range_first = None  # record of a pending "<..., First>" line, if any

    for line in data.splitlines():
        if not line:
            continue

        # e.g.: 0000;<control>;Cc;0;BN;;;;;N;NULL;;;;
        fields = line.split(";")

        cpt = int(fields[0], base=16)
        assert cpt < MAX_CODEPOINTS

        # the last three fields are: uppercase; lowercase; titlecase
        cpt_lower = int(fields[-2] or "0", base=16)
        assert cpt_lower < MAX_CODEPOINTS

        cpt_upper = int(fields[-3] or "0", base=16)
        assert cpt_upper < MAX_CODEPOINTS

        categ = fields[2].strip()
        assert len(categ) == 2

        # Bidi class abbreviations are 1 to 3 letters long (L, AL, NSM, ...).
        # The original check re-tested `categ` here, leaving `bidir` unchecked.
        bidir = fields[4].strip()
        assert 1 <= len(bidir) <= 3

        name = fields[1]
        if name.endswith(", First>"):
            range_first = (cpt, cpt_lower, cpt_upper, categ, bidir)
            continue
        if name.endswith(", Last>"):
            # both ends of a range must agree and carry no case mappings
            assert range_first is not None
            assert range_first[1:] == (0, 0, categ, bidir)
            for c in range(range_first[0], cpt):
                yield (c, cpt_lower, cpt_upper, categ, bidir)
            range_first = None

        yield (cpt, cpt_lower, cpt_upper, categ, bidir)
| 50 | |||
| 51 | |||
# Bit flags assigned to each codepoint; see definition in unicode.h
CODEPOINT_FLAG_UNDEFINED   = 0x0001  #
CODEPOINT_FLAG_NUMBER      = 0x0002  # \p{N}
CODEPOINT_FLAG_LETTER      = 0x0004  # \p{L}
CODEPOINT_FLAG_SEPARATOR   = 0x0008  # \p{Z}
CODEPOINT_FLAG_MARK        = 0x0010  # \p{M}
CODEPOINT_FLAG_PUNCTUATION = 0x0020  # \p{P}
CODEPOINT_FLAG_SYMBOL      = 0x0040  # \p{S}
CODEPOINT_FLAG_CONTROL     = 0x0080  # \p{C}

# Maps a two-letter Unicode general category to its codepoint flag.
UNICODE_CATEGORY_TO_FLAG = {
    "Cn": CODEPOINT_FLAG_UNDEFINED,    # Unassigned
    "Cc": CODEPOINT_FLAG_CONTROL,      # Control
    "Cf": CODEPOINT_FLAG_CONTROL,      # Format
    "Co": CODEPOINT_FLAG_CONTROL,      # Private Use
    "Cs": CODEPOINT_FLAG_CONTROL,      # Surrogate
    "Ll": CODEPOINT_FLAG_LETTER,       # Lowercase Letter
    "Lm": CODEPOINT_FLAG_LETTER,       # Modifier Letter
    "Lo": CODEPOINT_FLAG_LETTER,       # Other Letter
    "Lt": CODEPOINT_FLAG_LETTER,       # Titlecase Letter
    "Lu": CODEPOINT_FLAG_LETTER,       # Uppercase Letter
    "L&": CODEPOINT_FLAG_LETTER,       # Cased Letter
    "Mc": CODEPOINT_FLAG_MARK,         # Spacing Mark
    "Me": CODEPOINT_FLAG_MARK,         # Enclosing Mark
    "Mn": CODEPOINT_FLAG_MARK,         # Nonspacing Mark
    "Nd": CODEPOINT_FLAG_NUMBER,       # Decimal Number
    "Nl": CODEPOINT_FLAG_NUMBER,       # Letter Number
    "No": CODEPOINT_FLAG_NUMBER,       # Other Number
    "Pc": CODEPOINT_FLAG_PUNCTUATION,  # Connector Punctuation
    "Pd": CODEPOINT_FLAG_PUNCTUATION,  # Dash Punctuation
    "Pe": CODEPOINT_FLAG_PUNCTUATION,  # Close Punctuation
    "Pf": CODEPOINT_FLAG_PUNCTUATION,  # Final Punctuation
    "Pi": CODEPOINT_FLAG_PUNCTUATION,  # Initial Punctuation
    "Po": CODEPOINT_FLAG_PUNCTUATION,  # Other Punctuation
    "Ps": CODEPOINT_FLAG_PUNCTUATION,  # Open Punctuation
    "Sc": CODEPOINT_FLAG_SYMBOL,       # Currency Symbol
    "Sk": CODEPOINT_FLAG_SYMBOL,       # Modifier Symbol
    "Sm": CODEPOINT_FLAG_SYMBOL,       # Math Symbol
    "So": CODEPOINT_FLAG_SYMBOL,       # Other Symbol
    "Zl": CODEPOINT_FLAG_SEPARATOR,    # Line Separator
    "Zp": CODEPOINT_FLAG_SEPARATOR,    # Paragraph Separator
    "Zs": CODEPOINT_FLAG_SEPARATOR,    # Space Separator
}
| 95 | |||
| 96 | |||
# One 16-bit flag word per codepoint; anything UnicodeData.txt does not list
# keeps the UNDEFINED flag it is initialised with here.
codepoint_flags = array.array('H', [CODEPOINT_FLAG_UNDEFINED]) * MAX_CODEPOINTS
table_whitespace = []
table_lowercase = []  # (codepoint, lowercase codepoint)
table_uppercase = []  # (codepoint, uppercase codepoint)
table_nfd = []        # (codepoint, first codepoint of its NFD form)

for (cpt, cpt_lower, cpt_upper, categ, bidir) in unicode_data_iter():
    # convert codepoint to unicode character
    char = chr(cpt)

    # codepoint category flags
    codepoint_flags[cpt] = UNICODE_CATEGORY_TO_FLAG[categ]

    # lowercase conversion (0 means "no mapping")
    if cpt_lower:
        table_lowercase.append((cpt, cpt_lower))

    # uppercase conversion (0 means "no mapping")
    if cpt_upper:
        table_uppercase.append((cpt, cpt_upper))

    # NFD normalization: only the first codepoint of the decomposition is kept
    norm = ord(unicodedata.normalize('NFD', char)[0])
    if cpt != norm:
        table_nfd.append((cpt, norm))
| 122 | |||
| 123 | |||
# whitespaces, see "<White_Space>" https://www.unicode.org/Public/UCD/latest/ucd/PropList.txt
table_whitespace.extend(range(0x0009, 0x000D + 1))
table_whitespace.extend(range(0x2000, 0x200A + 1))
table_whitespace.extend([0x0020, 0x0085, 0x00A0, 0x1680, 0x2028, 0x2029, 0x202F, 0x205F, 0x3000])


# sort every table by codepoint (in place)
table_whitespace.sort()
table_lowercase.sort()
table_uppercase.sort()
table_nfd.sort()
| 135 | |||
| 136 | |||
# group ranges with same flags
# Each entry marks where a run of identical flags begins; a run ends right
# before the next entry's start.
ranges_flags: list[tuple[int, int]] = [(0, codepoint_flags[0])]  # start, flags
for codepoint, flags in enumerate(codepoint_flags):
    if flags != ranges_flags[-1][1]:
        ranges_flags.append((codepoint, flags))
# closing sentinel so the final run also has a "next start"
ranges_flags.append((MAX_CODEPOINTS, 0x0000))
| 143 | |||
| 144 | |||
# group ranges with same nfd
# Consecutive codepoints that share the same NFD target are merged into one
# (start, last, nfd) triple. The list begins with a (0, 0, 0) entry.
ranges_nfd: list[tuple[int, int, int]] = [(0, 0, 0)]  # start, last, nfd
for codepoint, norm in table_nfd:
    start, last, last_norm = ranges_nfd[-1]
    if last == codepoint - 1 and last_norm == norm:
        # contiguous with the current range and same target: extend it
        ranges_nfd[-1] = (start, codepoint, norm)
    else:
        # gap or different target: begin a new single-codepoint range
        ranges_nfd.append((codepoint, codepoint, norm))
| 153 | |||
| 154 | |||
# Generate 'unicode-data.cpp':
#   python ./scripts/gen-unicode-data.py > unicode-data.cpp

def out(line=""):
    """Write ``line`` followed by a newline to standard output."""
    print(line)  # noqa
| 160 | |||
| 161 | |||
# File header of the generated C++ source.
out("""\
// generated with scripts/gen-unicode-data.py

#include "unicode-data.h"

#include <cstdint>
#include <vector>
#include <unordered_map>
#include <unordered_set>
""")

# Each table below is written as one C++ initializer list, one entry per line.
out("const std::vector<std::pair<uint32_t, uint16_t>> unicode_ranges_flags = {  // start, flags // last=next_start-1")
for codepoint, flags in ranges_flags:
    out("{0x%06X, 0x%04X}," % (codepoint, flags))
out("};\n")

out("const std::unordered_set<uint32_t> unicode_set_whitespace = {")
for codepoint in table_whitespace:
    out("0x%06X," % codepoint)
out("};\n")

out("const std::unordered_map<uint32_t, uint32_t> unicode_map_lowercase = {")
for tuple_lw in table_lowercase:
    out("{0x%06X, 0x%06X}," % tuple_lw)
out("};\n")

out("const std::unordered_map<uint32_t, uint32_t> unicode_map_uppercase = {")
for tuple_up in table_uppercase:
    out("{0x%06X, 0x%06X}," % tuple_up)
out("};\n")

out("const std::vector<range_nfd> unicode_ranges_nfd = {  // start, last, nfd")
for triple in ranges_nfd:
    out("{0x%06X, 0x%06X, 0x%06X}," % triple)
out("};\n")
