summaryrefslogtreecommitdiff
path: root/llama.cpp/scripts/gen-unicode-data.py
diff options
context:
space:
mode:
authorMitja Felicijan <mitja.felicijan@gmail.com>2026-02-12 20:57:17 +0100
committerMitja Felicijan <mitja.felicijan@gmail.com>2026-02-12 20:57:17 +0100
commitb333b06772c89d96aacb5490d6a219fba7c09cc6 (patch)
tree211df60083a5946baa2ed61d33d8121b7e251b06 /llama.cpp/scripts/gen-unicode-data.py
downloadllmnpc-b333b06772c89d96aacb5490d6a219fba7c09cc6.tar.gz
Engage!
Diffstat (limited to 'llama.cpp/scripts/gen-unicode-data.py')
-rw-r--r--llama.cpp/scripts/gen-unicode-data.py196
1 files changed, 196 insertions, 0 deletions
diff --git a/llama.cpp/scripts/gen-unicode-data.py b/llama.cpp/scripts/gen-unicode-data.py
new file mode 100644
index 0000000..2d9bde0
--- /dev/null
+++ b/llama.cpp/scripts/gen-unicode-data.py
@@ -0,0 +1,196 @@
1from __future__ import annotations
2
3import array
4import unicodedata
5import requests
6
7
# Size of the Unicode code space: code points run from 0x000000 to 0x10FFFF.
MAX_CODEPOINTS = 0x10FFFF + 1

# Location of the UnicodeData.txt file fetched by this script.
UNICODE_DATA_URL = "https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt"
11
12
# see https://www.unicode.org/L2/L1999/UnicodeData.html
def unicode_data_iter(timeout: float = 30.0):
    """Download UnicodeData.txt and yield one record per listed code point.

    Records whose name ends in ", First>" / ", Last>" delimit a range; every
    code point from the first to the last one (inclusive) is yielded with the
    properties of the "Last" record.

    Args:
        timeout: Seconds passed to ``requests.get`` so that a stalled
            connection cannot block the script indefinitely.

    Yields:
        ``(cpt, cpt_lower, cpt_upper, categ, bidir)`` tuples, where the case
        mappings are ``0`` when the record does not define one.

    Raises:
        AssertionError: if a record is malformed (code point out of range,
            unexpected category/bidi code, or unbalanced First/Last pair).
        Exception: whatever ``raise_for_status()`` raises for a failed
            HTTP response.
    """
    res = requests.get(UNICODE_DATA_URL, timeout=timeout)
    res.raise_for_status()
    data = res.content.decode("utf-8")

    # Pending "<..., First>" record, or None when no range is open.
    range_first = None

    for raw_line in data.splitlines():
        # Tolerate blank lines instead of crashing in int("").
        if not raw_line.strip():
            continue

        # e.g.: 0000;<control>;Cc;0;BN;;;;;N;NULL;;;;
        fields = raw_line.split(";")

        cpt = int(fields[0], base=16)
        assert cpt < MAX_CODEPOINTS

        # Simple lowercase mapping (second-to-last field); empty means none.
        cpt_lower = int(fields[-2] or "0", base=16)
        assert cpt_lower < MAX_CODEPOINTS

        # Simple uppercase mapping (third-to-last field); empty means none.
        cpt_upper = int(fields[-3] or "0", base=16)
        assert cpt_upper < MAX_CODEPOINTS

        categ = fields[2].strip()
        assert len(categ) == 2

        # Bidi class codes are one to three characters long ("L", "BN", "NSM").
        bidir = fields[4].strip()
        assert 1 <= len(bidir) <= 3

        name = fields[1]
        if name.endswith(", First>"):
            range_first = (cpt, cpt_lower, cpt_upper, categ, bidir)
            continue
        if name.endswith(", Last>"):
            # A "Last" record must close a matching "First" record.
            assert range_first is not None
            assert range_first[1:] == (0, 0, categ, bidir)
            for c in range(range_first[0], cpt):
                yield (c, cpt_lower, cpt_upper, categ, bidir)
            range_first = None

        yield (cpt, cpt_lower, cpt_upper, categ, bidir)
50
51
# see definition in unicode.h
CODEPOINT_FLAG_UNDEFINED = 1 << 0    # 0x0001
CODEPOINT_FLAG_NUMBER = 1 << 1       # 0x0002  \p{N}
CODEPOINT_FLAG_LETTER = 1 << 2       # 0x0004  \p{L}
CODEPOINT_FLAG_SEPARATOR = 1 << 3    # 0x0008  \p{Z}
CODEPOINT_FLAG_MARK = 1 << 4         # 0x0010  \p{M}
CODEPOINT_FLAG_PUNCTUATION = 1 << 5  # 0x0020  \p{P}
CODEPOINT_FLAG_SYMBOL = 1 << 6       # 0x0040  \p{S}
CODEPOINT_FLAG_CONTROL = 1 << 7      # 0x0080  \p{C}

# Two-letter general category -> flag, built from one group per flag.
UNICODE_CATEGORY_TO_FLAG = {
    category: flag
    for flag, categories in (
        # Undefined
        (CODEPOINT_FLAG_UNDEFINED, ("Cn",)),
        # Control, Format, Private Use, Surrogate
        (CODEPOINT_FLAG_CONTROL, ("Cc", "Cf", "Co", "Cs")),
        # Lowercase, Modifier, Other, Titlecase, Uppercase, Cased Letter
        (CODEPOINT_FLAG_LETTER, ("Ll", "Lm", "Lo", "Lt", "Lu", "L&")),
        # Spacing, Enclosing, Nonspacing Mark
        (CODEPOINT_FLAG_MARK, ("Mc", "Me", "Mn")),
        # Decimal, Letter, Other Number
        (CODEPOINT_FLAG_NUMBER, ("Nd", "Nl", "No")),
        # Connector, Dash, Close, Final, Initial, Other, Open Punctuation
        (CODEPOINT_FLAG_PUNCTUATION, ("Pc", "Pd", "Pe", "Pf", "Pi", "Po", "Ps")),
        # Currency, Modifier, Math, Other Symbol
        (CODEPOINT_FLAG_SYMBOL, ("Sc", "Sk", "Sm", "So")),
        # Line, Paragraph, Space Separator
        (CODEPOINT_FLAG_SEPARATOR, ("Zl", "Zp", "Zs")),
    )
    for category in categories
}
95
96
# One 16-bit flag word per code point; entries not overwritten below keep
# the UNDEFINED flag.
codepoint_flags = array.array("H", [CODEPOINT_FLAG_UNDEFINED]) * MAX_CODEPOINTS
table_whitespace = []
table_lowercase = []
table_uppercase = []
table_nfd = []

for record in unicode_data_iter():
    cpt, cpt_lower, cpt_upper, categ, bidir = record

    # codepoint category flags
    codepoint_flags[cpt] = UNICODE_CATEGORY_TO_FLAG[categ]

    # case conversions: a mapping of 0 is skipped
    if cpt_lower != 0:
        table_lowercase.append((cpt, cpt_lower))
    if cpt_upper != 0:
        table_uppercase.append((cpt, cpt_upper))

    # NFD normalization: keep only the first code point of the decomposition
    char = chr(cpt)
    decomposed = unicodedata.normalize("NFD", char)
    norm = ord(decomposed[0])
    if norm != cpt:
        table_nfd.append((cpt, norm))
122
123
# whitespaces, see "<White_Space>" https://www.unicode.org/Public/UCD/latest/ucd/PropList.txt
for first, last in ((0x0009, 0x000D), (0x2000, 0x200A)):
    table_whitespace.extend(range(first, last + 1))
table_whitespace += [0x0020, 0x0085, 0x00A0, 0x1680, 0x2028, 0x2029, 0x202F, 0x205F, 0x3000]


# sort every table in place by codepoint
for table in (table_whitespace, table_lowercase, table_uppercase, table_nfd):
    table.sort()
135
136
# group ranges with same flags: record a new entry whenever the flags change
ranges_flags: list[tuple[int, int]] = [(0, codepoint_flags[0])]  # start, flags
for codepoint, flags in enumerate(codepoint_flags):
    _, current_flags = ranges_flags[-1]
    if flags == current_flags:
        continue
    ranges_flags.append((codepoint, flags))
# terminating entry so that every range has a "next start"
ranges_flags.append((MAX_CODEPOINTS, 0x0000))
143
144
# group ranges with same nfd: consecutive codepoints that share one
# normalized value are merged into a single (start, last, nfd) entry
ranges_nfd: list[tuple[int, int, int]] = [(0, 0, 0)]  # start, last, nfd
for codepoint, norm in table_nfd:
    start, last, target = ranges_nfd[-1]
    if last == codepoint - 1 and target == norm:
        # extends the current range by one codepoint
        ranges_nfd[-1] = (start, codepoint, norm)
    else:
        ranges_nfd.append((codepoint, codepoint, norm))
153
154
# Generate 'unicode-data.cpp':
#   python ./scripts/gen-unicode-data.py > unicode-data.cpp

def out(line=""):
    """Print *line* to standard output, terminated by a newline."""
    print(line)  # noqa
160
161
# File header of the generated C++ source.
out("""\
// generated with scripts/gen-unicode-data.py

#include "unicode-data.h"

#include <cstdint>
#include <vector>
#include <unordered_map>
#include <unordered_set>
""")

# Each table is emitted as: declaration line, one formatted row per entry,
# then the closing brace followed by an empty line.
for declaration, row_format, rows in (
    (
        "const std::vector<std::pair<uint32_t, uint16_t>> unicode_ranges_flags = { // start, flags // last=next_start-1",
        "{0x%06X, 0x%04X},",
        ranges_flags,
    ),
    (
        "const std::unordered_set<uint32_t> unicode_set_whitespace = {",
        "0x%06X,",
        table_whitespace,
    ),
    (
        "const std::unordered_map<uint32_t, uint32_t> unicode_map_lowercase = {",
        "{0x%06X, 0x%06X},",
        table_lowercase,
    ),
    (
        "const std::unordered_map<uint32_t, uint32_t> unicode_map_uppercase = {",
        "{0x%06X, 0x%06X},",
        table_uppercase,
    ),
    (
        "const std::vector<range_nfd> unicode_ranges_nfd = { // start, last, nfd",
        "{0x%06X, 0x%06X, 0x%06X},",
        ranges_nfd,
    ),
):
    out(declaration)
    for row in rows:
        out(row_format % row)
    out("};\n")