Engage!

author: Mitja Felicijan <mitja.felicijan@gmail.com> 2026-02-12 20:57:17 +0100
committer: Mitja Felicijan <mitja.felicijan@gmail.com> 2026-02-12 20:57:17 +0100
commit: b333b06772c89d96aacb5490d6a219fba7c09cc6 (patch)
tree: 211df60083a5946baa2ed61d33d8121b7e251b06 /llama.cpp/scripts/gen-unicode-data.py
download: llmnpc-b333b06772c89d96aacb5490d6a219fba7c09cc6.tar.gz
1 files changed, 196 insertions, 0 deletions
diff --git a/llama.cpp/scripts/gen-unicode-data.py b/llama.cpp/scripts/gen-unicode-data.py
new file mode 100644
index 0000000..2d9bde0
--- /dev/null
+++ b/llama.cpp/scripts/gen-unicode-data.py
@@ -0,0 +1,196 @@
+from __future__ import annotations
+import array
+import unicodedata
+import requests
+MAX_CODEPOINTS = 0x110000
+UNICODE_DATA_URL = "https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt"
+# see https://www.unicode.org/L2/L1999/UnicodeData.html
+def unicode_data_iter():
+    res = requests.get(UNICODE_DATA_URL)
+    res.raise_for_status()
+    data = res.content.decode()
+    prev = []
+    for line in data.splitlines():
+        # ej: 0000;<control>;Cc;0;BN;;;;;N;NULL;;;;
+        line = line.split(";")
+        cpt = int(line[0], base=16)
+        assert cpt < MAX_CODEPOINTS
+        cpt_lower = int(line[-2] or "0", base=16)
+        assert cpt_lower < MAX_CODEPOINTS
+        cpt_upper = int(line[-3] or "0", base=16)
+        assert cpt_upper < MAX_CODEPOINTS
+        categ = line[2].strip()
+        assert len(categ) == 2
+        bidir = line[4].strip()
+        assert len(categ) == 2
+        name = line[1]
+        if name.endswith(", First>"):
+            prev = (cpt, cpt_lower, cpt_upper, categ, bidir)
+            continue
+        if name.endswith(", Last>"):
+            assert prev[1:] == (0, 0, categ, bidir)
+            for c in range(prev[0], cpt):
+                yield (c, cpt_lower, cpt_upper, categ, bidir)
+        yield (cpt, cpt_lower, cpt_upper, categ, bidir)
+# see definition in unicode.h
+CODEPOINT_FLAG_UNDEFINED   = 0x0001  # initial value of every codepoint_flags entry
+CODEPOINT_FLAG_NUMBER      = 0x0002  # \p{N}
+CODEPOINT_FLAG_LETTER      = 0x0004  # \p{L}
+CODEPOINT_FLAG_SEPARATOR   = 0x0008  # \p{Z}
+CODEPOINT_FLAG_MARK        = 0x0010  # \p{M}
+CODEPOINT_FLAG_PUNCTUATION = 0x0020  # \p{P}
+CODEPOINT_FLAG_SYMBOL      = 0x0040  # \p{S}
+CODEPOINT_FLAG_CONTROL     = 0x0080  # \p{C}
+UNICODE_CATEGORY_TO_FLAG = {
+    "Cn": CODEPOINT_FLAG_UNDEFINED,    # Undefined
+    "Cc": CODEPOINT_FLAG_CONTROL,      # Control
+    "Cf": CODEPOINT_FLAG_CONTROL,      # Format
+    "Co": CODEPOINT_FLAG_CONTROL,      # Private Use
+    "Cs": CODEPOINT_FLAG_CONTROL,      # Surrogate
+    "Ll": CODEPOINT_FLAG_LETTER,       # Lowercase Letter
+    "Lm": CODEPOINT_FLAG_LETTER,       # Modifier Letter
+    "Lo": CODEPOINT_FLAG_LETTER,       # Other Letter
+    "Lt": CODEPOINT_FLAG_LETTER,       # Titlecase Letter
+    "Lu": CODEPOINT_FLAG_LETTER,       # Uppercase Letter
+    "L&": CODEPOINT_FLAG_LETTER,       # Cased Letter
+    "Mc": CODEPOINT_FLAG_MARK,         # Spacing Mark
+    "Me": CODEPOINT_FLAG_MARK,         # Enclosing Mark
+    "Mn": CODEPOINT_FLAG_MARK,         # Nonspacing Mark
+    "Nd": CODEPOINT_FLAG_NUMBER,       # Decimal Number
+    "Nl": CODEPOINT_FLAG_NUMBER,       # Letter Number
+    "No": CODEPOINT_FLAG_NUMBER,       # Other Number
+    "Pc": CODEPOINT_FLAG_PUNCTUATION,  # Connector Punctuation
+    "Pd": CODEPOINT_FLAG_PUNCTUATION,  # Dash Punctuation
+    "Pe": CODEPOINT_FLAG_PUNCTUATION,  # Close Punctuation
+    "Pf": CODEPOINT_FLAG_PUNCTUATION,  # Final Punctuation
+    "Pi": CODEPOINT_FLAG_PUNCTUATION,  # Initial Punctuation
+    "Po": CODEPOINT_FLAG_PUNCTUATION,  # Other Punctuation
+    "Ps": CODEPOINT_FLAG_PUNCTUATION,  # Open Punctuation
+    "Sc": CODEPOINT_FLAG_SYMBOL,       # Currency Symbol
+    "Sk": CODEPOINT_FLAG_SYMBOL,       # Modifier Symbol
+    "Sm": CODEPOINT_FLAG_SYMBOL,       # Math Symbol
+    "So": CODEPOINT_FLAG_SYMBOL,       # Other Symbol
+    "Zl": CODEPOINT_FLAG_SEPARATOR,    # Line Separator
+    "Zp": CODEPOINT_FLAG_SEPARATOR,    # Paragraph Separator
+    "Zs": CODEPOINT_FLAG_SEPARATOR,    # Space Separator
+}
+codepoint_flags = array.array('H', [CODEPOINT_FLAG_UNDEFINED]) * MAX_CODEPOINTS
+table_whitespace = []
+table_lowercase = []
+table_uppercase = []
+table_nfd = []
+for (cpt, cpt_lower, cpt_upper, categ, bidir) in unicode_data_iter():
+    # convert codepoint to unicode character
+    char = chr(cpt)
+    # codepoint category flags
+    codepoint_flags[cpt] = UNICODE_CATEGORY_TO_FLAG[categ]
+    # lowercase conversion
+    if cpt_lower:
+        table_lowercase.append((cpt, cpt_lower))
+    # uppercase conversion
+    if cpt_upper:
+        table_uppercase.append((cpt, cpt_upper))
+    # NFD normalization
+    norm = ord(unicodedata.normalize('NFD', char)[0])
+    if cpt != norm:
+        table_nfd.append((cpt, norm))
+# whitespaces, see "<White_Space>" https://www.unicode.org/Public/UCD/latest/ucd/PropList.txt
+table_whitespace.extend(range(0x0009, 0x000D + 1))
+table_whitespace.extend(range(0x2000, 0x200A + 1))
+table_whitespace.extend([0x0020, 0x0085, 0x00A0, 0x1680, 0x2028, 0x2029, 0x202F, 0x205F, 0x3000])
+# sort by codepoint
+table_whitespace.sort()
+table_lowercase.sort()
+table_uppercase.sort()
+table_nfd.sort()
+# group ranges with same flags
+ranges_flags: list[tuple[int, int]] = [(0, codepoint_flags[0])]  # start, flags
+for codepoint, flags in enumerate(codepoint_flags):
+    if flags != ranges_flags[-1][1]:
+        ranges_flags.append((codepoint, flags))
+ranges_flags.append((MAX_CODEPOINTS, 0x0000))
+# group ranges with same nfd
+ranges_nfd: list[tuple[int, int, int]] = [(0, 0, 0)]  # start, last, nfd
+for codepoint, norm in table_nfd:
+    start = ranges_nfd[-1][0]
+    if ranges_nfd[-1] != (start, codepoint - 1, norm):
+        ranges_nfd.append(None)  # type: ignore[arg-type]  # dummy, will be replaced below
+        start = codepoint
+    ranges_nfd[-1] = (start, codepoint, norm)
+# Generate 'unicode-data.cpp':
+#   python ./scripts//gen-unicode-data.py > unicode-data.cpp
+def out(line=""):
+    print(line, end='\n')  # noqa
+out("""\
+// generated with scripts/gen-unicode-data.py
+#include "unicode-data.h"
+#include <cstdint>
+#include <vector>
+#include <unordered_map>
+#include <unordered_set>
+""")
+out("const std::vector<std::pair<uint32_t, uint16_t>> unicode_ranges_flags = {  // start, flags // last=next_start-1")
+for codepoint, flags in ranges_flags:
+    out("{0x%06X, 0x%04X}," % (codepoint, flags))
+out("};\n")
+out("const std::unordered_set<uint32_t> unicode_set_whitespace = {")
+for codepoint in table_whitespace:
+    out("0x%06X," % codepoint)
+out("};\n")
+out("const std::unordered_map<uint32_t, uint32_t> unicode_map_lowercase = {")
+for tuple_lw in table_lowercase:
+    out("{0x%06X, 0x%06X}," % tuple_lw)
+out("};\n")
+out("const std::unordered_map<uint32_t, uint32_t> unicode_map_uppercase = {")
+for tuple_up in table_uppercase:
+    out("{0x%06X, 0x%06X}," % tuple_up)
+out("};\n")
+out("const std::vector<range_nfd> unicode_ranges_nfd = {  // start, last, nfd")
+for triple in ranges_nfd:
+    out("{0x%06X, 0x%06X, 0x%06X}," % triple)
+out("};\n")
author	Mitja Felicijan <mitja.felicijan@gmail.com>	2026-02-12 20:57:17 +0100
committer	Mitja Felicijan <mitja.felicijan@gmail.com>	2026-02-12 20:57:17 +0100
commit	b333b06772c89d96aacb5490d6a219fba7c09cc6 (patch)
tree	211df60083a5946baa2ed61d33d8121b7e251b06 /llama.cpp/scripts/gen-unicode-data.py
download	llmnpc-b333b06772c89d96aacb5490d6a219fba7c09cc6.tar.gz

diff --git a/llama.cpp/scripts/gen-unicode-data.py b/llama.cpp/scripts/gen-unicode-data.py new file mode 100644 index 0000000..2d9bde0 --- /dev/null +++ b/llama.cpp/scripts/gen-unicode-data.py
@@ -0,0 +1,196 @@
	1	from __future__ import annotations
	2
	3	import array
	4	import unicodedata
	5	import requests
	6
	7
	8	MAX_CODEPOINTS = 0x110000
	9
	10	UNICODE_DATA_URL = "https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt"
	11
	12
	13	# see https://www.unicode.org/L2/L1999/UnicodeData.html
	14	def unicode_data_iter():
	15	res = requests.get(UNICODE_DATA_URL)  # NOTE(review): no timeout is passed; a stalled connection may block indefinitely
	16	res.raise_for_status()
	17	data = res.content.decode()
	18
	19	prev = []
	20
	21	for line in data.splitlines():
	22	# e.g.: 0000;<control>;Cc;0;BN;;;;;N;NULL;;;;
	23	line = line.split(";")
	24
	25	cpt = int(line[0], base=16)
	26	assert cpt < MAX_CODEPOINTS
	27
	28	cpt_lower = int(line[-2] or "0", base=16)
	29	assert cpt_lower < MAX_CODEPOINTS
	30
	31	cpt_upper = int(line[-3] or "0", base=16)
	32	assert cpt_upper < MAX_CODEPOINTS
	33
	34	categ = line[2].strip()
	35	assert len(categ) == 2
	36
	37	bidir = line[4].strip()
	38	assert len(categ) == 2  # NOTE(review): repeats the categ check above; bidir itself is never validated
	39
	40	name = line[1]
	41	if name.endswith(", First>"):
	42	prev = (cpt, cpt_lower, cpt_upper, categ, bidir)
	43	continue
	44	if name.endswith(", Last>"):
	45	assert prev[1:] == (0, 0, categ, bidir)
	46	for c in range(prev[0], cpt):
	47	yield (c, cpt_lower, cpt_upper, categ, bidir)
	48
	49	yield (cpt, cpt_lower, cpt_upper, categ, bidir)
	50
	51
	52	# see definition in unicode.h
	53	CODEPOINT_FLAG_UNDEFINED = 0x0001 # initial value of every codepoint_flags entry
	54	CODEPOINT_FLAG_NUMBER = 0x0002 # \p{N}
	55	CODEPOINT_FLAG_LETTER = 0x0004 # \p{L}
	56	CODEPOINT_FLAG_SEPARATOR = 0x0008 # \p{Z}
	57	CODEPOINT_FLAG_MARK = 0x0010 # \p{M}
	58	CODEPOINT_FLAG_PUNCTUATION = 0x0020 # \p{P}
	59	CODEPOINT_FLAG_SYMBOL = 0x0040 # \p{S}
	60	CODEPOINT_FLAG_CONTROL = 0x0080 # \p{C}
	61
	62	UNICODE_CATEGORY_TO_FLAG = {
	63	"Cn": CODEPOINT_FLAG_UNDEFINED, # Undefined
	64	"Cc": CODEPOINT_FLAG_CONTROL, # Control
	65	"Cf": CODEPOINT_FLAG_CONTROL, # Format
	66	"Co": CODEPOINT_FLAG_CONTROL, # Private Use
	67	"Cs": CODEPOINT_FLAG_CONTROL, # Surrogate
	68	"Ll": CODEPOINT_FLAG_LETTER, # Lowercase Letter
	69	"Lm": CODEPOINT_FLAG_LETTER, # Modifier Letter
	70	"Lo": CODEPOINT_FLAG_LETTER, # Other Letter
	71	"Lt": CODEPOINT_FLAG_LETTER, # Titlecase Letter
	72	"Lu": CODEPOINT_FLAG_LETTER, # Uppercase Letter
	73	"L&": CODEPOINT_FLAG_LETTER, # Cased Letter
	74	"Mc": CODEPOINT_FLAG_MARK, # Spacing Mark
	75	"Me": CODEPOINT_FLAG_MARK, # Enclosing Mark
	76	"Mn": CODEPOINT_FLAG_MARK, # Nonspacing Mark
	77	"Nd": CODEPOINT_FLAG_NUMBER, # Decimal Number
	78	"Nl": CODEPOINT_FLAG_NUMBER, # Letter Number
	79	"No": CODEPOINT_FLAG_NUMBER, # Other Number
	80	"Pc": CODEPOINT_FLAG_PUNCTUATION, # Connector Punctuation
	81	"Pd": CODEPOINT_FLAG_PUNCTUATION, # Dash Punctuation
	82	"Pe": CODEPOINT_FLAG_PUNCTUATION, # Close Punctuation
	83	"Pf": CODEPOINT_FLAG_PUNCTUATION, # Final Punctuation
	84	"Pi": CODEPOINT_FLAG_PUNCTUATION, # Initial Punctuation
	85	"Po": CODEPOINT_FLAG_PUNCTUATION, # Other Punctuation
	86	"Ps": CODEPOINT_FLAG_PUNCTUATION, # Open Punctuation
	87	"Sc": CODEPOINT_FLAG_SYMBOL, # Currency Symbol
	88	"Sk": CODEPOINT_FLAG_SYMBOL, # Modifier Symbol
	89	"Sm": CODEPOINT_FLAG_SYMBOL, # Math Symbol
	90	"So": CODEPOINT_FLAG_SYMBOL, # Other Symbol
	91	"Zl": CODEPOINT_FLAG_SEPARATOR, # Line Separator
	92	"Zp": CODEPOINT_FLAG_SEPARATOR, # Paragraph Separator
	93	"Zs": CODEPOINT_FLAG_SEPARATOR, # Space Separator
	94	}
	95
	96
	97	codepoint_flags = array.array('H', [CODEPOINT_FLAG_UNDEFINED]) * MAX_CODEPOINTS
	98	table_whitespace = []
	99	table_lowercase = []
	100	table_uppercase = []
	101	table_nfd = []
	102
	103	for (cpt, cpt_lower, cpt_upper, categ, bidir) in unicode_data_iter():
	104	# convert codepoint to unicode character
	105	char = chr(cpt)
	106
	107	# codepoint category flags
	108	codepoint_flags[cpt] = UNICODE_CATEGORY_TO_FLAG[categ]
	109
	110	# lowercase conversion (recorded only when a mapping is present)
	111	if cpt_lower:
	112	table_lowercase.append((cpt, cpt_lower))
	113
	114	# uppercase conversion (recorded only when a mapping is present)
	115	if cpt_upper:
	116	table_uppercase.append((cpt, cpt_upper))
	117
	118	# NFD normalization: only the first codepoint of the decomposition is kept
	119	norm = ord(unicodedata.normalize('NFD', char)[0])
	120	if cpt != norm:
	121	table_nfd.append((cpt, norm))
	122
	123
	124	# whitespaces, see "<White_Space>" https://www.unicode.org/Public/UCD/latest/ucd/PropList.txt
	125	table_whitespace.extend(range(0x0009, 0x000D + 1))
	126	table_whitespace.extend(range(0x2000, 0x200A + 1))
	127	table_whitespace.extend([0x0020, 0x0085, 0x00A0, 0x1680, 0x2028, 0x2029, 0x202F, 0x205F, 0x3000])
	128
	129
	130	# sort by codepoint
	131	table_whitespace.sort()
	132	table_lowercase.sort()
	133	table_uppercase.sort()
	134	table_nfd.sort()
	135
	136
	137	# group ranges with same flags; a new (start, flags) entry is added whenever the flags change
	138	ranges_flags: list[tuple[int, int]] = [(0, codepoint_flags[0])] # start, flags
	139	for codepoint, flags in enumerate(codepoint_flags):
	140	if flags != ranges_flags[-1][1]:
	141	ranges_flags.append((codepoint, flags))
	142	ranges_flags.append((MAX_CODEPOINTS, 0x0000))
	143
	144
	145	# group consecutive codepoints with the same nfd codepoint into (start, last, nfd) ranges
	146	ranges_nfd: list[tuple[int, int, int]] = [(0, 0, 0)] # start, last, nfd
	147	for codepoint, norm in table_nfd:
	148	start = ranges_nfd[-1][0]
	149	if ranges_nfd[-1] != (start, codepoint - 1, norm):
	150	ranges_nfd.append(None) # type: ignore[arg-type] # placeholder for a new range, overwritten two lines below
	151	start = codepoint
	152	ranges_nfd[-1] = (start, codepoint, norm)
	153
	154
	155	# Generate 'unicode-data.cpp':
	156	# python ./scripts/gen-unicode-data.py > unicode-data.cpp
	157
	158	def out(line=""):
	159	print(line, end='\n') # noqa
	160
	161
	162	out("""\
	163	// generated with scripts/gen-unicode-data.py
	164
	165	#include "unicode-data.h"
	166
	167	#include <cstdint>
	168	#include <vector>
	169	#include <unordered_map>
	170	#include <unordered_set>
	171	""")
	172
	173	out("const std::vector<std::pair<uint32_t, uint16_t>> unicode_ranges_flags = { // start, flags // last=next_start-1")
	174	for codepoint, flags in ranges_flags:
	175	out("{0x%06X, 0x%04X}," % (codepoint, flags))
	176	out("};\n")
	177
	178	out("const std::unordered_set<uint32_t> unicode_set_whitespace = {")
	179	for codepoint in table_whitespace:
	180	out("0x%06X," % codepoint)
	181	out("};\n")
	182
	183	out("const std::unordered_map<uint32_t, uint32_t> unicode_map_lowercase = {")
	184	for tuple_lw in table_lowercase:
	185	out("{0x%06X, 0x%06X}," % tuple_lw)
	186	out("};\n")
	187
	188	out("const std::unordered_map<uint32_t, uint32_t> unicode_map_uppercase = {")
	189	for tuple_up in table_uppercase:
	190	out("{0x%06X, 0x%06X}," % tuple_up)
	191	out("};\n")
	192
	193	out("const std::vector<range_nfd> unicode_ranges_nfd = { // start, last, nfd")
	194	for triple in ranges_nfd:
	195	out("{0x%06X, 0x%06X, 0x%06X}," % triple)
	196	out("};\n")