llmnpc - llama.cpp/scripts/gen-unicode-data.py

Path: llmnpc / llama.cpp / scripts / gen-unicode-data.py (raw)
  1from __future__ import annotations
  2
  3import array
  4import unicodedata
  5import requests
  6
  7
  8MAX_CODEPOINTS = 0x110000
  9
 10UNICODE_DATA_URL = "https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt"
 11
 12
 13# see https://www.unicode.org/L2/L1999/UnicodeData.html
 14def unicode_data_iter():
 15    res = requests.get(UNICODE_DATA_URL)
 16    res.raise_for_status()
 17    data = res.content.decode()
 18
 19    prev = []
 20
 21    for line in data.splitlines():
 22        # ej: 0000;<control>;Cc;0;BN;;;;;N;NULL;;;;
 23        line = line.split(";")
 24
 25        cpt = int(line[0], base=16)
 26        assert cpt < MAX_CODEPOINTS
 27
 28        cpt_lower = int(line[-2] or "0", base=16)
 29        assert cpt_lower < MAX_CODEPOINTS
 30
 31        cpt_upper = int(line[-3] or "0", base=16)
 32        assert cpt_upper < MAX_CODEPOINTS
 33
 34        categ = line[2].strip()
 35        assert len(categ) == 2
 36
 37        bidir = line[4].strip()
 38        assert len(categ) == 2
 39
 40        name = line[1]
 41        if name.endswith(", First>"):
 42            prev = (cpt, cpt_lower, cpt_upper, categ, bidir)
 43            continue
 44        if name.endswith(", Last>"):
 45            assert prev[1:] == (0, 0, categ, bidir)
 46            for c in range(prev[0], cpt):
 47                yield (c, cpt_lower, cpt_upper, categ, bidir)
 48
 49        yield (cpt, cpt_lower, cpt_upper, categ, bidir)
 50
 51
 52# see definition in unicode.h
 53CODEPOINT_FLAG_UNDEFINED   = 0x0001  #
 54CODEPOINT_FLAG_NUMBER      = 0x0002  # \p{N}
 55CODEPOINT_FLAG_LETTER      = 0x0004  # \p{L}
 56CODEPOINT_FLAG_SEPARATOR   = 0x0008  # \p{Z}
 57CODEPOINT_FLAG_MARK        = 0x0010  # \p{M}
 58CODEPOINT_FLAG_PUNCTUATION = 0x0020  # \p{P}
 59CODEPOINT_FLAG_SYMBOL      = 0x0040  # \p{S}
 60CODEPOINT_FLAG_CONTROL     = 0x0080  # \p{C}
 61
 62UNICODE_CATEGORY_TO_FLAG = {
 63    "Cn": CODEPOINT_FLAG_UNDEFINED,    # Undefined
 64    "Cc": CODEPOINT_FLAG_CONTROL,      # Control
 65    "Cf": CODEPOINT_FLAG_CONTROL,      # Format
 66    "Co": CODEPOINT_FLAG_CONTROL,      # Private Use
 67    "Cs": CODEPOINT_FLAG_CONTROL,      # Surrrogate
 68    "Ll": CODEPOINT_FLAG_LETTER,       # Lowercase Letter
 69    "Lm": CODEPOINT_FLAG_LETTER,       # Modifier Letter
 70    "Lo": CODEPOINT_FLAG_LETTER,       # Other Letter
 71    "Lt": CODEPOINT_FLAG_LETTER,       # Titlecase Letter
 72    "Lu": CODEPOINT_FLAG_LETTER,       # Uppercase Letter
 73    "L&": CODEPOINT_FLAG_LETTER,       # Cased Letter
 74    "Mc": CODEPOINT_FLAG_MARK,         # Spacing Mark
 75    "Me": CODEPOINT_FLAG_MARK,         # Enclosing Mark
 76    "Mn": CODEPOINT_FLAG_MARK,         # Nonspacing Mark
 77    "Nd": CODEPOINT_FLAG_NUMBER,       # Decimal Number
 78    "Nl": CODEPOINT_FLAG_NUMBER,       # Letter Number
 79    "No": CODEPOINT_FLAG_NUMBER,       # Other Number
 80    "Pc": CODEPOINT_FLAG_PUNCTUATION,  # Connector Punctuation
 81    "Pd": CODEPOINT_FLAG_PUNCTUATION,  # Dash Punctuation
 82    "Pe": CODEPOINT_FLAG_PUNCTUATION,  # Close Punctuation
 83    "Pf": CODEPOINT_FLAG_PUNCTUATION,  # Final Punctuation
 84    "Pi": CODEPOINT_FLAG_PUNCTUATION,  # Initial Punctuation
 85    "Po": CODEPOINT_FLAG_PUNCTUATION,  # Other Punctuation
 86    "Ps": CODEPOINT_FLAG_PUNCTUATION,  # Open Punctuation
 87    "Sc": CODEPOINT_FLAG_SYMBOL,       # Currency Symbol
 88    "Sk": CODEPOINT_FLAG_SYMBOL,       # Modifier Symbol
 89    "Sm": CODEPOINT_FLAG_SYMBOL,       # Math Symbol
 90    "So": CODEPOINT_FLAG_SYMBOL,       # Other Symbol
 91    "Zl": CODEPOINT_FLAG_SEPARATOR,    # Line Separator
 92    "Zp": CODEPOINT_FLAG_SEPARATOR,    # Paragraph Separator
 93    "Zs": CODEPOINT_FLAG_SEPARATOR,    # Space Separator
 94}
 95
 96
 97codepoint_flags = array.array('H', [CODEPOINT_FLAG_UNDEFINED]) * MAX_CODEPOINTS
 98table_whitespace = []
 99table_lowercase = []
100table_uppercase = []
101table_nfd = []
102
103for (cpt, cpt_lower, cpt_upper, categ, bidir) in unicode_data_iter():
104    # convert codepoint to unicode character
105    char = chr(cpt)
106
107    # codepoint category flags
108    codepoint_flags[cpt] = UNICODE_CATEGORY_TO_FLAG[categ]
109
110    # lowercase conversion
111    if cpt_lower:
112        table_lowercase.append((cpt, cpt_lower))
113
114    # uppercase conversion
115    if cpt_upper:
116        table_uppercase.append((cpt, cpt_upper))
117
118    # NFD normalization
119    norm = ord(unicodedata.normalize('NFD', char)[0])
120    if cpt != norm:
121        table_nfd.append((cpt, norm))
122
123
# whitespaces, see "<White_Space>" https://www.unicode.org/Public/UCD/latest/ucd/PropList.txt
table_whitespace += [
    *range(0x0009, 0x000D + 1),
    0x0020, 0x0085, 0x00A0, 0x1680,
    *range(0x2000, 0x200A + 1),
    0x2028, 0x2029, 0x202F, 0x205F, 0x3000,
]


# sort every table by codepoint (the first element of each entry)
for table in (table_whitespace, table_lowercase, table_uppercase, table_nfd):
    table.sort()
135
136
# group ranges with same flags: record the first codepoint of every run of equal flags
run_flags = codepoint_flags[0]
ranges_flags: list[tuple[int, int]] = [(0, run_flags)]  # start, flags
for run_start, cpt_flags in enumerate(codepoint_flags):
    if cpt_flags == run_flags:
        continue  # still inside the current run
    run_flags = cpt_flags
    ranges_flags.append((run_start, cpt_flags))
# closing entry so that the last real range also has a "next start"
ranges_flags.append((MAX_CODEPOINTS, 0x0000))
143
144
# group ranges with same nfd: consecutive codepoints sharing one nfd value are merged
ranges_nfd: list[tuple[int, int, int]] = [(0, 0, 0)]  # start, last, nfd
for codepoint, norm in table_nfd:
    run_start, run_last, run_norm = ranges_nfd[-1]
    if run_last == codepoint - 1 and run_norm == norm:
        # directly follows the current range with the same nfd: grow it by one
        ranges_nfd[-1] = (run_start, codepoint, norm)
    else:
        # otherwise open a new single-codepoint range
        ranges_nfd.append((codepoint, codepoint, norm))
153
154
# Generate 'unicode-data.cpp':
#   python ./scripts/gen-unicode-data.py > unicode-data.cpp

def out(line=""):
    """Print *line* to stdout followed by a newline (an empty line by default)."""
    print(line)  # noqa
160
161
# file header of the generated C++ source
out("""\
// generated with scripts/gen-unicode-data.py

#include "unicode-data.h"

#include <cstdint>
#include <vector>
#include <unordered_map>
#include <unordered_set>
""")

# flag ranges: one {start, flags} entry per range
out("const std::vector<std::pair<uint32_t, uint16_t>> unicode_ranges_flags = {  // start, flags // last=next_start-1")
for range_start, range_flags in ranges_flags:
    out(f"{{0x{range_start:06X}, 0x{range_flags:04X}}},")
out("};\n")

# whitespace codepoints
out("const std::unordered_set<uint32_t> unicode_set_whitespace = {")
for ws_cpt in table_whitespace:
    out(f"0x{ws_cpt:06X},")
out("};\n")

# codepoint -> lowercase codepoint
out("const std::unordered_map<uint32_t, uint32_t> unicode_map_lowercase = {")
for cpt_from, cpt_to in table_lowercase:
    out(f"{{0x{cpt_from:06X}, 0x{cpt_to:06X}}},")
out("};\n")

# codepoint -> uppercase codepoint
out("const std::unordered_map<uint32_t, uint32_t> unicode_map_uppercase = {")
for cpt_from, cpt_to in table_uppercase:
    out(f"{{0x{cpt_from:06X}, 0x{cpt_to:06X}}},")
out("};\n")

# NFD ranges: one {start, last, nfd} entry per range
out("const std::vector<range_nfd> unicode_ranges_nfd = {  // start, last, nfd")
for nfd_start, nfd_last, nfd_value in ranges_nfd:
    out(f"{{0x{nfd_start:06X}, 0x{nfd_last:06X}, 0x{nfd_value:06X}}},")
out("};\n")