-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathgenerate_unicode_confusables_data.py
135 lines (110 loc) · 3.78 KB
/
generate_unicode_confusables_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
# Needs UnicodeData.txt and confusables.txt in the current directory.
#
# Those can be obtained from unicode.org:
# - http://www.unicode.org/Public/security/<VERSION>/confusables.txt
# - http://www.unicode.org/Public/<VERSION>/ucd/UnicodeData.txt
#
# If executed as a script, it prints the contents of one of the generated
# files to stdout, selected by the first argument:
# - `src/base/unicode/confusables.h`:
#     python3 scripts/generate_unicode_confusables_data.py header > src/base/unicode/confusables.h
# - `src/base/unicode/confusables_data.h`:
#     python3 scripts/generate_unicode_confusables_data.py data > src/base/unicode/confusables_data.h
import sys
import unicode
def generate_decompositions():
    """Build the code point -> replacement sequence table.

    Loads the rows provided by ``unicode.data()`` and
    ``unicode.confusables()``, then, for every code point that is ignorable,
    has a non-empty decomposition entry, or has a confusable entry,
    repeatedly substitutes decompositions and confusable targets until the
    sequence stops changing.

    Returns:
        A dict mapping each such code point to a list of code points, with
        ignorable code points (categories C, M and Z) removed from the list.
    """
    unicode_rows = unicode.data()
    confusable_rows = unicode.confusables()

    def category(prefix):
        # Collect every code point whose General_Category starts with `prefix`.
        members = set()
        for row in unicode_rows:
            if row["General_Category"].startswith(prefix):
                members.add(unicode.unhex(row["Value"]))
        return members

    # TODO: Is this correct? They changed the decompositioning format
    all_mappings = {}
    for row in unicode_rows:
        codepoint = unicode.unhex(row["Value"])
        all_mappings[codepoint] = unicode.unhex_sequence(row["Decomposition_Type"])
    # Keep only code points that actually have a (non-empty) mapping.
    nfd = dict((codepoint, seq) for codepoint, seq in all_mappings.items() if seq)

    confusable_map = {}
    for row in confusable_rows:
        source = unicode.unhex(row["Value"])
        confusable_map[source] = unicode.unhex_sequence(row["Target"])

    # C: Control
    # M: Combining
    # Z: Space
    ignore = category("C") | category("M") | category("Z")

    # Manual additions on top of confusables.txt.
    confusable_map[0x006C] = [0x0069]  # LATIN SMALL LETTER L -> LATIN SMALL LETTER I
    confusable_map[0x00A1] = [0x0069]  # INVERTED EXCLAMATION MARK -> LATIN SMALL LETTER I
    confusable_map[0x2800] = []  # BRAILLE PATTERN BLANK
    confusable_map[0xFFFC] = []  # OBJECT REPLACEMENT CHARACTER

    interesting = ignore | set(nfd) | set(confusable_map)

    def substitute(sequence, replacements):
        # Replace each code point by its mapped sequence, or keep it as-is.
        expanded = []
        for codepoint in sequence:
            expanded.extend(replacements.get(codepoint, [codepoint]))
        return expanded

    def gen(start):
        current = [start]
        while True:
            decomposed = substitute(current, nfd)
            replaced = substitute(decomposed, confusable_map)
            # Apply substitutions until convergence.
            if decomposed != current or replaced != current:
                current = replaced
                continue
            break
        return [codepoint for codepoint in current if codepoint not in ignore]

    table = {}
    for codepoint in interesting:
        table[codepoint] = gen(codepoint)
    return table
def gen_header(decompositions, len_set):
    """Print the C header declaring the confusables tables to stdout.

    Args:
        decompositions: Sized collection of decompositions; only its length
            is used (emitted as ``NUM_DECOMPS``).
        len_set: Sized collection of distinct decomposition lengths; only its
            length is used (emitted as ``NUM_DECOMP_LENGTHS``).
    """
    # The struct text ends with a newline, so print() leaves one blank line
    # after it.
    struct_decl = (
        "#include <stdint.h>\n"
        "struct DECOMP_SLICE\n"
        "{\n"
        "\tuint16_t offset : 13;\n"
        "\tuint16_t length : 3;\n"
        "};\n"
    )
    print(struct_decl)

    print("enum\n{")
    print(f"\tNUM_DECOMP_LENGTHS = {len(len_set)},")
    print(f"\tNUM_DECOMPS = {len(decompositions)},")
    # Closing brace followed by one blank separator line.
    print("};\n")

    extern_decls = (
        "extern const uint8_t decomp_lengths[NUM_DECOMP_LENGTHS];",
        "extern const int32_t decomp_chars[NUM_DECOMPS];",
        "extern const struct DECOMP_SLICE decomp_slices[NUM_DECOMPS];",
        "extern const int32_t decomp_data[];",
    )
    print("\n".join(extern_decls))
def _first_index_map(items):
    """Map each item to the index of its first occurrence (like list.index)."""
    positions = {}
    for position, item in enumerate(items):
        positions.setdefault(item, position)
    return positions


def gen_data(decompositions, decomposition_set, decomposition_offsets, len_set):
    """Print the C definitions of the confusables tables to stdout.

    Args:
        decompositions: Dict mapping a code point to its sequence of
            replacement code points.
        decomposition_set: Sequence of the distinct decompositions (as
            tuples), in the order they are laid out in ``decomp_data``.
        decomposition_offsets: For each entry of ``decomposition_set``, its
            start offset inside ``decomp_data``.
        len_set: Sequence of the distinct decomposition lengths; a slice
            stores the index into this table, not the length itself.

    Raises:
        ValueError: If a decomposition or its length is missing from
            ``decomposition_set`` / ``len_set``.
    """
    print("""\
#ifndef CONFUSABLES_DATA
#error "This file should only be included in `confusables.cpp`"
#endif
""")

    print("const uint8_t decomp_lengths[NUM_DECOMP_LENGTHS] = {")
    for l in len_set:
        print(f"\t{l},")
    print("};")
    print()

    # Both tables are emitted in ascending code point order so that entry i of
    # decomp_chars corresponds to entry i of decomp_slices.
    sorted_chars = sorted(decompositions)

    print("const int32_t decomp_chars[NUM_DECOMPS] = {")
    for k in sorted_chars:
        print(f"\t0x{k:x},")
    print("};")
    print()

    # Build the lookups once instead of calling list.index() per code point,
    # which made this loop quadratic in the number of decompositions.
    set_index = _first_index_map(decomposition_set)
    len_index = _first_index_map(len_set)

    print("const struct DECOMP_SLICE decomp_slices[NUM_DECOMPS] = {")
    for k in sorted_chars:
        d = decompositions[k]
        try:
            i = set_index[tuple(d)]
            l = len_index[len(d)]
        except KeyError as err:
            # list.index() raised ValueError for a missing entry; keep that.
            raise ValueError(
                f"decomposition of 0x{k:x} is missing from the lookup tables"
            ) from err
        print(f"\t{{{decomposition_offsets[i]}, {l}}},")
    print("};")
    print()

    print("const int32_t decomp_data[] = {")
    for d in decomposition_set:
        for c in d:
            print(f"\t0x{c:x},")
    print("};")
def main():
    """Generate the tables and print the file selected on the command line.

    Prints the header when ``header`` is among the command-line arguments,
    otherwise the data file when ``data`` is; prints nothing if neither is
    given.

    Raises:
        ValueError: If the tables cannot be packed into ``DECOMP_SLICE``
            (more than 8 distinct lengths, or an offset that does not fit
            into 13 bits).
    """
    decompositions = generate_decompositions()

    # Deduplicate
    decomposition_set = sorted(set(tuple(x) for x in decompositions.values()))
    len_set = sorted(set(len(x) for x in decomposition_set))

    # The slice stores an index into the length table in a 3-bit field.
    if len(len_set) > 8:
        raise ValueError("Can't pack offset (13 bit) together with len (>3bit)")

    # Lay the distinct decompositions out back to back.
    cur_offset = 0
    decomposition_offsets = []
    for d in decomposition_set:
        decomposition_offsets.append(cur_offset)
        cur_offset += len(d)

    # The slice stores the offset in a 13-bit field; offsets only grow, so
    # checking the last one is enough.
    max_offset = (1 << 13) - 1
    if decomposition_offsets and decomposition_offsets[-1] > max_offset:
        raise ValueError(
            f"Can't pack offset {decomposition_offsets[-1]} into 13 bit "
            f"(max {max_offset})"
        )

    header = "header" in sys.argv
    data = "data" in sys.argv

    if header:
        gen_header(decompositions, len_set)
    elif data:
        gen_data(decompositions, decomposition_set, decomposition_offsets, len_set)
# Only generate output when run as a script, not when imported.
if __name__ == "__main__":
    main()