1 #!/usr/bin/env python3
2
3 """usage: ./gen-ucd-table ucd.nounihan.grouped.xml [/path/to/hb-common.h]
4
5 Input file:
6 * https://unicode.org/Public/UCD/latest/ucdxml/ucd.nounihan.grouped.zip
7 """
8
9 import sys, re
10 import logging
# Progress messages are logged at INFO level and above as "LEVEL: message".
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')

# Exactly one or two arguments must follow the script name; otherwise exit
# with the module docstring as the usage message.
if not 2 <= len(sys.argv) <= 3:
    sys.exit(__doc__)
15
16 # https://github.com/harfbuzz/packtab
17 import packTab
18 import packTab.ucdxml
19
# Parse the UCDXML file named by the first argument and extract its
# repertoire, which the rest of this script enumerates by code point.
logging.info('Loading UCDXML...')
ucdxml = packTab.ucdxml.load_ucdxml(sys.argv[1])
ucd = packTab.ucdxml.ucdxml_get_repertoire(ucdxml)

# The header scanned later for HB_SCRIPT_* entries: the optional second
# argument, falling back to hb-common.h in the current directory.
hb_common_h = sys.argv[2] if len(sys.argv) >= 3 else 'hb-common.h'
25
26 logging.info('Preparing data tables...')
27
28
29 # This is how the data is encoded:
30 #
31 # General_Category (gc), Canonical_Combining_Class (ccc),
32 # and Script (sc) are encoded as integers.
33 #
34 # Mirroring character (bmg) is encoded as difference from
35 # the original character.
36 #
37 # Composition & Decomposition (dm) are encoded elaborately,
38 # as discussed below.
39
# One entry per repertoire record, in repertoire order.
gc = [record['gc'] for record in ucd]
ccc = [int(record['ccc']) for record in ucd]
# A non-empty 'bmg' is a hexadecimal code point; store its distance from the
# character's own code point, and 0 when the field is empty.
bmg = [
    int(mirror, 16) - codepoint if mirror else 0
    for codepoint, mirror in enumerate(record['bmg'] for record in ucd)
]
sc = [record['sc'] for record in ucd]
44
45
46 # Prepare Compose / Decompose data
47 #
48 # This code is very dense. See hb_ucd_compose() / hb_ucd_decompose() for the logic.
49
# Canonical decompositions keyed by code point, each a tuple of code points.
# Skipped: records whose 'dm' is '#', records whose 'dt' is not 'can', and
# the 11172 code points starting at U+AC00 (the Hangul syllable range;
# presumably handled algorithmically by the C code -- confirm there).
dm = {i: tuple(int(v, 16) for v in u['dm'].split())
      for i, u in enumerate(ucd)
      if u['dm'] != '#' and u['dt'] == 'can' and not (0xAC00 <= i < 0xAC00 + 11172)}
# Code points whose Comp_Ex property is 'Y'.
ce = {i for i, u in enumerate(ucd) if u['Comp_Ex'] == 'Y'}

# Every decomposition must be exactly one or two code points long.  The
# length is checked directly (not the truthiness of the offending tuple), so
# an empty decomposition is rejected too instead of slipping through.
assert all(len(v) in (1, 2) for v in dm.values())

# Singleton decompositions, de-duplicated and sorted.  Only planes 0 and 2
# are supported; the low 16 bits of each value go into a per-plane array.
dm1 = sorted({v for v in dm.values() if len(v) == 1})
assert all((v[0] >> 16) in (0, 2) for v in dm1)
dm1_p0_array = ['0x%04Xu' % (v[0] & 0xFFFF) for v in dm1 if (v[0] >> 16) == 0]
dm1_p2_array = ['0x%04Xu' % (v[0] & 0xFFFF) for v in dm1 if (v[0] >> 16) == 2]
# Index 0 is taken by the "no decomposition" entry of dm_order below, so
# singleton indices start at 1.
dm1_order = {v: i for i, v in enumerate(dm1, 1)}

# Pair decompositions, sorted.  Each item is ((a, b, c), (a, b)) where c is
# the decomposing code point itself, or 0 when that code point is in `ce` or
# has a non-zero combining class.
dm2 = sorted((v + (i if i not in ce and not ccc[i] else 0,), v)
             for i, v in dm.items() if len(v) == 2)


def filt(v):
    """Return True if triple *v* qualifies for the compact uint32 array.

    That requires v[0] below 0x800, v[1] within 0x0300..0x037F, and v[2]
    with no bits set under the mask 0xFFF0C000.
    """
    # NOTE(review): the last mask leaves bits 16-19 of v[2] unchecked;
    # confirm that is intended for the 14-bit field the macro name implies.
    return ((v[0] & 0xFFFFF800) == 0x0000 and
            (v[1] & 0xFFFFFF80) == 0x0300 and
            (v[2] & 0xFFF0C000) == 0x0000)


dm2_u32_array = [v for v in dm2 if filt(v[0])]
dm2_u64_array = [v for v in dm2 if not filt(v[0])]
# dm2_order below indexes into dm2, which only matches the two emitted
# arrays if every compact entry sorts before every wide one.
assert dm2_u32_array + dm2_u64_array == dm2
dm2_u32_array = ["HB_CODEPOINT_ENCODE3_11_7_14 (0x%04Xu, 0x%04Xu, 0x%04Xu)" % v[0] for v in dm2_u32_array]
dm2_u64_array = ["HB_CODEPOINT_ENCODE3 (0x%04Xu, 0x%04Xu, 0x%04Xu)" % v[0] for v in dm2_u64_array]

# Pair indices continue after slot 0 and all singleton entries.
dm2_first_index = 1 + len(dm1_p0_array) + len(dm1_p2_array)
dm2_order = {v[1]: i for i, v in enumerate(dm2, dm2_first_index)}

# Combined mapping from a decomposition tuple (or None) to its index.
dm_order = {None: 0, **dm1_order, **dm2_order}
79
80
81 # Prepare General_Category / Script mapping arrays
82
# General_Category values in the order that defines their integer codes.
_gc_values = (
    'Cc', 'Cf', 'Cn', 'Co', 'Cs', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu',
    'Mc', 'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf',
    'Pi', 'Po', 'Ps', 'Sc', 'Sk', 'Sm', 'So', 'Zl', 'Zp', 'Zs',
)

# Two-way lookup in a single dict: integer code -> name and name -> code.
gc_order = {}
for index, value in enumerate(_gc_values):
    gc_order.update({index: value, value: index})
89
# Build the Script mapping from the header file.  Every line that names an
# HB_SCRIPT_* value together with an HB_TAG ('a','b','c','d') contributes one
# entry, numbered in file order:
#   sc_order: four-character tag -> index, and index -> tag
#   sc_array: index -> HB_SCRIPT_* name
sc_order = dict()
sc_array = []
sc_re = re.compile(r"\b(HB_SCRIPT_[_A-Z]*).*HB_TAG [(]'(.)','(.)','(.)','(.)'[)]")
# The context manager closes the header as soon as it has been scanned,
# rather than leaving the handle open until interpreter shutdown.
with open(hb_common_h) as header:
    for line in header:
        m = sc_re.search(line)
        if not m:
            continue
        name = m.group(1)
        # Groups 2..5 are the four tag characters.
        tag = ''.join(m.group(i) for i in range(2, 6))
        i = len(sc_array)
        sc_order[tag] = i
        sc_order[i] = tag
        sc_array.append(name)
102
103
104 # Write out main data
105
# Names of the three table variants that get generated.
DEFAULT, COMPACT, SLOPPY = 'DEFAULT', 'COMPACT', 'SLOPPY'

# Compression setting associated with each variant.
compression_level = {DEFAULT: 5, COMPACT: 9, SLOPPY: 9}
115
logging.info('Generating output...')

# Banner comment recording how the table was produced, then the include
# guard and the hb.hh include.
for banner_line in (
    "/* == Start of generated table == */",
    "/*",
    " * The following table is generated by running:",
    " *",
    " * ./gen-ucd-table.py ucd.nounihan.grouped.xml",
    " *",
):
    print(banner_line)
# Two print arguments: the description follows the label after one space.
print(" * on file with this description:", ucdxml.description)
for banner_line in (
    " */",
    "",
    "#ifndef HB_UCD_TABLE_HH",
    "#define HB_UCD_TABLE_HH",
    "",
    '#include "hb.hh"',
    "",
):
    print(banner_line)
131
132
133 # Write mapping data
134
code = packTab.Code('_hb_ucd')

# (C element type, array name, values) for each lookup array, in the order
# they are registered with `code`.
mapping_arrays = (
    ('hb_script_t', 'sc_map', sc_array),
    ('uint16_t', 'dm1_p0_map', dm1_p0_array),
    ('uint16_t', 'dm1_p2_map', dm1_p2_array),
    ('uint32_t', 'dm2_u32_map', dm2_u32_array),
    ('uint64_t', 'dm2_u64_map', dm2_u64_array),
)
# Each addArray() call returns a pair; the first item replaces the Python
# list of the same name and the second is discarded.
(sc_array, _), (dm1_p0_array, _), (dm1_p2_array, _), (dm2_u32_array, _), (dm2_u64_array, _) = [
    code.addArray(ctype, array_name, values)
    for ctype, array_name, values in mapping_arrays
]
code.print_c(linkage='static inline')

# (name, per-code-point data, default value, value -> integer mapping or None)
datasets = [
    ('gc', gc, 'Cn', gc_order),
    ('ccc', ccc, 0, None),
    ('bmg', bmg, 0, None),
    ('sc', sc, 'Zzzz', sc_order),
    ('dm', dm, None, dm_order),
]
150
151
152 # Write main data
153
def _absorb_default_runs(values, default):
    """Replace *default* entries of *values* in place with a neighbour's value.

    A forward pass copies the preceding entry into each slot still holding
    *default*; a backward pass then copies the following entry into any that
    remain.  Indices that are multiples of 128 are skipped going forward and
    indices just before a multiple of 128 are skipped going backward, so no
    value is carried across a 128-entry boundary.
    """
    for pos in range(len(values)):
        if pos % 128 and values[pos] == default:
            values[pos] = values[pos - 1]
    for pos in range(len(values) - 2, -1, -1):
        if (pos + 1) % 128 and values[pos] == default:
            values[pos] = values[pos + 1]


# Preprocessor line that opens each variant's branch of the #if chain.
variant_guard = {
    DEFAULT: '#ifndef HB_OPTIMIZE_SIZE',
    COMPACT: '#elif !defined(HB_NO_UCD_UNASSIGNED)',
    SLOPPY: '#else',
}

for step in (DEFAULT, COMPACT, SLOPPY):
    compression = compression_level[step]
    logging.info(' Compression=%d:' % compression)
    print()
    print(variant_guard[step])
    print()

    if step == SLOPPY:
        # gc and sc are the same list objects that `datasets` refers to, so
        # they are modified in place.  The change is destructive, which is
        # safe only because SLOPPY is the last variant generated.
        _absorb_default_runs(gc, 'Cn')
        _absorb_default_runs(sc, 'Zzzz')

    # Pack every dataset at this variant's compression level and emit the
    # resulting C code in one go.
    code = packTab.Code('_hb_ucd')
    for name, data, default, mapping in datasets:
        sol = packTab.pack_table(data, default, mapping=mapping, compression=compression)
        logging.info(' Dataset=%-8s FullCost=%d' % (name, sol.fullCost))
        sol.genCode(code, name)
    code.print_c(linkage='static inline')

    print()
193
194
# Close the variant #if chain and the include guard, then the end marker.
for footer_line in (
    '#endif',
    '',
    '',
    "#endif /* HB_UCD_TABLE_HH */",
    '',
    "/* == End of generated table == */",
):
    print(footer_line)
logging.info('Done.')