1 #!/usr/bin/env python3
2 # flake8: noqa: F821
3
4 """usage: ./gen-use-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt ArabicShaping.txt DerivedCoreProperties.txt UnicodeData.txt Blocks.txt Scripts.txt IndicSyllabicCategory-Additional.txt IndicPositionalCategory-Additional.txt
5
6 Input files:
7 * https://unicode.org/Public/UCD/latest/ucd/IndicSyllabicCategory.txt
8 * https://unicode.org/Public/UCD/latest/ucd/IndicPositionalCategory.txt
9 * https://unicode.org/Public/UCD/latest/ucd/ArabicShaping.txt
10 * https://unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt
11 * https://unicode.org/Public/UCD/latest/ucd/UnicodeData.txt
12 * https://unicode.org/Public/UCD/latest/ucd/Blocks.txt
13 * https://unicode.org/Public/UCD/latest/ucd/Scripts.txt
14 * ms-use/IndicSyllabicCategory-Additional.txt
15 * ms-use/IndicPositionalCategory-Additional.txt
16 """
17
18 import logging
19 logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.INFO)
20
21
22 import sys
23
# Require exactly nine input files, in the order given in the module
# docstring above; otherwise print the usage text and exit.
if len (sys.argv) != 10:
	sys.exit (__doc__)
26
# Scripts whose codepoints are parsed but dropped from the generated
# table (see the `combined` filter below); they are handled by other
# shapers, not the Universal Shaping Engine.
DISABLED_SCRIPTS = {
	'Arabic',
	'Lao',
	'Samaritan',
	'Syriac',
	'Thai',
}
34
files = [open (x, encoding='utf-8') for x in sys.argv[1:]]

# Capture the two-line version/date header of every input file except
# UnicodeData.txt (index 4), which has no header; these get echoed into
# the generated table's provenance comment.
headers = [[f.readline () for i in range (2)] for j,f in enumerate(files) if j != 4]
# The two ms-use "Additional" files (indices 7 and 8) carry a longer
# free-form header that ends at the first blank line; append it in full.
# Their slots in `headers` are shifted down by one because index 4 was
# skipped in the comprehension above.
for j in range(7, 9):
	for line in files[j]:
		line = line.rstrip()
		if not line:
			break
		headers[j - 1].append(line)
headers.append (["UnicodeData.txt does not have a header."])
45
# unicode_data[i][codepoint] -> property value string for file i;
# values[i][value] -> rough count of codepoints carrying that value.
unicode_data = [{} for _ in files]
values = [{} for _ in files]
for i, f in enumerate (files):
	for line in f:

		# Strip trailing comments.
		j = line.find ('#')
		if j >= 0:
			line = line[:j]

		# Skip blank / comment-only lines (no ';'-separated fields).
		fields = [x.strip () for x in line.split (';')]
		if len (fields) == 1:
			continue

		# First field is a codepoint or a `XXXX..YYYY` range.
		uu = fields[0].split ('..')
		start = int (uu[0], 16)
		if len (uu) == 1:
			end = start
		else:
			end = int (uu[1], 16)

		# ArabicShaping.txt (i==2) and UnicodeData.txt (i==4) carry the
		# value of interest in their third field, everything else in the
		# second.
		t = fields[1 if i not in [2, 4] else 2]

		if i == 2:
			# Prefix Joining_Type values so they can't collide with
			# other property-value names.
			t = 'jt_' + t
		elif i == 3 and t != 'Default_Ignorable_Code_Point':
			# From DerivedCoreProperties.txt we only want DI.
			continue
		elif i == 7 and t == 'Consonant_Final_Modifier':
			# TODO: https://github.com/MicrosoftDocs/typography-issues/issues/336
			t = 'Syllable_Modifier'
		elif i == 8 and t == 'NA':
			t = 'Not_Applicable'

		# The two ms-use Additional files (7, 8) override the base UCD
		# files (0, 1) by folding into the same slot.
		i0 = i if i < 7 else i - 7
		for u in range (start, end + 1):
			unicode_data[i0][u] = t
		values[i0][t] = values[i0].get (t, 0) + end - start + 1
82
# Default value per property slot: UISC, UIPC, Joining_Type, DI, GC,
# Block, Script — same order as the per-file dicts above.
defaults = ('Other', 'Not_Applicable', 'jt_X', '', 'Cn', 'No_Block', 'Unknown')

# Merge data into one dict:
for i,v in enumerate (defaults):
	values[i][v] = values[i].get (v, 0) + 1
# combined[codepoint] -> 7-element list of property values.  Only the
# shaping-relevant files (indices 0..3: UISC, UIPC, joining, DI) may
# create a new entry; GC/Block/Script (indices 4..6) only annotate
# codepoints already present.
combined = {}
for i,d in enumerate (unicode_data):
	for u,v in d.items ():
		if not u in combined:
			if i >= 4:
				continue
			combined[u] = list (defaults)
		combined[u][i] = v
# Drop codepoints belonging to scripts the USE shaper does not handle.
combined = {k: v for k, v in combined.items() if v[6] not in DISABLED_SCRIPTS}
97
98
# Every property value the predicates below may reference.  Each name is
# interned as a PropertyValue singleton and injected into the module
# globals (see the loop after the class definition).
property_names = [
	# General_Category
	'Cc', 'Cf', 'Cn', 'Co', 'Cs', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu', 'Mc',
	'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf', 'Pi', 'Po',
	'Ps', 'Sc', 'Sk', 'Sm', 'So', 'Zl', 'Zp', 'Zs',
	# Indic_Syllabic_Category
	'Other',
	'Bindu',
	'Visarga',
	'Avagraha',
	'Nukta',
	'Virama',
	'Pure_Killer',
	'Invisible_Stacker',
	'Vowel_Independent',
	'Vowel_Dependent',
	'Vowel',
	'Consonant_Placeholder',
	'Consonant',
	'Consonant_Dead',
	'Consonant_With_Stacker',
	'Consonant_Prefixed',
	'Consonant_Preceding_Repha',
	'Consonant_Succeeding_Repha',
	'Consonant_Subjoined',
	'Consonant_Medial',
	'Consonant_Final',
	'Consonant_Head_Letter',
	'Consonant_Initial_Postfixed',
	'Modifying_Letter',
	'Tone_Letter',
	'Tone_Mark',
	'Gemination_Mark',
	'Cantillation_Mark',
	'Register_Shifter',
	'Syllable_Modifier',
	'Consonant_Killer',
	'Non_Joiner',
	'Joiner',
	'Number_Joiner',
	'Number',
	'Brahmi_Joining_Number',
	'Symbol_Modifier',
	'Hieroglyph',
	'Hieroglyph_Joiner',
	'Hieroglyph_Mark_Begin',
	'Hieroglyph_Mark_End',
	'Hieroglyph_Mirror',
	'Hieroglyph_Modifier',
	'Hieroglyph_Segment_Begin',
	'Hieroglyph_Segment_End',
	# Indic_Positional_Category
	'Not_Applicable',
	'Right',
	'Left',
	'Visual_Order_Left',
	'Left_And_Right',
	'Top',
	'Bottom',
	'Top_And_Bottom',
	'Top_And_Bottom_And_Left',
	'Top_And_Right',
	'Top_And_Left',
	'Top_And_Left_And_Right',
	'Bottom_And_Left',
	'Bottom_And_Right',
	'Top_And_Bottom_And_Right',
	'Overstruck',
	# Joining_Type
	'jt_C',
	'jt_D',
	'jt_L',
	'jt_R',
	'jt_T',
	'jt_U',
	'jt_X',
]
176
class PropertyValue:
	"""Interned property-value name that compares equal to its string.

	Instances are injected into the module globals (see below) so the
	predicates can write e.g. ``UISC == Consonant``.  Equality accepts
	either a plain ``str`` or another PropertyValue, and ``__hash__``
	matches ``hash(str(self))`` so a PropertyValue and its name are
	interchangeable as dict/set keys.
	"""
	def __init__(self, name_):
		self.name = name_
	def __str__(self):
		return self.name
	def __eq__(self, other):
		# Compare by name, against either a string or a PropertyValue.
		return self.name == (other if isinstance(other, str) else other.name)
	def __ne__(self, other):
		return not (self == other)
	def __hash__(self):
		return hash(str(self))
188
property_values = {}

# Intern each name and inject the singletons into the module globals so
# the predicates below can reference them bare (`Consonant`, `jt_C`, …);
# the `# flake8: noqa: F821` at the top of the file silences the
# resulting undefined-name warnings.
for name in property_names:
	value = PropertyValue(name)
	assert value not in property_values
	# `in globals()` tests dict keys; works because PropertyValue.__eq__
	# and __hash__ make a value equivalent to its name string.
	assert value not in globals()
	property_values[name] = value
globals().update(property_values)
197
198
# One predicate per USE category.  Arguments: U = codepoint,
# UISC = Indic_Syllabic_Category, UDI = Default_Ignorable flag,
# UGC = General_Category, AJT = Joining_Type (prefixed 'jt_').
# map_to_use() asserts that exactly one predicate matches per codepoint,
# so these are written to be mutually exclusive; several call each other
# to carve out exceptions.
def is_BASE(U, UISC, UDI, UGC, AJT):
	return (UISC in [Number, Consonant, Consonant_Head_Letter,
			Tone_Letter,
			Vowel_Independent,
			] or
		# TODO: https://github.com/MicrosoftDocs/typography-issues/issues/484
		AJT in [jt_C, jt_D, jt_L, jt_R] and UISC != Joiner or
		(UGC == Lo and UISC in [Avagraha, Bindu, Consonant_Final, Consonant_Medial,
					Consonant_Subjoined, Vowel, Vowel_Dependent]))
def is_BASE_NUM(U, UISC, UDI, UGC, AJT):
	return UISC == Brahmi_Joining_Number
def is_BASE_OTHER(U, UISC, UDI, UGC, AJT):
	if UISC == Consonant_Placeholder: return True
	# Generic bases: HORIZONTAL BAR, BULLET, and the white/black squares.
	return U in [0x2015, 0x2022, 0x25FB, 0x25FC, 0x25FD, 0x25FE]
def is_CGJ(U, UISC, UDI, UGC, AJT):
	# Also includes VARIATION_SELECTOR and ZWJ
	return UISC == Joiner or UDI and UGC in [Mc, Me, Mn]
def is_CONS_FINAL(U, UISC, UDI, UGC, AJT):
	return ((UISC == Consonant_Final and UGC != Lo) or
		UISC == Consonant_Succeeding_Repha)
def is_CONS_FINAL_MOD(U, UISC, UDI, UGC, AJT):
	return UISC == Syllable_Modifier
def is_CONS_MED(U, UISC, UDI, UGC, AJT):
	# Consonant_Initial_Postfixed is new in Unicode 11; not in the spec.
	return (UISC == Consonant_Medial and UGC != Lo or
		UISC == Consonant_Initial_Postfixed)
def is_CONS_MOD(U, UISC, UDI, UGC, AJT):
	return UISC in [Nukta, Gemination_Mark, Consonant_Killer]
def is_CONS_SUB(U, UISC, UDI, UGC, AJT):
	return UISC == Consonant_Subjoined and UGC != Lo
def is_CONS_WITH_STACKER(U, UISC, UDI, UGC, AJT):
	return UISC == Consonant_With_Stacker
def is_HALANT(U, UISC, UDI, UGC, AJT):
	# Viramas, minus the one carved out as HVM below.
	return UISC == Virama and not is_HALANT_OR_VOWEL_MODIFIER(U, UISC, UDI, UGC, AJT)
def is_HALANT_OR_VOWEL_MODIFIER(U, UISC, UDI, UGC, AJT):
	# Split off of HALANT
	# U+0DCA SINHALA SIGN AL-LAKUNA
	return U == 0x0DCA
def is_HALANT_NUM(U, UISC, UDI, UGC, AJT):
	return UISC == Number_Joiner
def is_HIEROGLYPH(U, UISC, UDI, UGC, AJT):
	return UISC == Hieroglyph
def is_HIEROGLYPH_JOINER(U, UISC, UDI, UGC, AJT):
	return UISC == Hieroglyph_Joiner
def is_HIEROGLYPH_MIRROR(U, UISC, UDI, UGC, AJT):
	return UISC == Hieroglyph_Mirror
def is_HIEROGLYPH_MOD(U, UISC, UDI, UGC, AJT):
	return UISC == Hieroglyph_Modifier
def is_HIEROGLYPH_SEGMENT_BEGIN(U, UISC, UDI, UGC, AJT):
	return UISC in [Hieroglyph_Mark_Begin, Hieroglyph_Segment_Begin]
def is_HIEROGLYPH_SEGMENT_END(U, UISC, UDI, UGC, AJT):
	return UISC in [Hieroglyph_Mark_End, Hieroglyph_Segment_End]
def is_INVISIBLE_STACKER(U, UISC, UDI, UGC, AJT):
	# Split off of HALANT
	return (UISC == Invisible_Stacker
		and not is_SAKOT(U, UISC, UDI, UGC, AJT)
	)
def is_ZWNJ(U, UISC, UDI, UGC, AJT):
	return UISC == Non_Joiner
def is_OTHER(U, UISC, UDI, UGC, AJT):
	# Also includes BASE_IND and SYM
	return ((UGC == Po or UISC in [Consonant_Dead, Joiner, Modifying_Letter, Other])
		and not is_BASE(U, UISC, UDI, UGC, AJT)
		and not is_BASE_OTHER(U, UISC, UDI, UGC, AJT)
		and not is_CGJ(U, UISC, UDI, UGC, AJT)
		and not is_SYM_MOD(U, UISC, UDI, UGC, AJT)
		and not is_Word_Joiner(U, UISC, UDI, UGC, AJT)
	)
def is_REPHA(U, UISC, UDI, UGC, AJT):
	return UISC in [Consonant_Preceding_Repha, Consonant_Prefixed]
def is_SAKOT(U, UISC, UDI, UGC, AJT):
	# Split off of HALANT
	# U+1A60 TAI THAM SIGN SAKOT
	return U == 0x1A60
def is_SYM_MOD(U, UISC, UDI, UGC, AJT):
	return UISC == Symbol_Modifier
def is_VOWEL(U, UISC, UDI, UGC, AJT):
	return (UISC == Pure_Killer or
		UGC != Lo and UISC in [Vowel, Vowel_Dependent])
def is_VOWEL_MOD(U, UISC, UDI, UGC, AJT):
	return (UISC in [Tone_Mark, Cantillation_Mark, Register_Shifter, Visarga] or
		UGC != Lo and UISC == Bindu)
def is_Word_Joiner(U, UISC, UDI, UGC, AJT):
	# Also includes Rsv
	# The excluded list: Hangul fillers (U+115F, U+1160, U+3164, U+FFA0)
	# and Duployan shorthand format controls (U+1BCA0..U+1BCA3).
	return (UDI and U not in [0x115F, 0x1160, 0x3164, 0xFFA0, 0x1BCA0, 0x1BCA1, 0x1BCA2, 0x1BCA3]
		and UISC == Other
		and not is_CGJ(U, UISC, UDI, UGC, AJT)
	) or UGC == Cn
285
# USE category code -> classifying predicate.  The codes become the
# C macro names (and, with positional suffixes, the table entries) in
# the generated header.
use_mapping = {
	'B': is_BASE,
	'N': is_BASE_NUM,
	'GB': is_BASE_OTHER,
	'CGJ': is_CGJ,
	'F': is_CONS_FINAL,
	'FM': is_CONS_FINAL_MOD,
	'M': is_CONS_MED,
	'CM': is_CONS_MOD,
	'SUB': is_CONS_SUB,
	'CS': is_CONS_WITH_STACKER,
	'H': is_HALANT,
	'HVM': is_HALANT_OR_VOWEL_MODIFIER,
	'HN': is_HALANT_NUM,
	'IS': is_INVISIBLE_STACKER,
	'G': is_HIEROGLYPH,
	'HM': is_HIEROGLYPH_MOD,
	'HR': is_HIEROGLYPH_MIRROR,
	'J': is_HIEROGLYPH_JOINER,
	'SB': is_HIEROGLYPH_SEGMENT_BEGIN,
	'SE': is_HIEROGLYPH_SEGMENT_END,
	'ZWNJ': is_ZWNJ,
	'O': is_OTHER,
	'R': is_REPHA,
	'Sk': is_SAKOT,
	'SM': is_SYM_MOD,
	'V': is_VOWEL,
	'VM': is_VOWEL_MOD,
	'WJ': is_Word_Joiner,
}
316
# Positional sub-classification per USE category: suffix -> the list of
# Indic_Positional_Category values it covers (e.g. 'V' + 'Abv' -> VAbv).
# A value of None means the category participates in positional checks
# but gets no suffix.  Categories absent here skip positional
# resolution entirely.
use_positions = {
	'F': {
		'Abv': [Top],
		'Blw': [Bottom],
		'Pst': [Right],
	},
	'M': {
		'Abv': [Top],
		'Blw': [Bottom, Bottom_And_Left, Bottom_And_Right],
		'Pst': [Right],
		'Pre': [Left, Top_And_Bottom_And_Left],
	},
	'CM': {
		'Abv': [Top],
		'Blw': [Bottom, Overstruck],
	},
	'V': {
		'Abv': [Top, Top_And_Bottom, Top_And_Bottom_And_Right, Top_And_Right],
		'Blw': [Bottom, Overstruck, Bottom_And_Right],
		'Pst': [Right],
		'Pre': [Left, Top_And_Left, Top_And_Left_And_Right, Left_And_Right],
	},
	'VM': {
		'Abv': [Top],
		'Blw': [Bottom, Overstruck],
		'Pst': [Right],
		'Pre': [Left],
	},
	'SM': {
		'Abv': [Top],
		'Blw': [Bottom],
	},
	'H': None,
	'HM': None,
	'HR': None,
	'HVM': None,
	'IS': None,
	'B': None,
	'FM': {
		'Abv': [Top],
		'Blw': [Bottom],
		'Pst': [Not_Applicable],
	},
	'R': None,
	'SUB': None,
}
363
def map_to_use(data):
	"""Classify every codepoint into its USE category.

	`data` maps codepoint -> (UISC, UIPC, AJT, UDI, UGC, Block, Script).
	Returns a dict of codepoint -> (USE category string, Block name),
	where the category may carry a positional suffix (e.g. 'VAbv').
	Asserts that exactly one category predicate matches each codepoint.
	"""
	result = {}
	predicates = use_mapping.items()
	for u, (uisc, uipc, ajt, udi, ugc, block, _) in data.items():

		# Resolve Indic_Syllabic_Category

		# TODO: These don't have UISC assigned in Unicode 13.0.0, but have UIPC
		if 0x1CE2 <= u <= 0x1CE8:
			uisc = Cantillation_Mark

		# Tibetan:
		# TODO: These don't have UISC assigned in Unicode 13.0.0, but have UIPC
		if 0x0F18 <= u <= 0x0F19 or 0x0F3E <= u <= 0x0F3F:
			uisc = Vowel_Dependent

		# TODO: U+1CED should only be allowed after some of
		# the nasalization marks, maybe only for U+1CE9..U+1CF1.
		if u == 0x1CED:
			uisc = Tone_Mark

		matches = [name for name, pred in predicates if pred(u, uisc, udi, ugc, ajt)]
		assert len(matches) == 1, "%s %s %s %s %s %s" % (hex(u), uisc, udi, ugc, ajt, matches)
		category = matches[0]

		# Resolve Indic_Positional_Category

		# TODO: https://github.com/harfbuzz/harfbuzz/pull/1037
		# and https://github.com/harfbuzz/harfbuzz/issues/1631
		if u in [0x11302, 0x11303, 0x114C1]:
			uipc = Top

		assert (uipc in [Not_Applicable, Visual_Order_Left] or u == 0x0F7F or
			category in use_positions), "%s %s %s %s %s %s %s" % (hex(u), uipc, category, uisc, udi, ugc, ajt)

		# Append the positional suffix, if this category takes one.
		positions = use_positions.get(category, None)
		if positions:
			matches = [suffix for suffix, vals in positions.items() if vals and uipc in vals]
			assert len(matches) == 1, "%s %s %s %s %s %s %s %s" % (hex(u), uipc, category, uisc, udi, ugc, ajt, matches)
			category += matches[0]

		result[u] = (category, block)
	return result
403
use_data = map_to_use(combined)

# Emit the generated file's preamble: provenance comment (command line
# plus every input file's header), include guard, and includes.
print ("/* == Start of generated table == */")
print ("/*")
print (" * The following table is generated by running:")
print (" *")
print (" * {} IndicSyllabicCategory.txt IndicPositionalCategory.txt ArabicShaping.txt DerivedCoreProperties.txt UnicodeData.txt Blocks.txt Scripts.txt IndicSyllabicCategory-Additional.txt IndicPositionalCategory-Additional.txt".format (sys.argv[0]))
print (" *")
print (" * on files with these headers:")
print (" *")
for h in headers:
	for l in h:
		print (" * %s" % (l.strip()))
print (" */")
print ()
print ("#ifndef HB_OT_SHAPER_USE_TABLE_HH")
print ("#define HB_OT_SHAPER_USE_TABLE_HH")
print ()
print ('#include "hb.hh"')
print ()
print ('#include "hb-ot-shaper-use-machine.hh"')
print ()
426
# Running statistics over all blocks printed by print_block.
total = 0
used = 0
last_block = None
# NOTE(review): nothing in this file calls print_block any more (table
# emission goes through packTab below); candidate for removal.
def print_block (block, start, end, use_data):
	"""Print one Unicode block of the category table, 16 columns per row.

	`block` is the block name (printed as a banner when it changes);
	`start`..`end` is the inclusive codepoint range; `use_data` maps
	codepoint -> (USE category, block).
	"""
	global total, used, last_block
	if block and block != last_block:
		print ()
		print ()
		print (" /* %s */" % block)
	if start % 16:
		# Pad so a range starting mid-row keeps its columns aligned.
		print (' ' * (20 + (start % 16 * 6)), end='')
	num = 0
	assert start % 8 == 0
	assert (end+1) % 8 == 0
	for u in range (start, end+1):
		if u % 16 == 0:
			print ()
			print (" /* %04X */" % u, end='')
		if u in use_data:
			num += 1
		d = use_data.get (u)
		if d is not None:
			d = d[0]
		elif u in unicode_data[4]:
			# Assigned in UnicodeData.txt but unclassified: Other.
			d = 'O'
		else:
			# Unassigned codepoint: Word Joiner class.
			d = 'WJ'
		print ("%6s," % d, end='')

	total += end - start + 1
	used += num
	if block:
		last_block = block
460
# Emit a short macro per USE category so the table body stays compact.
# Categories with positional variants get one macro per (category,
# suffix) pair instead of a bare one.
# (Removed dead locals `uu`, `last`, `num`, `offset`, `starts`, `ends`
# that were never read anywhere in this file.)
print ('#pragma GCC diagnostic push')
print ('#pragma GCC diagnostic ignored "-Wunused-macros"')
for k,v in sorted(use_mapping.items()):
	if k in use_positions and use_positions[k]: continue
	print ("#define %s USE(%s) /* %s */" % (k, k, v.__name__[3:]))
for k,v in sorted(use_positions.items()):
	if not v: continue
	for suf in v.keys():
		tag = k + suf
		print ("#define %s USE(%s)" % (tag, tag))
print ('#pragma GCC diagnostic pop')
print ("")
480
481
import packTab
# Flatten to codepoint -> USE category (drop the block name).
data = {u:v[0] for u,v in use_data.items()}

# Emit two variants of the lookup table: the default (faster) one and a
# COMPACT one selected by HB_OPTIMIZE_SIZE, bracketed by #ifndef/#else.
# The numbers are packTab compression levels.
DEFAULT = 5
COMPACT = 9
for compression in (DEFAULT, COMPACT):

	logging.info(' Compression=%d:' % compression)
	print()
	if compression == DEFAULT:
		print('#ifndef HB_OPTIMIZE_SIZE')
	elif compression == COMPACT:
		print('#else')
	else:
		assert False
	print()

	code = packTab.Code('hb_use')
	sol = packTab.pack_table(data, compression=compression, default='O')
	logging.info(' FullCost=%d' % (sol.fullCost))
	sol.genCode(code, f'get_category')
	code.print_c(linkage='static inline')
	print ()

print('#endif')
507
print ()
# Undefine the shorthand macros emitted above, mirroring the same
# (category, positional-suffix) enumeration, then close the guard.
for name in sorted(use_mapping.keys()):
	if use_positions.get(name):
		continue
	print ("#undef %s" % name)
for name, positions in sorted(use_positions.items()):
	if not positions:
		continue
	for suffix in positions.keys():
		print ("#undef %s" % (name + suffix))
print ()
print ()
print ("#endif /* HB_OT_SHAPER_USE_TABLE_HH */")
print ("/* == End of generated table == */")