1 #
2 # (re)generate unicode property and type databases
3 #
4 # This script converts Unicode database files to Modules/unicodedata_db.h,
5 # Modules/unicodename_db.h, and Objects/unicodetype_db.h
6 #
7 # history:
8 # 2000-09-24 fl created (based on bits and pieces from unidb)
9 # 2000-09-25 fl merged tim's splitbin fixes, separate decomposition table
10 # 2000-09-25 fl added character type table
11 # 2000-09-26 fl added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
12 # 2000-11-03 fl expand first/last ranges
13 # 2001-01-19 fl added character name tables (2.1)
14 # 2001-01-21 fl added decomp compression; dynamic phrasebook threshold
15 # 2002-09-11 wd use string methods
16 # 2002-10-18 mvl update to Unicode 3.2
17 # 2002-10-22 mvl generate NFC tables
18 # 2002-11-24 mvl expand all ranges, sort names version-independently
19 # 2002-11-25 mvl add UNIDATA_VERSION
20 # 2004-05-29 perky add east asian width information
21 # 2006-03-10 mvl update to Unicode 4.1; add UCD 3.2 delta
22 # 2008-06-11 gb add PRINTABLE_MASK for Atsuo Ishimoto's ascii() patch
23 # 2011-10-21 ezio add support for name aliases and named sequences
24 # 2012-01 benjamin add full case mappings
25 #
26 # written by Fredrik Lundh (fredrik@pythonware.com)
27 #
28
29 import dataclasses
30 import os
31 import sys
32 import zipfile
33
34 from functools import partial
35 from textwrap import dedent
36 from typing import Iterator, List, Optional, Set, Tuple
37
38 SCRIPT = sys.argv[0]
39 VERSION = "3.3"
40
41 # The Unicode Database
42 # --------------------
43 # When changing UCD version please update
# * Doc/library/stdtypes.rst,
# * Doc/library/unicodedata.rst, and
46 # * Doc/reference/lexical_analysis.rst (two occurrences)
47 UNIDATA_VERSION = "14.0.0"
48 UNICODE_DATA = "UnicodeData%s.txt"
49 COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
50 EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
51 UNIHAN = "Unihan%s.zip"
52 DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"
53 DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
54 LINE_BREAK = "LineBreak%s.txt"
55 NAME_ALIASES = "NameAliases%s.txt"
56 NAMED_SEQUENCES = "NamedSequences%s.txt"
57 SPECIAL_CASING = "SpecialCasing%s.txt"
58 CASE_FOLDING = "CaseFolding%s.txt"
59
60 # Private Use Areas -- in planes 1, 15, 16
61 PUA_1 = range(0xE000, 0xF900)
62 PUA_15 = range(0xF0000, 0xFFFFE)
63 PUA_16 = range(0x100000, 0x10FFFE)
64
# we use these ranges of PUA_15 to store name aliases and named sequences
66 NAME_ALIASES_START = 0xF0000
67 NAMED_SEQUENCES_START = 0xF0200
68
69 old_versions = ["3.2.0"]
70
71 CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
72 "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
73 "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
74 "So" ]
75
76 BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
77 "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
78 "ON", "LRI", "RLI", "FSI", "PDI" ]
79
80 EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]
81
82 MANDATORY_LINE_BREAKS = [ "BK", "CR", "LF", "NL" ]
83
84 # note: should match definitions in Objects/unicodectype.c
85 ALPHA_MASK = 0x01
86 DECIMAL_MASK = 0x02
87 DIGIT_MASK = 0x04
88 LOWER_MASK = 0x08
89 LINEBREAK_MASK = 0x10
90 SPACE_MASK = 0x20
91 TITLE_MASK = 0x40
92 UPPER_MASK = 0x80
93 XID_START_MASK = 0x100
94 XID_CONTINUE_MASK = 0x200
95 PRINTABLE_MASK = 0x400
96 NUMERIC_MASK = 0x800
97 CASE_IGNORABLE_MASK = 0x1000
98 CASED_MASK = 0x2000
99 EXTENDED_CASE_MASK = 0x4000
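# a character's type-record flags field is the bitwise OR of the masks
# that apply to it (see makeunicodetype() below); for example,
# ALPHA_MASK | LOWER_MASK | CASED_MASK == 0x2009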
100
101 # these ranges need to match unicodedata.c:is_unified_ideograph
102 cjk_ranges = [
103 ('3400', '4DBF'),
104 ('4E00', '9FFF'),
105 ('20000', '2A6DF'),
106 ('2A700', '2B738'),
107 ('2B740', '2B81D'),
108 ('2B820', '2CEA1'),
109 ('2CEB0', '2EBE0'),
110 ('30000', '3134A'),
111 ]
112
113
114 def maketables(trace=0):
115
116 print("--- Reading", UNICODE_DATA % "", "...")
117
118 unicode = UnicodeData(UNIDATA_VERSION)
119
120 print(len(list(filter(None, unicode.table))), "characters")
121
122 for version in old_versions:
123 print("--- Reading", UNICODE_DATA % ("-"+version), "...")
124 old_unicode = UnicodeData(version, cjk_check=False)
125 print(len(list(filter(None, old_unicode.table))), "characters")
126 merge_old_version(version, unicode, old_unicode)
127
128 makeunicodename(unicode, trace)
129 makeunicodedata(unicode, trace)
130 makeunicodetype(unicode, trace)
131
132
133 # --------------------------------------------------------------------
134 # unicode character properties
135
136 def makeunicodedata(unicode, trace):
137
138 dummy = (0, 0, 0, 0, 0, 0)
139 table = [dummy]
140 cache = {0: dummy}
141 index = [0] * len(unicode.chars)
142
143 FILE = "Modules/unicodedata_db.h"
144
145 print("--- Preparing", FILE, "...")
146
147 # 1) database properties
148
149 for char in unicode.chars:
150 record = unicode.table[char]
151 if record:
152 # extract database properties
153 category = CATEGORY_NAMES.index(record.general_category)
154 combining = int(record.canonical_combining_class)
155 bidirectional = BIDIRECTIONAL_NAMES.index(record.bidi_class)
156 mirrored = record.bidi_mirrored == "Y"
157 eastasianwidth = EASTASIANWIDTH_NAMES.index(record.east_asian_width)
158 normalizationquickcheck = record.quick_check
159 item = (
160 category, combining, bidirectional, mirrored, eastasianwidth,
161 normalizationquickcheck
162 )
163 # add entry to index and item tables
164 i = cache.get(item)
165 if i is None:
166 cache[item] = i = len(table)
167 table.append(item)
168 index[char] = i
169
170 # 2) decomposition data
171
172 decomp_data = [0]
173 decomp_prefix = [""]
174 decomp_index = [0] * len(unicode.chars)
175 decomp_size = 0
176
177 comp_pairs = []
178 comp_first = [None] * len(unicode.chars)
179 comp_last = [None] * len(unicode.chars)
180
181 for char in unicode.chars:
182 record = unicode.table[char]
183 if record:
184 if record.decomposition_type:
185 decomp = record.decomposition_type.split()
186 if len(decomp) > 19:
187 raise Exception("character %x has a decomposition too large for nfd_nfkd" % char)
188 # prefix
189 if decomp[0][0] == "<":
190 prefix = decomp.pop(0)
191 else:
192 prefix = ""
193 try:
194 i = decomp_prefix.index(prefix)
195 except ValueError:
196 i = len(decomp_prefix)
197 decomp_prefix.append(prefix)
198 prefix = i
199 assert prefix < 256
200 # content
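# the first word packs the prefix index into its low byte and the
# number of code points into the bits above it; the code points of the
# decomposition follow.  This is the layout stored in decomp_data.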
201 decomp = [prefix + (len(decomp)<<8)] + [int(s, 16) for s in decomp]
202 # Collect NFC pairs
203 if not prefix and len(decomp) == 3 and \
204 char not in unicode.exclusions and \
205 unicode.table[decomp[1]].canonical_combining_class == "0":
206 p, l, r = decomp
207 comp_first[l] = 1
208 comp_last[r] = 1
209 comp_pairs.append((l,r,char))
210 try:
211 i = decomp_data.index(decomp)
212 except ValueError:
213 i = len(decomp_data)
214 decomp_data.extend(decomp)
215 decomp_size = decomp_size + len(decomp) * 2
216 else:
217 i = 0
218 decomp_index[char] = i
219
220 f = l = 0
221 comp_first_ranges = []
222 comp_last_ranges = []
223 prev_f = prev_l = None
224 for i in unicode.chars:
225 if comp_first[i] is not None:
226 comp_first[i] = f
227 f += 1
228 if prev_f is None:
229 prev_f = (i,i)
230 elif prev_f[1]+1 == i:
231 prev_f = prev_f[0],i
232 else:
233 comp_first_ranges.append(prev_f)
234 prev_f = (i,i)
235 if comp_last[i] is not None:
236 comp_last[i] = l
237 l += 1
238 if prev_l is None:
239 prev_l = (i,i)
240 elif prev_l[1]+1 == i:
241 prev_l = prev_l[0],i
242 else:
243 comp_last_ranges.append(prev_l)
244 prev_l = (i,i)
245 comp_first_ranges.append(prev_f)
246 comp_last_ranges.append(prev_l)
247 total_first = f
248 total_last = l
249
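# comp_data is a total_first x total_last matrix, flattened row-major
# and indexed by the re-numbered first/last characters; an entry holds
# the composed character for that pair, or 0 if the pair does not compose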
250 comp_data = [0]*(total_first*total_last)
251 for f,l,char in comp_pairs:
252 f = comp_first[f]
253 l = comp_last[l]
254 comp_data[f*total_last+l] = char
255
256 print(len(table), "unique properties")
257 print(len(decomp_prefix), "unique decomposition prefixes")
258 print(len(decomp_data), "unique decomposition entries:", end=' ')
259 print(decomp_size, "bytes")
260 print(total_first, "first characters in NFC")
261 print(total_last, "last characters in NFC")
262 print(len(comp_pairs), "NFC pairs")
263
264 print("--- Writing", FILE, "...")
265
266 with open(FILE, "w") as fp:
267 fprint = partial(print, file=fp)
268
269 fprint("/* this file was generated by %s %s */" % (SCRIPT, VERSION))
270 fprint()
271 fprint('#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION)
272 fprint("/* a list of unique database records */")
273 fprint("const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {")
274 for item in table:
275 fprint(" {%d, %d, %d, %d, %d, %d}," % item)
276 fprint("};")
277 fprint()
278
279 fprint("/* Reindexing of NFC first characters. */")
280 fprint("#define TOTAL_FIRST",total_first)
281 fprint("#define TOTAL_LAST",total_last)
282 fprint("struct reindex{int start;short count,index;};")
283 fprint("static struct reindex nfc_first[] = {")
284 for start,end in comp_first_ranges:
285 fprint(" { %d, %d, %d}," % (start,end-start,comp_first[start]))
286 fprint(" {0,0,0}")
287 fprint("};\n")
288 fprint("static struct reindex nfc_last[] = {")
289 for start,end in comp_last_ranges:
290 fprint(" { %d, %d, %d}," % (start,end-start,comp_last[start]))
291 fprint(" {0,0,0}")
292 fprint("};\n")
293
294 # FIXME: <fl> the following tables could be made static, and
295 # the support code moved into unicodedatabase.c
296
297 fprint("/* string literals */")
298 fprint("const char *_PyUnicode_CategoryNames[] = {")
299 for name in CATEGORY_NAMES:
300 fprint(" \"%s\"," % name)
301 fprint(" NULL")
302 fprint("};")
303
304 fprint("const char *_PyUnicode_BidirectionalNames[] = {")
305 for name in BIDIRECTIONAL_NAMES:
306 fprint(" \"%s\"," % name)
307 fprint(" NULL")
308 fprint("};")
309
310 fprint("const char *_PyUnicode_EastAsianWidthNames[] = {")
311 for name in EASTASIANWIDTH_NAMES:
312 fprint(" \"%s\"," % name)
313 fprint(" NULL")
314 fprint("};")
315
316 fprint("static const char *decomp_prefix[] = {")
317 for name in decomp_prefix:
318 fprint(" \"%s\"," % name)
319 fprint(" NULL")
320 fprint("};")
321
322 # split record index table
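# the two-level tables produced by splitbins() are meant to be read back
# as index2[(index1[c >> SHIFT] << SHIFT) + (c & ((1 << SHIFT) - 1))],
# mirroring the invariant documented in splitbins() below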
323 index1, index2, shift = splitbins(index, trace)
324
325 fprint("/* index tables for the database records */")
326 fprint("#define SHIFT", shift)
327 Array("index1", index1).dump(fp, trace)
328 Array("index2", index2).dump(fp, trace)
329
330 # split decomposition index table
331 index1, index2, shift = splitbins(decomp_index, trace)
332
333 fprint("/* decomposition data */")
334 Array("decomp_data", decomp_data).dump(fp, trace)
335
336 fprint("/* index tables for the decomposition data */")
337 fprint("#define DECOMP_SHIFT", shift)
338 Array("decomp_index1", index1).dump(fp, trace)
339 Array("decomp_index2", index2).dump(fp, trace)
340
341 index, index2, shift = splitbins(comp_data, trace)
342 fprint("/* NFC pairs */")
343 fprint("#define COMP_SHIFT", shift)
344 Array("comp_index", index).dump(fp, trace)
345 Array("comp_data", index2).dump(fp, trace)
346
347 # Generate delta tables for old versions
348 for version, table, normalization in unicode.changed:
349 cversion = version.replace(".","_")
350 records = [table[0]]
351 cache = {table[0]:0}
352 index = [0] * len(table)
353 for i, record in enumerate(table):
354 try:
355 index[i] = cache[record]
356 except KeyError:
357 index[i] = cache[record] = len(records)
358 records.append(record)
359 index1, index2, shift = splitbins(index, trace)
360 fprint("static const change_record change_records_%s[] = {" % cversion)
361 for record in records:
362 fprint(" { %s }," % ", ".join(map(str,record)))
363 fprint("};")
364 Array("changes_%s_index" % cversion, index1).dump(fp, trace)
365 Array("changes_%s_data" % cversion, index2).dump(fp, trace)
366 fprint("static const change_record* get_change_%s(Py_UCS4 n)" % cversion)
367 fprint("{")
368 fprint(" int index;")
369 fprint(" if (n >= 0x110000) index = 0;")
370 fprint(" else {")
371 fprint(" index = changes_%s_index[n>>%d];" % (cversion, shift))
372 fprint(" index = changes_%s_data[(index<<%d)+(n & %d)];" % \
373 (cversion, shift, ((1<<shift)-1)))
374 fprint(" }")
375 fprint(" return change_records_%s+index;" % cversion)
376 fprint("}\n")
377 fprint("static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion)
378 fprint("{")
379 fprint(" switch(n) {")
380 for k, v in normalization:
381 fprint(" case %s: return 0x%s;" % (hex(k), v))
382 fprint(" default: return 0;")
383 fprint(" }\n}\n")
384
385
386 # --------------------------------------------------------------------
387 # unicode character type tables
388
389 def makeunicodetype(unicode, trace):
390
391 FILE = "Objects/unicodetype_db.h"
392
393 print("--- Preparing", FILE, "...")
394
395 # extract unicode types
396 dummy = (0, 0, 0, 0, 0, 0)
397 table = [dummy]
398 cache = {0: dummy}
399 index = [0] * len(unicode.chars)
400 numeric = {}
401 spaces = []
402 linebreaks = []
403 extra_casing = []
404
405 for char in unicode.chars:
406 record = unicode.table[char]
407 if record:
408 # extract database properties
409 category = record.general_category
410 bidirectional = record.bidi_class
411 properties = record.binary_properties
412 flags = 0
413 if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
414 flags |= ALPHA_MASK
415 if "Lowercase" in properties:
416 flags |= LOWER_MASK
417 if 'Line_Break' in properties or bidirectional == "B":
418 flags |= LINEBREAK_MASK
419 linebreaks.append(char)
420 if category == "Zs" or bidirectional in ("WS", "B", "S"):
421 flags |= SPACE_MASK
422 spaces.append(char)
423 if category == "Lt":
424 flags |= TITLE_MASK
425 if "Uppercase" in properties:
426 flags |= UPPER_MASK
427 if char == ord(" ") or category[0] not in ("C", "Z"):
428 flags |= PRINTABLE_MASK
429 if "XID_Start" in properties:
430 flags |= XID_START_MASK
431 if "XID_Continue" in properties:
432 flags |= XID_CONTINUE_MASK
433 if "Cased" in properties:
434 flags |= CASED_MASK
435 if "Case_Ignorable" in properties:
436 flags |= CASE_IGNORABLE_MASK
437 sc = unicode.special_casing.get(char)
438 cf = unicode.case_folding.get(char, [char])
439 if record.simple_uppercase_mapping:
440 upper = int(record.simple_uppercase_mapping, 16)
441 else:
442 upper = char
443 if record.simple_lowercase_mapping:
444 lower = int(record.simple_lowercase_mapping, 16)
445 else:
446 lower = char
447 if record.simple_titlecase_mapping:
448 title = int(record.simple_titlecase_mapping, 16)
449 else:
450 title = upper
451 if sc is None and cf != [lower]:
452 sc = ([lower], [title], [upper])
453 if sc is None:
454 if upper == lower == title:
455 upper = lower = title = 0
456 else:
457 upper = upper - char
458 lower = lower - char
459 title = title - char
460 assert (abs(upper) <= 2147483647 and
461 abs(lower) <= 2147483647 and
462 abs(title) <= 2147483647)
463 else:
464 # This happens either when some character maps to more than one
465 # character in uppercase, lowercase, or titlecase or the
466 # casefolded version of the character is different from the
467 # lowercase. The extra characters are stored in a different
468 # array.
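# Layout of the packed values below: the low bits hold the starting
# index into the extra_casing array, bits 24 and up hold the length of
# the mapping, and (for the lowercase entry only) bits 20-23 hold the
# length of the case folding when it differs from the lowercase mapping.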
469 flags |= EXTENDED_CASE_MASK
470 lower = len(extra_casing) | (len(sc[0]) << 24)
471 extra_casing.extend(sc[0])
472 if cf != sc[0]:
473 lower |= len(cf) << 20
474 extra_casing.extend(cf)
475 upper = len(extra_casing) | (len(sc[2]) << 24)
476 extra_casing.extend(sc[2])
477 # Title is probably equal to upper.
478 if sc[1] == sc[2]:
479 title = upper
480 else:
481 title = len(extra_casing) | (len(sc[1]) << 24)
482 extra_casing.extend(sc[1])
483 # decimal digit, integer digit
484 decimal = 0
485 if record.decomposition_mapping:
486 flags |= DECIMAL_MASK
487 decimal = int(record.decomposition_mapping)
488 digit = 0
489 if record.numeric_type:
490 flags |= DIGIT_MASK
491 digit = int(record.numeric_type)
492 if record.numeric_value:
493 flags |= NUMERIC_MASK
494 numeric.setdefault(record.numeric_value, []).append(char)
495 item = (
496 upper, lower, title, decimal, digit, flags
497 )
498 # add entry to index and item tables
499 i = cache.get(item)
500 if i is None:
501 cache[item] = i = len(table)
502 table.append(item)
503 index[char] = i
504
505 print(len(table), "unique character type entries")
506 print(sum(map(len, numeric.values())), "numeric code points")
507 print(len(spaces), "whitespace code points")
508 print(len(linebreaks), "linebreak code points")
509 print(len(extra_casing), "extended case array")
510
511 print("--- Writing", FILE, "...")
512
513 with open(FILE, "w") as fp:
514 fprint = partial(print, file=fp)
515
516 fprint("/* this file was generated by %s %s */" % (SCRIPT, VERSION))
517 fprint()
518 fprint("/* a list of unique character type descriptors */")
519 fprint("const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {")
520 for item in table:
521 fprint(" {%d, %d, %d, %d, %d, %d}," % item)
522 fprint("};")
523 fprint()
524
525 fprint("/* extended case mappings */")
526 fprint()
527 fprint("const Py_UCS4 _PyUnicode_ExtendedCase[] = {")
528 for c in extra_casing:
529 fprint(" %d," % c)
530 fprint("};")
531 fprint()
532
533 # split decomposition index table
534 index1, index2, shift = splitbins(index, trace)
535
536 fprint("/* type indexes */")
537 fprint("#define SHIFT", shift)
538 Array("index1", index1).dump(fp, trace)
539 Array("index2", index2).dump(fp, trace)
540
541 # Generate code for _PyUnicode_ToNumeric()
542 numeric_items = sorted(numeric.items())
543 fprint('/* Returns the numeric value as double for Unicode characters')
544 fprint(' * having this property, -1.0 otherwise.')
545 fprint(' */')
546 fprint('double _PyUnicode_ToNumeric(Py_UCS4 ch)')
547 fprint('{')
548 fprint(' switch (ch) {')
549 for value, codepoints in numeric_items:
550 # Turn text into float literals
551 parts = value.split('/')
552 parts = [repr(float(part)) for part in parts]
553 value = '/'.join(parts)
554
555 codepoints.sort()
556 for codepoint in codepoints:
557 fprint(' case 0x%04X:' % (codepoint,))
558 fprint(' return (double) %s;' % (value,))
559 fprint(' }')
560 fprint(' return -1.0;')
561 fprint('}')
562 fprint()
563
564 # Generate code for _PyUnicode_IsWhitespace()
565 fprint("/* Returns 1 for Unicode characters having the bidirectional")
566 fprint(" * type 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise.")
567 fprint(" */")
568 fprint('int _PyUnicode_IsWhitespace(const Py_UCS4 ch)')
569 fprint('{')
570 fprint(' switch (ch) {')
571
572 for codepoint in sorted(spaces):
573 fprint(' case 0x%04X:' % (codepoint,))
574 fprint(' return 1;')
575
576 fprint(' }')
577 fprint(' return 0;')
578 fprint('}')
579 fprint()
580
581 # Generate code for _PyUnicode_IsLinebreak()
582 fprint("/* Returns 1 for Unicode characters having the line break")
583 fprint(" * property 'BK', 'CR', 'LF' or 'NL' or having bidirectional")
584 fprint(" * type 'B', 0 otherwise.")
585 fprint(" */")
586 fprint('int _PyUnicode_IsLinebreak(const Py_UCS4 ch)')
587 fprint('{')
588 fprint(' switch (ch) {')
589 for codepoint in sorted(linebreaks):
590 fprint(' case 0x%04X:' % (codepoint,))
591 fprint(' return 1;')
592
593 fprint(' }')
594 fprint(' return 0;')
595 fprint('}')
596 fprint()
597
598
599 # --------------------------------------------------------------------
600 # unicode name database
601
602 def makeunicodename(unicode, trace):
603
604 FILE = "Modules/unicodename_db.h"
605
606 print("--- Preparing", FILE, "...")
607
608 # collect names
609 names = [None] * len(unicode.chars)
610
611 for char in unicode.chars:
612 record = unicode.table[char]
613 if record:
614 name = record.name.strip()
615 if name and name[0] != "<":
616 names[char] = name + chr(0)
617
618 print(len([n for n in names if n is not None]), "distinct names")
619
# collect unique words from names (note that we distinguish between
# words inside a sentence and words ending a sentence; the
# latter include the trailing null byte)
623
624 words = {}
625 n = b = 0
626 for char in unicode.chars:
627 name = names[char]
628 if name:
629 w = name.split()
630 b = b + len(name)
631 n = n + len(w)
632 for w in w:
633 l = words.get(w)
634 if l:
635 l.append(None)
636 else:
637 words[w] = [len(words)]
638
639 print(n, "words in text;", b, "bytes")
640
641 wordlist = list(words.items())
642
643 # sort on falling frequency, then by name
644 def word_key(a):
645 aword, alist = a
646 return -len(alist), aword
647 wordlist.sort(key=word_key)
648
649 # figure out how many phrasebook escapes we need
650 escapes = 0
651 while escapes * 256 < len(wordlist):
652 escapes = escapes + 1
653 print(escapes, "escapes")
654
655 short = 256 - escapes
656
657 assert short > 0
658
659 print(short, "short indexes in lexicon")
660
661 # statistics
662 n = 0
663 for i in range(short):
664 n = n + len(wordlist[i][1])
665 print(n, "short indexes in phrasebook")
666
667 # pick the most commonly used words, and sort the rest on falling
668 # length (to maximize overlap)
669
670 wordlist, wordtail = wordlist[:short], wordlist[short:]
671 wordtail.sort(key=lambda a: a[0], reverse=True)
672 wordlist.extend(wordtail)
673
674 # generate lexicon from words
675
676 lexicon_offset = [0]
677 lexicon = ""
678 words = {}
679
680 # build a lexicon string
681 offset = 0
682 for w, x in wordlist:
683 # encoding: bit 7 indicates last character in word (chr(128)
684 # indicates the last character in an entire string)
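# e.g. the word "LETTER" is stored as "LETTE" followed by
# chr(ord("R") + 128)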
685 ww = w[:-1] + chr(ord(w[-1])+128)
686 # reuse string tails, when possible
687 o = lexicon.find(ww)
688 if o < 0:
689 o = offset
690 lexicon = lexicon + ww
691 offset = offset + len(w)
692 words[w] = len(lexicon_offset)
693 lexicon_offset.append(o)
694
695 lexicon = list(map(ord, lexicon))
696
697 # generate phrasebook from names and lexicon
698 phrasebook = [0]
699 phrasebook_offset = [0] * len(unicode.chars)
700 for char in unicode.chars:
701 name = names[char]
702 if name:
703 w = name.split()
704 phrasebook_offset[char] = len(phrasebook)
705 for w in w:
706 i = words[w]
707 if i < short:
708 phrasebook.append(i)
709 else:
710 # store as two bytes
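# the first byte is offset by 'short' so that it cannot collide with a
# one-byte index; e.g. if short were 190, index 1000 would be emitted
# as (1000 >> 8) + 190 == 193 followed by 1000 & 255 == 232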
711 phrasebook.append((i>>8) + short)
712 phrasebook.append(i&255)
713
714 assert getsize(phrasebook) == 1
715
716 #
717 # unicode name hash table
718
719 # extract names
720 data = []
721 for char in unicode.chars:
722 record = unicode.table[char]
723 if record:
724 name = record.name.strip()
725 if name and name[0] != "<":
726 data.append((name, char))
727
728 # the magic number 47 was chosen to minimize the number of
729 # collisions on the current data set. if you like, change it
730 # and see what happens...
731
732 codehash = Hash("code", data, 47)
733
734 print("--- Writing", FILE, "...")
735
736 with open(FILE, "w") as fp:
737 fprint = partial(print, file=fp)
738
739 fprint("/* this file was generated by %s %s */" % (SCRIPT, VERSION))
740 fprint()
741 fprint("#define NAME_MAXLEN", 256)
742 fprint()
743 fprint("/* lexicon */")
744 Array("lexicon", lexicon).dump(fp, trace)
745 Array("lexicon_offset", lexicon_offset).dump(fp, trace)
746
747 # split decomposition index table
748 offset1, offset2, shift = splitbins(phrasebook_offset, trace)
749
750 fprint("/* code->name phrasebook */")
751 fprint("#define phrasebook_shift", shift)
752 fprint("#define phrasebook_short", short)
753
754 Array("phrasebook", phrasebook).dump(fp, trace)
755 Array("phrasebook_offset1", offset1).dump(fp, trace)
756 Array("phrasebook_offset2", offset2).dump(fp, trace)
757
758 fprint("/* name->code dictionary */")
759 codehash.dump(fp, trace)
760
761 fprint()
762 fprint('static const unsigned int aliases_start = %#x;' %
763 NAME_ALIASES_START)
764 fprint('static const unsigned int aliases_end = %#x;' %
765 (NAME_ALIASES_START + len(unicode.aliases)))
766
767 fprint('static const unsigned int name_aliases[] = {')
768 for name, codepoint in unicode.aliases:
769 fprint(' 0x%04X,' % codepoint)
770 fprint('};')
771
772 # In Unicode 6.0.0, the sequences contain at most 4 BMP chars,
773 # so we are using Py_UCS2 seq[4]. This needs to be updated if longer
774 # sequences or sequences with non-BMP chars are added.
775 # unicodedata_lookup should be adapted too.
776 fprint(dedent("""
777 typedef struct NamedSequence {
778 int seqlen;
779 Py_UCS2 seq[4];
780 } named_sequence;
781 """))
782
783 fprint('static const unsigned int named_sequences_start = %#x;' %
784 NAMED_SEQUENCES_START)
785 fprint('static const unsigned int named_sequences_end = %#x;' %
786 (NAMED_SEQUENCES_START + len(unicode.named_sequences)))
787
788 fprint('static const named_sequence named_sequences[] = {')
789 for name, sequence in unicode.named_sequences:
790 seq_str = ', '.join('0x%04X' % cp for cp in sequence)
791 fprint(' {%d, {%s}},' % (len(sequence), seq_str))
792 fprint('};')
793
794
795 def merge_old_version(version, new, old):
796 # Changes to exclusion file not implemented yet
797 if old.exclusions != new.exclusions:
798 raise NotImplementedError("exclusions differ")
799
800 # In these change records, 0xFF means "no change"
801 bidir_changes = [0xFF]*0x110000
802 category_changes = [0xFF]*0x110000
803 decimal_changes = [0xFF]*0x110000
804 mirrored_changes = [0xFF]*0x110000
805 east_asian_width_changes = [0xFF]*0x110000
# In numeric data, 0 means "no change",
# -1 means "did not have a numeric value"
808 numeric_changes = [0] * 0x110000
809 # normalization_changes is a list of key-value pairs
810 normalization_changes = []
811 for i in range(0x110000):
812 if new.table[i] is None:
813 # Characters unassigned in the new version ought to
814 # be unassigned in the old one
815 assert old.table[i] is None
816 continue
817 # check characters unassigned in the old version
818 if old.table[i] is None:
819 # category 0 is "unassigned"
820 category_changes[i] = 0
821 continue
822 # check characters that differ
823 if old.table[i] != new.table[i]:
824 for k, field in enumerate(dataclasses.fields(UcdRecord)):
825 value = getattr(old.table[i], field.name)
826 new_value = getattr(new.table[i], field.name)
827 if value != new_value:
828 if k == 1 and i in PUA_15:
829 # the name is not set in the old.table, but in the
830 # new.table we are using it for aliases and named seq
831 assert value == ''
832 elif k == 2:
833 category_changes[i] = CATEGORY_NAMES.index(value)
834 elif k == 4:
835 bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
836 elif k == 5:
# We assume that all normalization changes are 1:1 mappings
838 assert " " not in value
839 normalization_changes.append((i, value))
840 elif k == 6:
841 # we only support changes where the old value is a single digit
842 assert value in "0123456789"
843 decimal_changes[i] = int(value)
844 elif k == 8:
# Since 0 encodes "no change", the old value must not be 0
846 if not value:
847 numeric_changes[i] = -1
848 else:
849 numeric_changes[i] = float(value)
850 assert numeric_changes[i] not in (0, -1)
851 elif k == 9:
852 if value == 'Y':
853 mirrored_changes[i] = '1'
854 else:
855 mirrored_changes[i] = '0'
856 elif k == 11:
857 # change to ISO comment, ignore
858 pass
859 elif k == 12:
860 # change to simple uppercase mapping; ignore
861 pass
862 elif k == 13:
863 # change to simple lowercase mapping; ignore
864 pass
865 elif k == 14:
866 # change to simple titlecase mapping; ignore
867 pass
868 elif k == 15:
869 # change to east asian width
870 east_asian_width_changes[i] = EASTASIANWIDTH_NAMES.index(value)
871 elif k == 16:
872 # derived property changes; not yet
873 pass
874 elif k == 17:
875 # normalization quickchecks are not performed
876 # for older versions
877 pass
878 else:
class Difference(Exception): pass
880 raise Difference(hex(i), k, old.table[i], new.table[i])
881 new.changed.append((version, list(zip(bidir_changes, category_changes,
882 decimal_changes, mirrored_changes,
883 east_asian_width_changes,
884 numeric_changes)),
885 normalization_changes))
886
887
888 DATA_DIR = os.path.join('Tools', 'unicode', 'data')
889
890 def open_data(template, version):
891 local = os.path.join(DATA_DIR, template % ('-'+version,))
892 if not os.path.exists(local):
893 import urllib.request
894 if version == '3.2.0':
895 # irregular url structure
896 url = ('https://www.unicode.org/Public/3.2-Update/'+template) % ('-'+version,)
897 else:
898 url = ('https://www.unicode.org/Public/%s/ucd/'+template) % (version, '')
899 os.makedirs(DATA_DIR, exist_ok=True)
900 urllib.request.urlretrieve(url, filename=local)
901 if local.endswith('.txt'):
902 return open(local, encoding='utf-8')
903 else:
904 # Unihan.zip
905 return open(local, 'rb')
906
907
908 def expand_range(char_range: str) -> Iterator[int]:
909 '''
910 Parses ranges of code points, as described in UAX #44:
911 https://www.unicode.org/reports/tr44/#Code_Point_Ranges
912 '''
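# For instance, expand_range('0030..0032') yields 0x30, 0x31, 0x32,
# while expand_range('00E9') yields just 0xE9.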
913 if '..' in char_range:
914 first, last = [int(c, 16) for c in char_range.split('..')]
915 else:
916 first = last = int(char_range, 16)
917 for char in range(first, last+1):
918 yield char
919
920
class UcdFile:
922 '''
923 A file in the standard format of the UCD.
924
925 See: https://www.unicode.org/reports/tr44/#Format_Conventions
926
927 Note that, as described there, the Unihan data files have their
928 own separate format.
929 '''
930
931 def __init__(self, template: str, version: str) -> None:
932 self.template = template
933 self.version = version
934
935 def records(self) -> Iterator[List[str]]:
936 with open_data(self.template, self.version) as file:
937 for line in file:
938 line = line.split('#', 1)[0].strip()
939 if not line:
940 continue
941 yield [field.strip() for field in line.split(';')]
942
943 def __iter__(self) -> Iterator[List[str]]:
944 return self.records()
945
946 def expanded(self) -> Iterator[Tuple[int, List[str]]]:
947 for record in self.records():
948 char_range, rest = record[0], record[1:]
949 for char in expand_range(char_range):
950 yield char, rest
951
952
953 @dataclasses.dataclass
class UcdRecord:
# 15 fields from UnicodeData.txt. See:
956 # https://www.unicode.org/reports/tr44/#UnicodeData.txt
957 codepoint: str
958 name: str
959 general_category: str
960 canonical_combining_class: str
961 bidi_class: str
962 decomposition_type: str
963 decomposition_mapping: str
964 numeric_type: str
965 numeric_value: str
966 bidi_mirrored: str
967 unicode_1_name: str # obsolete
968 iso_comment: str # obsolete
969 simple_uppercase_mapping: str
970 simple_lowercase_mapping: str
971 simple_titlecase_mapping: str
972
973 # https://www.unicode.org/reports/tr44/#EastAsianWidth.txt
974 east_asian_width: Optional[str]
975
976 # Binary properties, as a set of those that are true.
977 # Taken from multiple files:
978 # https://www.unicode.org/reports/tr44/#DerivedCoreProperties.txt
979 # https://www.unicode.org/reports/tr44/#LineBreak.txt
980 binary_properties: Set[str]
981
982 # The Quick_Check properties related to normalization:
983 # https://www.unicode.org/reports/tr44/#Decompositions_and_Normalization
984 # We store them as a bitmask.
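# Layout (as filled in by UnicodeData.__init__): two bits per
# normalization form, in the order NFD, NFKD, NFC, NFKC, with
# 0 = Yes (the default), 1 = Maybe, 2 = No.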
985 quick_check: int
986
987
988 def from_row(row: List[str]) -> UcdRecord:
989 return UcdRecord(*row, None, set(), 0)
990
991
992 # --------------------------------------------------------------------
993 # the following support code is taken from the unidb utilities
994 # Copyright (c) 1999-2000 by Secret Labs AB
995
996 # load a unicode-data file from disk
997
class UnicodeData:
999 # table: List[Optional[UcdRecord]] # index is codepoint; None means unassigned
1000
1001 def __init__(self, version, cjk_check=True):
1002 self.changed = []
1003 table = [None] * 0x110000
1004 for s in UcdFile(UNICODE_DATA, version):
1005 char = int(s[0], 16)
1006 table[char] = from_row(s)
1007
1008 cjk_ranges_found = []
1009
1010 # expand first-last ranges
1011 field = None
1012 for i in range(0, 0x110000):
1013 # The file UnicodeData.txt has its own distinct way of
1014 # expressing ranges. See:
1015 # https://www.unicode.org/reports/tr44/#Code_Point_Ranges
1016 s = table[i]
1017 if s:
1018 if s.name[-6:] == "First>":
1019 s.name = ""
1020 field = dataclasses.astuple(s)[:15]
1021 elif s.name[-5:] == "Last>":
1022 if s.name.startswith("<CJK Ideograph"):
1023 cjk_ranges_found.append((field[0],
1024 s.codepoint))
1025 s.name = ""
1026 field = None
1027 elif field:
1028 table[i] = from_row(('%X' % i,) + field[1:])
1029 if cjk_check and cjk_ranges != cjk_ranges_found:
1030 raise ValueError("CJK ranges deviate: have %r" % cjk_ranges_found)
1031
1032 # public attributes
1033 self.filename = UNICODE_DATA % ''
1034 self.table = table
1035 self.chars = list(range(0x110000)) # unicode 3.2
1036
1037 # check for name aliases and named sequences, see #12753
1038 # aliases and named sequences are not in 3.2.0
1039 if version != '3.2.0':
1040 self.aliases = []
1041 # store aliases in the Private Use Area 15, in range U+F0000..U+F00FF,
1042 # in order to take advantage of the compression and lookup
1043 # algorithms used for the other characters
1044 pua_index = NAME_ALIASES_START
1045 for char, name, abbrev in UcdFile(NAME_ALIASES, version):
1046 char = int(char, 16)
1047 self.aliases.append((name, char))
# also store the name in PUA 15
1049 self.table[pua_index].name = name
1050 pua_index += 1
1051 assert pua_index - NAME_ALIASES_START == len(self.aliases)
1052
1053 self.named_sequences = []
# store named sequences in PUA 15, in range U+F0100..,
1055 # in order to take advantage of the compression and lookup
1056 # algorithms used for the other characters.
1057
1058 assert pua_index < NAMED_SEQUENCES_START
1059 pua_index = NAMED_SEQUENCES_START
1060 for name, chars in UcdFile(NAMED_SEQUENCES, version):
1061 chars = tuple(int(char, 16) for char in chars.split())
1062 # check that the structure defined in makeunicodename is OK
1063 assert 2 <= len(chars) <= 4, "change the Py_UCS2 array size"
1064 assert all(c <= 0xFFFF for c in chars), ("use Py_UCS4 in "
1065 "the NamedSequence struct and in unicodedata_lookup")
1066 self.named_sequences.append((name, chars))
# also store these in PUA 15
1068 self.table[pua_index].name = name
1069 pua_index += 1
1070 assert pua_index - NAMED_SEQUENCES_START == len(self.named_sequences)
1071
1072 self.exclusions = {}
1073 for char, in UcdFile(COMPOSITION_EXCLUSIONS, version):
1074 char = int(char, 16)
1075 self.exclusions[char] = 1
1076
1077 widths = [None] * 0x110000
1078 for char, (width,) in UcdFile(EASTASIAN_WIDTH, version).expanded():
1079 widths[char] = width
1080
1081 for i in range(0, 0x110000):
1082 if table[i] is not None:
1083 table[i].east_asian_width = widths[i]
1084
1085 for char, (p,) in UcdFile(DERIVED_CORE_PROPERTIES, version).expanded():
1086 if table[char]:
1087 # Some properties (e.g. Default_Ignorable_Code_Point)
1088 # apply to unassigned code points; ignore them
1089 table[char].binary_properties.add(p)
1090
1091 for char_range, value in UcdFile(LINE_BREAK, version):
1092 if value not in MANDATORY_LINE_BREAKS:
1093 continue
1094 for char in expand_range(char_range):
1095 table[char].binary_properties.add('Line_Break')
1096
1097 # We only want the quickcheck properties
1098 # Format: NF?_QC; Y(es)/N(o)/M(aybe)
1099 # Yes is the default, hence only N and M occur
1100 # In 3.2.0, the format was different (NF?_NO)
1101 # The parsing will incorrectly determine these as
1102 # "yes", however, unicodedata.c will not perform quickchecks
1103 # for older versions, and no delta records will be created.
1104 quickchecks = [0] * 0x110000
1105 qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
1106 for s in UcdFile(DERIVEDNORMALIZATION_PROPS, version):
1107 if len(s) < 2 or s[1] not in qc_order:
1108 continue
1109 quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
1110 quickcheck_shift = qc_order.index(s[1])*2
1111 quickcheck <<= quickcheck_shift
1112 for char in expand_range(s[0]):
1113 assert not (quickchecks[char]>>quickcheck_shift)&3
1114 quickchecks[char] |= quickcheck
1115 for i in range(0, 0x110000):
1116 if table[i] is not None:
1117 table[i].quick_check = quickchecks[i]
1118
1119 with open_data(UNIHAN, version) as file:
1120 zip = zipfile.ZipFile(file)
1121 if version == '3.2.0':
1122 data = zip.open('Unihan-3.2.0.txt').read()
1123 else:
1124 data = zip.open('Unihan_NumericValues.txt').read()
1125 for line in data.decode("utf-8").splitlines():
1126 if not line.startswith('U+'):
1127 continue
1128 code, tag, value = line.split(None, 3)[:3]
1129 if tag not in ('kAccountingNumeric', 'kPrimaryNumeric',
1130 'kOtherNumeric'):
1131 continue
1132 value = value.strip().replace(',', '')
1133 i = int(code[2:], 16)
1134 # Patch the numeric field
1135 if table[i] is not None:
1136 table[i].numeric_value = value
1137
1138 sc = self.special_casing = {}
1139 for data in UcdFile(SPECIAL_CASING, version):
1140 if data[4]:
1141 # We ignore all conditionals (since they depend on
1142 # languages) except for one, which is hardcoded. See
1143 # handle_capital_sigma in unicodeobject.c.
1144 continue
1145 c = int(data[0], 16)
1146 lower = [int(char, 16) for char in data[1].split()]
1147 title = [int(char, 16) for char in data[2].split()]
1148 upper = [int(char, 16) for char in data[3].split()]
1149 sc[c] = (lower, title, upper)
1150
1151 cf = self.case_folding = {}
1152 if version != '3.2.0':
1153 for data in UcdFile(CASE_FOLDING, version):
1154 if data[1] in "CF":
1155 c = int(data[0], 16)
1156 cf[c] = [int(char, 16) for char in data[2].split()]
1157
1158 def uselatin1(self):
1159 # restrict character range to ISO Latin 1
1160 self.chars = list(range(256))
1161
1162
1163 # hash table tools
1164
1165 # this is a straight-forward reimplementation of Python's built-in
1166 # dictionary type, using a static data structure, and a custom string
1167 # hash algorithm.
1168
1169 def myhash(s, magic):
1170 h = 0
1171 for c in map(ord, s.upper()):
1172 h = (h * magic) + c
1173 ix = h & 0xff000000
1174 if ix:
1175 h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff
1176 return h
1177
1178
1179 SIZES = [
1180 (4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
1181 (1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
1182 (65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
1183 (2097152,5), (4194304,3), (8388608,33), (16777216,27)
1184 ]
1185
1186
class Hash:
1188 def __init__(self, name, data, magic):
1189 # turn a (key, value) list into a static hash table structure
1190
1191 # determine table size
1192 for size, poly in SIZES:
1193 if size > len(data):
1194 poly = size + poly
1195 break
1196 else:
1197 raise AssertionError("ran out of polynomials")
1198
1199 print(size, "slots in hash table")
1200
1201 table = [None] * size
1202
1203 mask = size-1
1204
1205 n = 0
1206
1207 hash = myhash
1208
1209 # initialize hash table
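# collisions are resolved by open addressing: the probe increment is
# derived from the hash and, on each further collision, stepped through
# a Galois LFSR using 'poly' (the polynomials in SIZES are presumably
# chosen so that every slot is eventually visited)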
1210 for key, value in data:
1211 h = hash(key, magic)
1212 i = (~h) & mask
1213 v = table[i]
1214 if v is None:
1215 table[i] = value
1216 continue
1217 incr = (h ^ (h >> 3)) & mask
1218 if not incr:
1219 incr = mask
1220 while 1:
1221 n = n + 1
1222 i = (i + incr) & mask
1223 v = table[i]
1224 if v is None:
1225 table[i] = value
1226 break
1227 incr = incr << 1
1228 if incr > mask:
1229 incr = incr ^ poly
1230
1231 print(n, "collisions")
1232 self.collisions = n
1233
1234 for i in range(len(table)):
1235 if table[i] is None:
1236 table[i] = 0
1237
1238 self.data = Array(name + "_hash", table)
1239 self.magic = magic
1240 self.name = name
1241 self.size = size
1242 self.poly = poly
1243
1244 def dump(self, file, trace):
1245 # write data to file, as a C array
1246 self.data.dump(file, trace)
1247 file.write("#define %s_magic %d\n" % (self.name, self.magic))
1248 file.write("#define %s_size %d\n" % (self.name, self.size))
1249 file.write("#define %s_poly %d\n" % (self.name, self.poly))
1250
1251
1252 # stuff to deal with arrays of unsigned integers
1253
class Array:
1255
1256 def __init__(self, name, data):
1257 self.name = name
1258 self.data = data
1259
1260 def dump(self, file, trace=0):
1261 # write data to file, as a C array
1262 size = getsize(self.data)
1263 if trace:
1264 print(self.name+":", size*len(self.data), "bytes", file=sys.stderr)
1265 file.write("static const ")
1266 if size == 1:
1267 file.write("unsigned char")
1268 elif size == 2:
1269 file.write("unsigned short")
1270 else:
1271 file.write("unsigned int")
1272 file.write(" " + self.name + "[] = {\n")
1273 if self.data:
1274 s = " "
1275 for item in self.data:
1276 i = str(item) + ", "
1277 if len(s) + len(i) > 78:
1278 file.write(s.rstrip() + "\n")
1279 s = " " + i
1280 else:
1281 s = s + i
1282 if s.strip():
1283 file.write(s.rstrip() + "\n")
1284 file.write("};\n\n")
1285
1286
1287 def getsize(data):
1288 # return smallest possible integer size for the given array
1289 maxdata = max(data)
1290 if maxdata < 256:
1291 return 1
1292 elif maxdata < 65536:
1293 return 2
1294 else:
1295 return 4
1296
1297
1298 def splitbins(t, trace=0):
1299 """t, trace=0 -> (t1, t2, shift). Split a table to save space.
1300
1301 t is a sequence of ints. This function can be useful to save space if
1302 many of the ints are the same. t1 and t2 are lists of ints, and shift
1303 is an int, chosen to minimize the combined size of t1 and t2 (in C
1304 code), and where for each i in range(len(t)),
1305 t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
1306 where mask is a bitmask isolating the last "shift" bits.
1307
1308 If optional arg trace is non-zero (default zero), progress info
1309 is printed to sys.stderr. The higher the value, the more info
1310 you'll get.
1311 """
1312
1313 if trace:
1314 def dump(t1, t2, shift, bytes):
1315 print("%d+%d bins at shift %d; %d bytes" % (
1316 len(t1), len(t2), shift, bytes), file=sys.stderr)
1317 print("Size of original table:", len(t)*getsize(t), "bytes",
1318 file=sys.stderr)
1319 n = len(t)-1 # last valid index
1320 maxshift = 0 # the most we can shift n and still have something left
1321 if n > 0:
1322 while n >> 1:
1323 n >>= 1
1324 maxshift += 1
1325 del n
1326 bytes = sys.maxsize # smallest total size so far
1327 t = tuple(t) # so slices can be dict keys
1328 for shift in range(maxshift + 1):
1329 t1 = []
1330 t2 = []
1331 size = 2**shift
1332 bincache = {}
1333 for i in range(0, len(t), size):
1334 bin = t[i:i+size]
1335 index = bincache.get(bin)
1336 if index is None:
1337 index = len(t2)
1338 bincache[bin] = index
1339 t2.extend(bin)
1340 t1.append(index >> shift)
1341 # determine memory size
1342 b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
1343 if trace > 1:
1344 dump(t1, t2, shift, b)
1345 if b < bytes:
1346 best = t1, t2, shift
1347 bytes = b
1348 t1, t2, shift = best
1349 if trace:
1350 print("Best:", end=' ', file=sys.stderr)
1351 dump(t1, t2, shift, bytes)
1352 if __debug__:
1353 # exhaustively verify that the decomposition is correct
1354 mask = ~((~0) << shift) # i.e., low-bit mask of shift bits
1355 for i in range(len(t)):
1356 assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
1357 return best
1358
1359
1360 if __name__ == "__main__":
1361 maketables(1)