#
# (re)generate unicode property and type databases
#
# This script converts Unicode database files to Modules/unicodedata_db.h,
# Modules/unicodename_db.h, and Objects/unicodetype_db.h
#
# history:
# 2000-09-24 fl created (based on bits and pieces from unidb)
# 2000-09-25 fl merged tim's splitbin fixes, separate decomposition table
# 2000-09-25 fl added character type table
# 2000-09-26 fl added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
# 2000-11-03 fl expand first/last ranges
# 2001-01-19 fl added character name tables (2.1)
# 2001-01-21 fl added decomp compression; dynamic phrasebook threshold
# 2002-09-11 wd use string methods
# 2002-10-18 mvl update to Unicode 3.2
# 2002-10-22 mvl generate NFC tables
# 2002-11-24 mvl expand all ranges, sort names version-independently
# 2002-11-25 mvl add UNIDATA_VERSION
# 2004-05-29 perky add east asian width information
# 2006-03-10 mvl update to Unicode 4.1; add UCD 3.2 delta
# 2008-06-11 gb add PRINTABLE_MASK for Atsuo Ishimoto's ascii() patch
# 2011-10-21 ezio add support for name aliases and named sequences
# 2012-01 benjamin add full case mappings
#
# written by Fredrik Lundh (fredrik@pythonware.com)
#

import dataclasses
import os
import sys
import zipfile

from functools import partial
from textwrap import dedent
from typing import Iterator, List, Optional, Set, Tuple

SCRIPT = sys.argv[0]
VERSION = "3.3"

# The Unicode Database
# --------------------
# When changing the UCD version please update
#   * Doc/library/stdtypes.rst,
#   * Doc/library/unicodedata.rst, and
#   * Doc/reference/lexical_analysis.rst (two occurrences)
UNIDATA_VERSION = "15.0.0"
UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
UNIHAN = "Unihan%s.zip"
DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"
DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
LINE_BREAK = "LineBreak%s.txt"
NAME_ALIASES = "NameAliases%s.txt"
NAMED_SEQUENCES = "NamedSequences%s.txt"
SPECIAL_CASING = "SpecialCasing%s.txt"
CASE_FOLDING = "CaseFolding%s.txt"

# Private Use Areas -- in the BMP, and in planes 15 and 16
PUA_1 = range(0xE000, 0xF900)
PUA_15 = range(0xF0000, 0xFFFFE)
PUA_16 = range(0x100000, 0x10FFFE)

# we use these ranges of PUA_15 to store name aliases and named sequences
NAME_ALIASES_START = 0xF0000
NAMED_SEQUENCES_START = 0xF0200

old_versions = ["3.2.0"]

CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
    "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
    "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
    "So" ]

BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
    "ON", "LRI", "RLI", "FSI", "PDI" ]

# "N" needs to be the first entry, see the comment in makeunicodedata
EASTASIANWIDTH_NAMES = [ "N", "H", "W", "Na", "A", "F" ]

MANDATORY_LINE_BREAKS = [ "BK", "CR", "LF", "NL" ]

# note: should match definitions in Objects/unicodectype.c
ALPHA_MASK = 0x01
DECIMAL_MASK = 0x02
DIGIT_MASK = 0x04
LOWER_MASK = 0x08
LINEBREAK_MASK = 0x10
SPACE_MASK = 0x20
TITLE_MASK = 0x40
UPPER_MASK = 0x80
XID_START_MASK = 0x100
XID_CONTINUE_MASK = 0x200
PRINTABLE_MASK = 0x400
NUMERIC_MASK = 0x800
CASE_IGNORABLE_MASK = 0x1000
CASED_MASK = 0x2000
EXTENDED_CASE_MASK = 0x4000
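# As an illustration (not authoritative): a plain uppercase ASCII letter such
# as 'A' would end up with ALPHA_MASK, UPPER_MASK, XID_START_MASK,
# XID_CONTINUE_MASK, PRINTABLE_MASK and CASED_MASK set (0x2781) in the type
# record built by makeunicodetype() below.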

# these ranges need to match unicodedata.c:is_unified_ideograph
cjk_ranges = [
    ('3400', '4DBF'),
    ('4E00', '9FFF'),
    ('20000', '2A6DF'),
    ('2A700', '2B739'),
    ('2B740', '2B81D'),
    ('2B820', '2CEA1'),
    ('2CEB0', '2EBE0'),
    ('30000', '3134A'),
    ('31350', '323AF'),
]


def maketables(trace=0):

    print("--- Reading", UNICODE_DATA % "", "...")

    unicode = UnicodeData(UNIDATA_VERSION)

    print(len(list(filter(None, unicode.table))), "characters")

    for version in old_versions:
        print("--- Reading", UNICODE_DATA % ("-"+version), "...")
        old_unicode = UnicodeData(version, cjk_check=False)
        print(len(list(filter(None, old_unicode.table))), "characters")
        merge_old_version(version, unicode, old_unicode)

    makeunicodename(unicode, trace)
    makeunicodedata(unicode, trace)
    makeunicodetype(unicode, trace)


# --------------------------------------------------------------------
# unicode character properties

def makeunicodedata(unicode, trace):

    # The default value of east_asian_width is "N", for unassigned code points
    # not mentioned in EastAsianWidth.txt.  In addition, there are some
    # reserved but unassigned code points in CJK ranges that are classified as
    # "W", and code points in private use areas have a width of "A"; both of
    # these have entries in EastAsianWidth.txt.
    # See https://unicode.org/reports/tr11/#Unassigned
    assert EASTASIANWIDTH_NAMES[0] == "N"
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)

    FILE = "Modules/unicodedata_db.h"

    print("--- Preparing", FILE, "...")

    # 1) database properties

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = CATEGORY_NAMES.index(record.general_category)
            combining = int(record.canonical_combining_class)
            bidirectional = BIDIRECTIONAL_NAMES.index(record.bidi_class)
            mirrored = record.bidi_mirrored == "Y"
            eastasianwidth = EASTASIANWIDTH_NAMES.index(record.east_asian_width)
            normalizationquickcheck = record.quick_check
            item = (
                category, combining, bidirectional, mirrored, eastasianwidth,
                normalizationquickcheck
                )
        elif unicode.widths[char] is not None:
            # an unassigned but reserved character, with a known
            # east_asian_width
            eastasianwidth = EASTASIANWIDTH_NAMES.index(unicode.widths[char])
            item = (0, 0, 0, 0, eastasianwidth, 0)
        else:
            continue

        # add entry to index and item tables
        i = cache.get(item)
        if i is None:
            cache[item] = i = len(table)
            table.append(item)
        index[char] = i

    # 2) decomposition data

    decomp_data_cache = {}
    decomp_data = [0]
    decomp_prefix = [""]
    decomp_index = [0] * len(unicode.chars)
    decomp_size = 0

    comp_pairs = []
    comp_first = [None] * len(unicode.chars)
    comp_last = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            if record.decomposition_type:
                decomp = record.decomposition_type.split()
                if len(decomp) > 19:
                    raise Exception("character %x has a decomposition too large for nfd_nfkd" % char)
                # prefix
                if decomp[0][0] == "<":
                    prefix = decomp.pop(0)
                else:
                    prefix = ""
                try:
                    i = decomp_prefix.index(prefix)
                except ValueError:
                    i = len(decomp_prefix)
                    decomp_prefix.append(prefix)
                prefix = i
                assert prefix < 256
                # content
                decomp = [prefix + (len(decomp)<<8)] + [int(s, 16) for s in decomp]
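                # The entry just built packs the prefix index into the low
                # byte and the number of decomposed code points into the
                # higher bits.  For instance, a canonical two-character
                # decomposition such as "0041 0300" (no prefix) becomes
                # [2 << 8, 0x41, 0x300].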
                # Collect NFC pairs
                if not prefix and len(decomp) == 3 and \
                   char not in unicode.exclusions and \
                   unicode.table[decomp[1]].canonical_combining_class == "0":
                    p, l, r = decomp
                    comp_first[l] = 1
                    comp_last[r] = 1
                    comp_pairs.append((l,r,char))
                key = tuple(decomp)
                i = decomp_data_cache.get(key, -1)
                if i == -1:
                    i = len(decomp_data)
                    decomp_data.extend(decomp)
                    decomp_size = decomp_size + len(decomp) * 2
                    decomp_data_cache[key] = i
                else:
                    assert decomp_data[i:i+len(decomp)] == decomp
            else:
                i = 0
            decomp_index[char] = i

    f = l = 0
    comp_first_ranges = []
    comp_last_ranges = []
    prev_f = prev_l = None
    for i in unicode.chars:
        if comp_first[i] is not None:
            comp_first[i] = f
            f += 1
            if prev_f is None:
                prev_f = (i,i)
            elif prev_f[1]+1 == i:
                prev_f = prev_f[0],i
            else:
                comp_first_ranges.append(prev_f)
                prev_f = (i,i)
        if comp_last[i] is not None:
            comp_last[i] = l
            l += 1
            if prev_l is None:
                prev_l = (i,i)
            elif prev_l[1]+1 == i:
                prev_l = prev_l[0],i
            else:
                comp_last_ranges.append(prev_l)
                prev_l = (i,i)
    comp_first_ranges.append(prev_f)
    comp_last_ranges.append(prev_l)
    total_first = f
    total_last = l

    comp_data = [0]*(total_first*total_last)
    for f,l,char in comp_pairs:
        f = comp_first[f]
        l = comp_last[l]
        comp_data[f*total_last+l] = char
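    # comp_data is a total_first x total_last matrix, flattened row-major:
    # entry [f][l] holds the composed character for the (reindexed) first
    # character f and last character l, or 0 if the pair does not compose.
    # It is compressed with splitbins() below, like the other tables.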

    print(len(table), "unique properties")
    print(len(decomp_prefix), "unique decomposition prefixes")
    print(len(decomp_data), "unique decomposition entries:", end=' ')
    print(decomp_size, "bytes")
    print(total_first, "first characters in NFC")
    print(total_last, "last characters in NFC")
    print(len(comp_pairs), "NFC pairs")

    print("--- Writing", FILE, "...")

    with open(FILE, "w") as fp:
        fprint = partial(print, file=fp)

        fprint("/* this file was generated by %s %s */" % (SCRIPT, VERSION))
        fprint()
        fprint('#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION)
        fprint("/* a list of unique database records */")
        fprint("const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {")
        for item in table:
            fprint("    {%d, %d, %d, %d, %d, %d}," % item)
        fprint("};")
        fprint()

        fprint("/* Reindexing of NFC first characters. */")
        fprint("#define TOTAL_FIRST",total_first)
        fprint("#define TOTAL_LAST",total_last)
        fprint("struct reindex{int start;short count,index;};")
        fprint("static struct reindex nfc_first[] = {")
        for start,end in comp_first_ranges:
            fprint("    { %d, %d, %d}," % (start,end-start,comp_first[start]))
        fprint("    {0,0,0}")
        fprint("};\n")
        fprint("static struct reindex nfc_last[] = {")
        for start,end in comp_last_ranges:
            fprint("    { %d, %d, %d}," % (start,end-start,comp_last[start]))
        fprint("    {0,0,0}")
        fprint("};\n")

        # FIXME: <fl> the following tables could be made static, and
        # the support code moved into unicodedatabase.c

        fprint("/* string literals */")
        fprint("const char *_PyUnicode_CategoryNames[] = {")
        for name in CATEGORY_NAMES:
            fprint("    \"%s\"," % name)
        fprint("    NULL")
        fprint("};")

        fprint("const char *_PyUnicode_BidirectionalNames[] = {")
        for name in BIDIRECTIONAL_NAMES:
            fprint("    \"%s\"," % name)
        fprint("    NULL")
        fprint("};")

        fprint("const char *_PyUnicode_EastAsianWidthNames[] = {")
        for name in EASTASIANWIDTH_NAMES:
            fprint("    \"%s\"," % name)
        fprint("    NULL")
        fprint("};")

        fprint("static const char *decomp_prefix[] = {")
        for name in decomp_prefix:
            fprint("    \"%s\"," % name)
        fprint("    NULL")
        fprint("};")

        # split record index table
        index1, index2, shift = splitbins(index, trace)

        fprint("/* index tables for the database records */")
        fprint("#define SHIFT", shift)
        Array("index1", index1).dump(fp, trace)
        Array("index2", index2).dump(fp, trace)

        # split decomposition index table
        index1, index2, shift = splitbins(decomp_index, trace)

        fprint("/* decomposition data */")
        Array("decomp_data", decomp_data).dump(fp, trace)

        fprint("/* index tables for the decomposition data */")
        fprint("#define DECOMP_SHIFT", shift)
        Array("decomp_index1", index1).dump(fp, trace)
        Array("decomp_index2", index2).dump(fp, trace)

        index, index2, shift = splitbins(comp_data, trace)
        fprint("/* NFC pairs */")
        fprint("#define COMP_SHIFT", shift)
        Array("comp_index", index).dump(fp, trace)
        Array("comp_data", index2).dump(fp, trace)

        # Generate delta tables for old versions
        for version, table, normalization in unicode.changed:
            cversion = version.replace(".","_")
            records = [table[0]]
            cache = {table[0]:0}
            index = [0] * len(table)
            for i, record in enumerate(table):
                try:
                    index[i] = cache[record]
                except KeyError:
                    index[i] = cache[record] = len(records)
                    records.append(record)
            index1, index2, shift = splitbins(index, trace)
            fprint("static const change_record change_records_%s[] = {" % cversion)
            for record in records:
                fprint("    { %s }," % ", ".join(map(str,record)))
            fprint("};")
            Array("changes_%s_index" % cversion, index1).dump(fp, trace)
            Array("changes_%s_data" % cversion, index2).dump(fp, trace)
            fprint("static const change_record* get_change_%s(Py_UCS4 n)" % cversion)
            fprint("{")
            fprint("    int index;")
            fprint("    if (n >= 0x110000) index = 0;")
            fprint("    else {")
            fprint("        index = changes_%s_index[n>>%d];" % (cversion, shift))
            fprint("        index = changes_%s_data[(index<<%d)+(n & %d)];" % \
                   (cversion, shift, ((1<<shift)-1)))
            fprint("    }")
            fprint("    return change_records_%s+index;" % cversion)
            fprint("}\n")
            fprint("static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion)
            fprint("{")
            fprint("    switch(n) {")
            for k, v in normalization:
                fprint("    case %s: return 0x%s;" % (hex(k), v))
            fprint("    default: return 0;")
            fprint("    }\n}\n")


# --------------------------------------------------------------------
# unicode character type tables

def makeunicodetype(unicode, trace):

    FILE = "Objects/unicodetype_db.h"

    print("--- Preparing", FILE, "...")

    # extract unicode types
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {dummy: 0}
    index = [0] * len(unicode.chars)
    numeric = {}
    spaces = []
    linebreaks = []
    extra_casing = []

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = record.general_category
            bidirectional = record.bidi_class
            properties = record.binary_properties
            flags = 0
            if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
                flags |= ALPHA_MASK
            if "Lowercase" in properties:
                flags |= LOWER_MASK
            if 'Line_Break' in properties or bidirectional == "B":
                flags |= LINEBREAK_MASK
                linebreaks.append(char)
            if category == "Zs" or bidirectional in ("WS", "B", "S"):
                flags |= SPACE_MASK
                spaces.append(char)
            if category == "Lt":
                flags |= TITLE_MASK
            if "Uppercase" in properties:
                flags |= UPPER_MASK
            if char == ord(" ") or category[0] not in ("C", "Z"):
                flags |= PRINTABLE_MASK
            if "XID_Start" in properties:
                flags |= XID_START_MASK
            if "XID_Continue" in properties:
                flags |= XID_CONTINUE_MASK
            if "Cased" in properties:
                flags |= CASED_MASK
            if "Case_Ignorable" in properties:
                flags |= CASE_IGNORABLE_MASK
            sc = unicode.special_casing.get(char)
            cf = unicode.case_folding.get(char, [char])
            if record.simple_uppercase_mapping:
                upper = int(record.simple_uppercase_mapping, 16)
            else:
                upper = char
            if record.simple_lowercase_mapping:
                lower = int(record.simple_lowercase_mapping, 16)
            else:
                lower = char
            if record.simple_titlecase_mapping:
                title = int(record.simple_titlecase_mapping, 16)
            else:
                title = upper
            if sc is None and cf != [lower]:
                sc = ([lower], [title], [upper])
            if sc is None:
                if upper == lower == title:
                    upper = lower = title = 0
                else:
                    upper = upper - char
                    lower = lower - char
                    title = title - char
                    assert (abs(upper) <= 2147483647 and
                            abs(lower) <= 2147483647 and
                            abs(title) <= 2147483647)
            else:
                # This happens either when some character maps to more than
                # one character in uppercase, lowercase, or titlecase, or when
                # the casefolded version of the character differs from the
                # lowercase.  The extra characters are stored in a different
                # array.
                flags |= EXTENDED_CASE_MASK
                lower = len(extra_casing) | (len(sc[0]) << 24)
                extra_casing.extend(sc[0])
                if cf != sc[0]:
                    lower |= len(cf) << 20
                    extra_casing.extend(cf)
                upper = len(extra_casing) | (len(sc[2]) << 24)
                extra_casing.extend(sc[2])
                # Title is probably equal to upper.
                if sc[1] == sc[2]:
                    title = upper
                else:
                    title = len(extra_casing) | (len(sc[1]) << 24)
                    extra_casing.extend(sc[1])
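            # In the extended-case branch above, lower/upper/title are not
            # deltas but packed references into extra_casing: the low bits
            # hold the index of the first code point, bits 24-31 the length
            # of the mapping, and (for lower only) bits 20-23 the length of a
            # casefold mapping stored right after the lowercase mapping.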
            # decimal digit, integer digit
            decimal = 0
            if record.decomposition_mapping:
                flags |= DECIMAL_MASK
                decimal = int(record.decomposition_mapping)
            digit = 0
            if record.numeric_type:
                flags |= DIGIT_MASK
                digit = int(record.numeric_type)
            if record.numeric_value:
                flags |= NUMERIC_MASK
                numeric.setdefault(record.numeric_value, []).append(char)
            item = (
                upper, lower, title, decimal, digit, flags
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    print(len(table), "unique character type entries")
    print(sum(map(len, numeric.values())), "numeric code points")
    print(len(spaces), "whitespace code points")
    print(len(linebreaks), "linebreak code points")
    print(len(extra_casing), "extended case array")

    print("--- Writing", FILE, "...")

    with open(FILE, "w") as fp:
        fprint = partial(print, file=fp)

        fprint("/* this file was generated by %s %s */" % (SCRIPT, VERSION))
        fprint()
        fprint("/* a list of unique character type descriptors */")
        fprint("const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {")
        for item in table:
            fprint("    {%d, %d, %d, %d, %d, %d}," % item)
        fprint("};")
        fprint()

        fprint("/* extended case mappings */")
        fprint()
        fprint("const Py_UCS4 _PyUnicode_ExtendedCase[] = {")
        for c in extra_casing:
            fprint("    %d," % c)
        fprint("};")
        fprint()

        # split decomposition index table
        index1, index2, shift = splitbins(index, trace)

        fprint("/* type indexes */")
        fprint("#define SHIFT", shift)
        Array("index1", index1).dump(fp, trace)
        Array("index2", index2).dump(fp, trace)

        # Generate code for _PyUnicode_ToNumeric()
        numeric_items = sorted(numeric.items())
        fprint('/* Returns the numeric value as double for Unicode characters')
        fprint(' * having this property, -1.0 otherwise.')
        fprint(' */')
        fprint('double _PyUnicode_ToNumeric(Py_UCS4 ch)')
        fprint('{')
        fprint('    switch (ch) {')
        for value, codepoints in numeric_items:
            # Turn text into float literals
            parts = value.split('/')
            parts = [repr(float(part)) for part in parts]
            value = '/'.join(parts)

            codepoints.sort()
            for codepoint in codepoints:
                fprint('    case 0x%04X:' % (codepoint,))
            fprint('        return (double) %s;' % (value,))
        fprint('    }')
        fprint('    return -1.0;')
        fprint('}')
        fprint()

        # Generate code for _PyUnicode_IsWhitespace()
        fprint("/* Returns 1 for Unicode characters having the bidirectional")
        fprint(" * type 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise.")
        fprint(" */")
        fprint('int _PyUnicode_IsWhitespace(const Py_UCS4 ch)')
        fprint('{')
        fprint('    switch (ch) {')

        for codepoint in sorted(spaces):
            fprint('    case 0x%04X:' % (codepoint,))
            fprint('        return 1;')

        fprint('    }')
        fprint('    return 0;')
        fprint('}')
        fprint()

        # Generate code for _PyUnicode_IsLinebreak()
        fprint("/* Returns 1 for Unicode characters having the line break")
        fprint(" * property 'BK', 'CR', 'LF' or 'NL' or having bidirectional")
        fprint(" * type 'B', 0 otherwise.")
        fprint(" */")
        fprint('int _PyUnicode_IsLinebreak(const Py_UCS4 ch)')
        fprint('{')
        fprint('    switch (ch) {')
        for codepoint in sorted(linebreaks):
            fprint('    case 0x%04X:' % (codepoint,))
            fprint('        return 1;')

        fprint('    }')
        fprint('    return 0;')
        fprint('}')
        fprint()


# --------------------------------------------------------------------
# unicode name database

def makeunicodename(unicode, trace):

    FILE = "Modules/unicodename_db.h"

    print("--- Preparing", FILE, "...")

    # collect names
    names = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record.name.strip()
            if name and name[0] != "<":
                names[char] = name + chr(0)

    print(len([n for n in names if n is not None]), "distinct names")

    # collect unique words from names (note that we distinguish between
    # words inside a sentence and words ending a sentence; the latter
    # include the trailing null byte)

    words = {}
    n = b = 0
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            b = b + len(name)
            n = n + len(w)
            for w in w:
                l = words.get(w)
                if l:
                    l.append(None)
                else:
                    words[w] = [len(words)]

    print(n, "words in text;", b, "bytes")

    wordlist = list(words.items())

    # sort on falling frequency, then by name
    def word_key(a):
        aword, alist = a
        return -len(alist), aword
    wordlist.sort(key=word_key)

    # figure out how many phrasebook escapes we need
    escapes = 0
    while escapes * 256 < len(wordlist):
        escapes = escapes + 1
    print(escapes, "escapes")

    short = 256 - escapes

    assert short > 0

    print(short, "short indexes in lexicon")

    # statistics
    n = 0
    for i in range(short):
        n = n + len(wordlist[i][1])
    print(n, "short indexes in phrasebook")

    # pick the most commonly used words, and sort the rest on falling
    # length (to maximize overlap)

    wordlist, wordtail = wordlist[:short], wordlist[short:]
    wordtail.sort(key=lambda a: a[0], reverse=True)
    wordlist.extend(wordtail)

    # generate lexicon from words

    lexicon_offset = [0]
    lexicon = ""
    words = {}

    # build a lexicon string
    offset = 0
    for w, x in wordlist:
        # encoding: bit 7 indicates last character in word (chr(128)
        # indicates the last character in an entire string)
        ww = w[:-1] + chr(ord(w[-1])+128)
        # reuse string tails, when possible
        o = lexicon.find(ww)
        if o < 0:
            o = offset
            lexicon = lexicon + ww
            offset = offset + len(w)
        words[w] = len(lexicon_offset)
        lexicon_offset.append(o)

    lexicon = list(map(ord, lexicon))
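    # The lexicon is now a list of byte values.  For example, a word such as
    # "CAPITAL" is stored as the bytes of "CAPITA" followed by ord("L") + 128;
    # lexicon_offset[i] records where word i starts, and shorter words may
    # reuse a slice of a longer word already present in the string.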

    # generate phrasebook from names and lexicon
    phrasebook = [0]
    phrasebook_offset = [0] * len(unicode.chars)
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            phrasebook_offset[char] = len(phrasebook)
            for w in w:
                i = words[w]
                if i < short:
                    phrasebook.append(i)
                else:
                    # store as two bytes
                    phrasebook.append((i>>8) + short)
                    phrasebook.append(i&255)
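                        # The first escape byte, (i >> 8) + short, is always
                        # >= short, so it cannot be mistaken for a one-byte
                        # index.  For instance, if short were 253, index 700
                        # would be stored as the bytes 255, 188 (2*256 + 188).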

    assert getsize(phrasebook) == 1

    #
    # unicode name hash table

    # extract names
    data = []
    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record.name.strip()
            if name and name[0] != "<":
                data.append((name, char))

    # the magic number 47 was chosen to minimize the number of
    # collisions on the current data set. if you like, change it
    # and see what happens...

    codehash = Hash("code", data, 47)

    print("--- Writing", FILE, "...")

    with open(FILE, "w") as fp:
        fprint = partial(print, file=fp)

        fprint("/* this file was generated by %s %s */" % (SCRIPT, VERSION))
        fprint()
        fprint("#define NAME_MAXLEN", 256)
        fprint()
        fprint("/* lexicon */")
        Array("lexicon", lexicon).dump(fp, trace)
        Array("lexicon_offset", lexicon_offset).dump(fp, trace)

        # split decomposition index table
        offset1, offset2, shift = splitbins(phrasebook_offset, trace)

        fprint("/* code->name phrasebook */")
        fprint("#define phrasebook_shift", shift)
        fprint("#define phrasebook_short", short)

        Array("phrasebook", phrasebook).dump(fp, trace)
        Array("phrasebook_offset1", offset1).dump(fp, trace)
        Array("phrasebook_offset2", offset2).dump(fp, trace)

        fprint("/* name->code dictionary */")
        codehash.dump(fp, trace)

        fprint()
        fprint('static const unsigned int aliases_start = %#x;' %
               NAME_ALIASES_START)
        fprint('static const unsigned int aliases_end = %#x;' %
               (NAME_ALIASES_START + len(unicode.aliases)))

        fprint('static const unsigned int name_aliases[] = {')
        for name, codepoint in unicode.aliases:
            fprint('    0x%04X,' % codepoint)
        fprint('};')

        # In Unicode 6.0.0, the sequences contain at most 4 BMP chars,
        # so we are using Py_UCS2 seq[4]. This needs to be updated if longer
        # sequences or sequences with non-BMP chars are added.
        # unicodedata_lookup should be adapted too.
        fprint(dedent("""
            typedef struct NamedSequence {
                int seqlen;
                Py_UCS2 seq[4];
            } named_sequence;
            """))

        fprint('static const unsigned int named_sequences_start = %#x;' %
               NAMED_SEQUENCES_START)
        fprint('static const unsigned int named_sequences_end = %#x;' %
               (NAMED_SEQUENCES_START + len(unicode.named_sequences)))

        fprint('static const named_sequence named_sequences[] = {')
        for name, sequence in unicode.named_sequences:
            seq_str = ', '.join('0x%04X' % cp for cp in sequence)
            fprint('    {%d, {%s}},' % (len(sequence), seq_str))
        fprint('};')


def merge_old_version(version, new, old):
    # Changes to exclusion file not implemented yet
    if old.exclusions != new.exclusions:
        raise NotImplementedError("exclusions differ")

    # In these change records, 0xFF means "no change"
    bidir_changes = [0xFF]*0x110000
    category_changes = [0xFF]*0x110000
    decimal_changes = [0xFF]*0x110000
    mirrored_changes = [0xFF]*0x110000
    east_asian_width_changes = [0xFF]*0x110000
    # In numeric data, 0 means "no change",
    # -1 means "did not have a numeric value"
    numeric_changes = [0] * 0x110000
    # normalization_changes is a list of key-value pairs
    normalization_changes = []
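    # The k values below index dataclasses.fields(UcdRecord) in declaration
    # order: 1=name, 2=general_category, 4=bidi_class, 5=decomposition,
    # 6=decimal digit, 8=numeric value, 9=bidi_mirrored, 11=ISO comment,
    # 12-14=simple case mappings, 15=east_asian_width, 16=binary_properties,
    # 17=quick_check.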
    for i in range(0x110000):
        if new.table[i] is None:
            # Characters unassigned in the new version ought to
            # be unassigned in the old one
            assert old.table[i] is None
            continue
        # check characters unassigned in the old version
        if old.table[i] is None:
            # category 0 is "unassigned"
            category_changes[i] = 0
            continue
        # check characters that differ
        if old.table[i] != new.table[i]:
            for k, field in enumerate(dataclasses.fields(UcdRecord)):
                value = getattr(old.table[i], field.name)
                new_value = getattr(new.table[i], field.name)
                if value != new_value:
                    if k == 1 and i in PUA_15:
                        # the name is not set in the old.table, but in the
                        # new.table we are using it for aliases and named sequences
                        assert value == ''
                    elif k == 2:
                        category_changes[i] = CATEGORY_NAMES.index(value)
                    elif k == 4:
                        bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
                    elif k == 5:
                        # We assume that all normalization changes are 1:1 mappings
                        assert " " not in value
                        normalization_changes.append((i, value))
                    elif k == 6:
                        # we only support changes where the old value is a single digit
                        assert value in "0123456789"
                        decimal_changes[i] = int(value)
                    elif k == 8:
                        # Since 0 encodes "no change", the old value had better not be 0
                        if not value:
                            numeric_changes[i] = -1
                        else:
                            numeric_changes[i] = float(value)
                            assert numeric_changes[i] not in (0, -1)
                    elif k == 9:
                        if value == 'Y':
                            mirrored_changes[i] = '1'
                        else:
                            mirrored_changes[i] = '0'
                    elif k == 11:
                        # change to ISO comment, ignore
                        pass
                    elif k == 12:
                        # change to simple uppercase mapping; ignore
                        pass
                    elif k == 13:
                        # change to simple lowercase mapping; ignore
                        pass
                    elif k == 14:
                        # change to simple titlecase mapping; ignore
                        pass
                    elif k == 15:
                        # change to east asian width
                        east_asian_width_changes[i] = EASTASIANWIDTH_NAMES.index(value)
                    elif k == 16:
                        # derived property changes; not yet supported
                        pass
                    elif k == 17:
                        # normalization quickchecks are not performed
                        # for older versions
                        pass
                    else:
                        class Difference(Exception):pass
                        raise Difference(hex(i), k, old.table[i], new.table[i])
    new.changed.append((version, list(zip(bidir_changes, category_changes,
                                          decimal_changes, mirrored_changes,
                                          east_asian_width_changes,
                                          numeric_changes)),
                        normalization_changes))


DATA_DIR = os.path.join('Tools', 'unicode', 'data')

def open_data(template, version):
    local = os.path.join(DATA_DIR, template % ('-'+version,))
    if not os.path.exists(local):
        import urllib.request
        if version == '3.2.0':
            # irregular URL structure
            url = ('https://www.unicode.org/Public/3.2-Update/'+template) % ('-'+version,)
        else:
            url = ('https://www.unicode.org/Public/%s/ucd/'+template) % (version, '')
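        # e.g. for UNICODE_DATA and version "15.0.0" this yields
        # https://www.unicode.org/Public/15.0.0/ucd/UnicodeData.txt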
        os.makedirs(DATA_DIR, exist_ok=True)
        urllib.request.urlretrieve(url, filename=local)
    if local.endswith('.txt'):
        return open(local, encoding='utf-8')
    else:
        # Unihan.zip
        return open(local, 'rb')


def expand_range(char_range: str) -> Iterator[int]:
    '''
    Parses ranges of code points, as described in UAX #44:
    https://www.unicode.org/reports/tr44/#Code_Point_Ranges
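
    For example, the field "0600..0605" expands to the six code points
    0x600 through 0x605, while a plain "00AA" yields just 0xAA.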
    '''
    if '..' in char_range:
        first, last = [int(c, 16) for c in char_range.split('..')]
    else:
        first = last = int(char_range, 16)
    for char in range(first, last+1):
        yield char


class UcdFile:
    '''
    A file in the standard format of the UCD.

    See: https://www.unicode.org/reports/tr44/#Format_Conventions

    Note that, as described there, the Unihan data files have their
    own separate format.
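
    records() strips comments and whitespace and splits each line on ';',
    so a line such as "3000;F  # IDEOGRAPHIC SPACE" yields the record
    ['3000', 'F'].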
    '''

    def __init__(self, template: str, version: str) -> None:
        self.template = template
        self.version = version

    def records(self) -> Iterator[List[str]]:
        with open_data(self.template, self.version) as file:
            for line in file:
                line = line.split('#', 1)[0].strip()
                if not line:
                    continue
                yield [field.strip() for field in line.split(';')]

    def __iter__(self) -> Iterator[List[str]]:
        return self.records()

    def expanded(self) -> Iterator[Tuple[int, List[str]]]:
        for record in self.records():
            char_range, rest = record[0], record[1:]
            for char in expand_range(char_range):
                yield char, rest


@dataclasses.dataclass
class UcdRecord:
    # 15 fields from UnicodeData.txt.  See:
    #   https://www.unicode.org/reports/tr44/#UnicodeData.txt
    codepoint: str
    name: str
    general_category: str
    canonical_combining_class: str
    bidi_class: str
    decomposition_type: str
    decomposition_mapping: str
    numeric_type: str
    numeric_value: str
    bidi_mirrored: str
    unicode_1_name: str  # obsolete
    iso_comment: str  # obsolete
    simple_uppercase_mapping: str
    simple_lowercase_mapping: str
    simple_titlecase_mapping: str
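    # Note: the names above follow the column order of UnicodeData.txt, so
    # decomposition_type actually holds the complete decomposition field
    # (tag plus mapping), while decomposition_mapping, numeric_type and
    # numeric_value hold the decimal-digit, digit and numeric columns.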

    # https://www.unicode.org/reports/tr44/#EastAsianWidth.txt
    east_asian_width: Optional[str]

    # Binary properties, as a set of those that are true.
    # Taken from multiple files:
    #   https://www.unicode.org/reports/tr44/#DerivedCoreProperties.txt
    #   https://www.unicode.org/reports/tr44/#LineBreak.txt
    binary_properties: Set[str]

    # The Quick_Check properties related to normalization:
    #   https://www.unicode.org/reports/tr44/#Decompositions_and_Normalization
    # We store them as a bitmask.
    quick_check: int


def from_row(row: List[str]) -> UcdRecord:
    return UcdRecord(*row, None, set(), 0)


# --------------------------------------------------------------------
# the following support code is taken from the unidb utilities
# Copyright (c) 1999-2000 by Secret Labs AB

# load a unicode-data file from disk

class UnicodeData:
    # table: List[Optional[UcdRecord]]  # index is codepoint; None means unassigned

    def __init__(self, version, cjk_check=True):
        self.changed = []
        table = [None] * 0x110000
        for s in UcdFile(UNICODE_DATA, version):
            char = int(s[0], 16)
            table[char] = from_row(s)

        cjk_ranges_found = []

        # expand first-last ranges
        field = None
        for i in range(0, 0x110000):
            # The file UnicodeData.txt has its own distinct way of
            # expressing ranges.  See:
            #   https://www.unicode.org/reports/tr44/#Code_Point_Ranges
            s = table[i]
            if s:
                if s.name[-6:] == "First>":
                    s.name = ""
                    field = dataclasses.astuple(s)[:15]
                elif s.name[-5:] == "Last>":
                    if s.name.startswith("<CJK Ideograph"):
                        cjk_ranges_found.append((field[0],
                                                 s.codepoint))
                    s.name = ""
                    field = None
            elif field:
                table[i] = from_row(('%X' % i,) + field[1:])
        if cjk_check and cjk_ranges != cjk_ranges_found:
            raise ValueError("CJK ranges deviate: have %r" % cjk_ranges_found)

        # public attributes
        self.filename = UNICODE_DATA % ''
        self.table = table
        self.chars = list(range(0x110000))  # unicode 3.2

        # check for name aliases and named sequences, see #12753
        # aliases and named sequences are not in 3.2.0
        if version != '3.2.0':
            self.aliases = []
            # store aliases in the Private Use Area 15, in range U+F0000..U+F00FF,
            # in order to take advantage of the compression and lookup
            # algorithms used for the other characters
            pua_index = NAME_ALIASES_START
            for char, name, abbrev in UcdFile(NAME_ALIASES, version):
                char = int(char, 16)
                self.aliases.append((name, char))
                # also store the name in PUA 15
                self.table[pua_index].name = name
                pua_index += 1
            assert pua_index - NAME_ALIASES_START == len(self.aliases)

            self.named_sequences = []
            # store named sequences in PUA 15, in range U+F0100..,
            # in order to take advantage of the compression and lookup
            # algorithms used for the other characters.

            assert pua_index < NAMED_SEQUENCES_START
            pua_index = NAMED_SEQUENCES_START
            for name, chars in UcdFile(NAMED_SEQUENCES, version):
                chars = tuple(int(char, 16) for char in chars.split())
                # check that the structure defined in makeunicodename is OK
                assert 2 <= len(chars) <= 4, "change the Py_UCS2 array size"
                assert all(c <= 0xFFFF for c in chars), ("use Py_UCS4 in "
                    "the NamedSequence struct and in unicodedata_lookup")
                self.named_sequences.append((name, chars))
                # also store these in PUA 15
                self.table[pua_index].name = name
                pua_index += 1
            assert pua_index - NAMED_SEQUENCES_START == len(self.named_sequences)

        self.exclusions = {}
        for char, in UcdFile(COMPOSITION_EXCLUSIONS, version):
            char = int(char, 16)
            self.exclusions[char] = 1

        widths = [None] * 0x110000
        for char, (width,) in UcdFile(EASTASIAN_WIDTH, version).expanded():
            widths[char] = width

        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].east_asian_width = widths[i]
        self.widths = widths

        for char, (p,) in UcdFile(DERIVED_CORE_PROPERTIES, version).expanded():
            if table[char]:
                # Some properties (e.g. Default_Ignorable_Code_Point)
                # apply to unassigned code points; ignore them
                table[char].binary_properties.add(p)

        for char_range, value in UcdFile(LINE_BREAK, version):
            if value not in MANDATORY_LINE_BREAKS:
                continue
            for char in expand_range(char_range):
                table[char].binary_properties.add('Line_Break')

        # We only want the quickcheck properties.
        # Format: NF?_QC; Y(es)/N(o)/M(aybe)
        # Yes is the default, hence only N and M occur.
        # In 3.2.0, the format was different (NF?_NO).
        # The parsing below will incorrectly treat those as "yes"; however,
        # unicodedata.c does not perform quickchecks for older versions,
        # and no delta records will be created.
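        # Each property gets two bits in the bitmask, in qc_order order
        # (NFD_QC in bits 0-1 ... NFKC_QC in bits 6-7), with 0 meaning
        # Yes, 1 Maybe and 2 No.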
        quickchecks = [0] * 0x110000
        qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
        for s in UcdFile(DERIVEDNORMALIZATION_PROPS, version):
            if len(s) < 2 or s[1] not in qc_order:
                continue
            quickcheck = 'MN'.index(s[2]) + 1  # Maybe or No
            quickcheck_shift = qc_order.index(s[1])*2
            quickcheck <<= quickcheck_shift
            for char in expand_range(s[0]):
                assert not (quickchecks[char]>>quickcheck_shift)&3
                quickchecks[char] |= quickcheck
        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].quick_check = quickchecks[i]

        with open_data(UNIHAN, version) as file:
            zip = zipfile.ZipFile(file)
            if version == '3.2.0':
                data = zip.open('Unihan-3.2.0.txt').read()
            else:
                data = zip.open('Unihan_NumericValues.txt').read()
        for line in data.decode("utf-8").splitlines():
            if not line.startswith('U+'):
                continue
            code, tag, value = line.split(None, 3)[:3]
            if tag not in ('kAccountingNumeric', 'kPrimaryNumeric',
                           'kOtherNumeric'):
                continue
            value = value.strip().replace(',', '')
            i = int(code[2:], 16)
            # Patch the numeric field
            if table[i] is not None:
                table[i].numeric_value = value

        sc = self.special_casing = {}
        for data in UcdFile(SPECIAL_CASING, version):
            if data[4]:
                # We ignore all conditionals (since they depend on
                # languages) except for one, which is hardcoded. See
                # handle_capital_sigma in unicodeobject.c.
                continue
            c = int(data[0], 16)
            lower = [int(char, 16) for char in data[1].split()]
            title = [int(char, 16) for char in data[2].split()]
            upper = [int(char, 16) for char in data[3].split()]
            sc[c] = (lower, title, upper)

        cf = self.case_folding = {}
        if version != '3.2.0':
            for data in UcdFile(CASE_FOLDING, version):
                if data[1] in "CF":
                    c = int(data[0], 16)
                    cf[c] = [int(char, 16) for char in data[2].split()]

    def uselatin1(self):
        # restrict character range to ISO Latin 1
        self.chars = list(range(256))


# hash table tools

# this is a straightforward reimplementation of Python's built-in
# dictionary type, using a static data structure and a custom string
# hash algorithm.
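# myhash below works on the uppercased key and keeps intermediate values
# within 24 bits: whenever bits 24-31 become non-zero, the top byte is
# XORed back into the low bits before masking.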

def myhash(s, magic):
    h = 0
    for c in map(ord, s.upper()):
        h = (h * magic) + c
        ix = h & 0xff000000
        if ix:
            h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff
    return h


SIZES = [
    (4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
    (1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
    (65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
    (2097152,5), (4194304,3), (8388608,33), (16777216,27)
]


class Hash:
    def __init__(self, name, data, magic):
        # turn a (key, value) list into a static hash table structure

        # determine table size
        for size, poly in SIZES:
            if size > len(data):
                poly = size + poly
                break
        else:
            raise AssertionError("ran out of polynomials")

        print(size, "slots in hash table")

        table = [None] * size

        mask = size-1

        n = 0

        hash = myhash

        # initialize hash table
        for key, value in data:
            h = hash(key, magic)
            i = (~h) & mask
            v = table[i]
            if v is None:
                table[i] = value
                continue
            incr = (h ^ (h >> 3)) & mask
            if not incr:
                incr = mask
            while 1:
                n = n + 1
                i = (i + incr) & mask
                v = table[i]
                if v is None:
                    table[i] = value
                    break
                incr = incr << 1
                if incr > mask:
                    incr = incr ^ poly

        print(n, "collisions")
        self.collisions = n

        for i in range(len(table)):
            if table[i] is None:
                table[i] = 0

        self.data = Array(name + "_hash", table)
        self.magic = magic
        self.name = name
        self.size = size
        self.poly = poly

    def dump(self, file, trace):
        # write data to file, as a C array
        self.data.dump(file, trace)
        file.write("#define %s_magic %d\n" % (self.name, self.magic))
        file.write("#define %s_size %d\n" % (self.name, self.size))
        file.write("#define %s_poly %d\n" % (self.name, self.poly))


# stuff to deal with arrays of unsigned integers

class Array:

    def __init__(self, name, data):
        self.name = name
        self.data = data

    def dump(self, file, trace=0):
        # write data to file, as a C array
        size = getsize(self.data)
        if trace:
            print(self.name+":", size*len(self.data), "bytes", file=sys.stderr)
        file.write("static const ")
        if size == 1:
            file.write("unsigned char")
        elif size == 2:
            file.write("unsigned short")
        else:
            file.write("unsigned int")
        file.write(" " + self.name + "[] = {\n")
        if self.data:
            s = "    "
            for item in self.data:
                i = str(item) + ", "
                if len(s) + len(i) > 78:
                    file.write(s.rstrip() + "\n")
                    s = "    " + i
                else:
                    s = s + i
            if s.strip():
                file.write(s.rstrip() + "\n")
        file.write("};\n\n")


def getsize(data):
    # return smallest possible integer size for the given array
    maxdata = max(data)
    if maxdata < 256:
        return 1
    elif maxdata < 65536:
        return 2
    else:
        return 4


def splitbins(t, trace=0):
    """t, trace=0 -> (t1, t2, shift).  Split a table to save space.

    t is a sequence of ints.  This function can be useful to save space if
    many of the ints are the same.  t1 and t2 are lists of ints, and shift
    is an int, chosen to minimize the combined size of t1 and t2 (in C
    code), and where for each i in range(len(t)),
        t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    where mask is a bitmask isolating the last "shift" bits.

    If optional arg trace is non-zero (default zero), progress info
    is printed to sys.stderr.  The higher the value, the more info
    you'll get.
    """

    if trace:
        def dump(t1, t2, shift, bytes):
            print("%d+%d bins at shift %d; %d bytes" % (
                len(t1), len(t2), shift, bytes), file=sys.stderr)
        print("Size of original table:", len(t)*getsize(t), "bytes",
              file=sys.stderr)
    n = len(t)-1    # last valid index
    maxshift = 0    # the most we can shift n and still have something left
    if n > 0:
        while n >> 1:
            n >>= 1
            maxshift += 1
    del n
    bytes = sys.maxsize  # smallest total size so far
    t = tuple(t)    # so slices can be dict keys
    for shift in range(maxshift + 1):
        t1 = []
        t2 = []
        size = 2**shift
        bincache = {}
        for i in range(0, len(t), size):
            bin = t[i:i+size]
            index = bincache.get(bin)
            if index is None:
                index = len(t2)
                bincache[bin] = index
                t2.extend(bin)
            t1.append(index >> shift)
        # determine memory size
        b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
        if trace > 1:
            dump(t1, t2, shift, b)
        if b < bytes:
            best = t1, t2, shift
            bytes = b
    t1, t2, shift = best
    if trace:
        print("Best:", end=' ', file=sys.stderr)
        dump(t1, t2, shift, bytes)
    if __debug__:
        # exhaustively verify that the decomposition is correct
        mask = ~((~0) << shift)  # i.e., low-bit mask of shift bits
        for i in range(len(t)):
            assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    return best

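
# A minimal, illustrative round trip (not used by the build): reconstruct the
# original table from the two-level tables, exactly as the generated C code
# does with its *_SHIFT macros.
def _splitbins_demo():
    t = [7] * 100 + [1, 2, 3] + [7] * 100
    t1, t2, shift = splitbins(t)
    mask = (1 << shift) - 1
    assert [t2[(t1[i >> shift] << shift) + (i & mask)]
            for i in range(len(t))] == t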

if __name__ == "__main__":
    maketables(1)