1 #
2 # (re)generate unicode property and type databases
3 #
4 # This script converts Unicode database files to Modules/unicodedata_db.h,
5 # Modules/unicodename_db.h, and Objects/unicodetype_db.h
6 #
7 # history:
8 # 2000-09-24 fl created (based on bits and pieces from unidb)
9 # 2000-09-25 fl merged tim's splitbin fixes, separate decomposition table
10 # 2000-09-25 fl added character type table
11 # 2000-09-26 fl added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
12 # 2000-11-03 fl expand first/last ranges
13 # 2001-01-19 fl added character name tables (2.1)
14 # 2001-01-21 fl added decomp compression; dynamic phrasebook threshold
15 # 2002-09-11 wd use string methods
16 # 2002-10-18 mvl update to Unicode 3.2
17 # 2002-10-22 mvl generate NFC tables
18 # 2002-11-24 mvl expand all ranges, sort names version-independently
19 # 2002-11-25 mvl add UNIDATA_VERSION
20 # 2004-05-29 perky add east asian width information
21 # 2006-03-10 mvl update to Unicode 4.1; add UCD 3.2 delta
22 # 2008-06-11 gb add PRINTABLE_MASK for Atsuo Ishimoto's ascii() patch
23 # 2011-10-21 ezio add support for name aliases and named sequences
24 # 2012-01 benjamin add full case mappings
25 #
26 # written by Fredrik Lundh (fredrik@pythonware.com)
27 #
28
29 import dataclasses
30 import os
31 import sys
32 import zipfile
33
34 from functools import partial
35 from textwrap import dedent
36 from typing import Iterator, List, Optional, Set, Tuple
37
38 SCRIPT = sys.argv[0]
39 VERSION = "3.3"
40
41 # The Unicode Database
42 # --------------------
43 # When changing UCD version please update
# * Doc/library/stdtypes.rst,
# * Doc/library/unicodedata.rst, and
46 # * Doc/reference/lexical_analysis.rst (two occurrences)
47 UNIDATA_VERSION = "14.0.0"
48 UNICODE_DATA = "UnicodeData%s.txt"
49 COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
50 EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
51 UNIHAN = "Unihan%s.zip"
52 DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"
53 DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
54 LINE_BREAK = "LineBreak%s.txt"
55 NAME_ALIASES = "NameAliases%s.txt"
56 NAMED_SEQUENCES = "NamedSequences%s.txt"
57 SPECIAL_CASING = "SpecialCasing%s.txt"
58 CASE_FOLDING = "CaseFolding%s.txt"
59
60 # Private Use Areas -- in planes 1, 15, 16
61 PUA_1 = range(0xE000, 0xF900)
62 PUA_15 = range(0xF0000, 0xFFFFE)
63 PUA_16 = range(0x100000, 0x10FFFE)
64
# we use these ranges of PUA_15 to store name aliases and named sequences
66 NAME_ALIASES_START = 0xF0000
67 NAMED_SEQUENCES_START = 0xF0200
68
69 old_versions = ["3.2.0"]
70
71 CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
72 "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
73 "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
74 "So" ]
75
76 BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
77 "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
78 "ON", "LRI", "RLI", "FSI", "PDI" ]
79
80 EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]
81
82 MANDATORY_LINE_BREAKS = [ "BK", "CR", "LF", "NL" ]
83
84 # note: should match definitions in Objects/unicodectype.c
85 ALPHA_MASK = 0x01
86 DECIMAL_MASK = 0x02
87 DIGIT_MASK = 0x04
88 LOWER_MASK = 0x08
89 LINEBREAK_MASK = 0x10
90 SPACE_MASK = 0x20
91 TITLE_MASK = 0x40
92 UPPER_MASK = 0x80
93 XID_START_MASK = 0x100
94 XID_CONTINUE_MASK = 0x200
95 PRINTABLE_MASK = 0x400
96 NUMERIC_MASK = 0x800
97 CASE_IGNORABLE_MASK = 0x1000
98 CASED_MASK = 0x2000
99 EXTENDED_CASE_MASK = 0x4000
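# a character's type-record flags field is the bitwise OR of the masks
# that apply to it (see makeunicodetype() below); for example,
# ALPHA_MASK | LOWER_MASK | CASED_MASK == 0x2009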
100
101 # these ranges need to match unicodedata.c:is_unified_ideograph
102 cjk_ranges = [
103 ('3400', '4DBF'),
104 ('4E00', '9FFF'),
105 ('20000', '2A6DF'),
106 ('2A700', '2B738'),
107 ('2B740', '2B81D'),
108 ('2B820', '2CEA1'),
109 ('2CEB0', '2EBE0'),
110 ('30000', '3134A'),
111 ]
112
113
114 def maketables(trace=0):
115
116 print("--- Reading", UNICODE_DATA % "", "...")
117
118 unicode = UnicodeData(UNIDATA_VERSION)
119
120 print(len(list(filter(None, unicode.table))), "characters")
121
122 for version in old_versions:
123 print("--- Reading", UNICODE_DATA % ("-"+version), "...")
124 old_unicode = UnicodeData(version, cjk_check=False)
125 print(len(list(filter(None, old_unicode.table))), "characters")
126 merge_old_version(version, unicode, old_unicode)
127
128 makeunicodename(unicode, trace)
129 makeunicodedata(unicode, trace)
130 makeunicodetype(unicode, trace)
131
132
133 # --------------------------------------------------------------------
134 # unicode character properties
135
136 def makeunicodedata(unicode, trace):
137
138 dummy = (0, 0, 0, 0, 0, 0)
139 table = [dummy]
140 cache = {0: dummy}
141 index = [0] * len(unicode.chars)
142
143 FILE = "Modules/unicodedata_db.h"
144
145 print("--- Preparing", FILE, "...")
146
147 # 1) database properties
148
149 for char in unicode.chars:
150 record = unicode.table[char]
151 if record:
152 # extract database properties
153 category = CATEGORY_NAMES.index(record.general_category)
154 combining = int(record.canonical_combining_class)
155 bidirectional = BIDIRECTIONAL_NAMES.index(record.bidi_class)
156 mirrored = record.bidi_mirrored == "Y"
157 eastasianwidth = EASTASIANWIDTH_NAMES.index(record.east_asian_width)
158 normalizationquickcheck = record.quick_check
159 item = (
160 category, combining, bidirectional, mirrored, eastasianwidth,
161 normalizationquickcheck
162 )
163 # add entry to index and item tables
164 i = cache.get(item)
165 if i is None:
166 cache[item] = i = len(table)
167 table.append(item)
168 index[char] = i
169
170 # 2) decomposition data
171
172 decomp_data = [0]
173 decomp_prefix = [""]
174 decomp_index = [0] * len(unicode.chars)
175 decomp_size = 0
176
177 comp_pairs = []
178 comp_first = [None] * len(unicode.chars)
179 comp_last = [None] * len(unicode.chars)
180
181 for char in unicode.chars:
182 record = unicode.table[char]
183 if record:
184 if record.decomposition_type:
185 decomp = record.decomposition_type.split()
186 if len(decomp) > 19:
187 raise Exception("character %x has a decomposition too large for nfd_nfkd" % char)
188 # prefix
189 if decomp[0][0] == "<":
190 prefix = decomp.pop(0)
191 else:
192 prefix = ""
193 try:
194 i = decomp_prefix.index(prefix)
195 except ValueError:
196 i = len(decomp_prefix)
197 decomp_prefix.append(prefix)
198 prefix = i
199 assert prefix < 256
200 # content
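# the first word packs the prefix index into its low byte and the
# number of code points into the bits above it; the code points of the
# decomposition follow.  This is the layout stored in decomp_data.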
201 decomp = [prefix + (len(decomp)<<8)] + [int(s, 16) for s in decomp]
202 # Collect NFC pairs
203 if not prefix and len(decomp) == 3 and \
204 char not in unicode.exclusions and \
205 unicode.table[decomp[1]].canonical_combining_class == "0":
206 p, l, r = decomp
207 comp_first[l] = 1
208 comp_last[r] = 1
209 comp_pairs.append((l,r,char))
210 try:
211 i = decomp_data.index(decomp)
212 except ValueError:
213 i = len(decomp_data)
214 decomp_data.extend(decomp)
215 decomp_size = decomp_size + len(decomp) * 2
216 else:
217 i = 0
218 decomp_index[char] = i
219
220 f = l = 0
221 comp_first_ranges = []
222 comp_last_ranges = []
223 prev_f = prev_l = None
224 for i in unicode.chars:
225 if comp_first[i] is not None:
226 comp_first[i] = f
227 f += 1
228 if prev_f is None:
229 prev_f = (i,i)
230 elif prev_f[1]+1 == i:
231 prev_f = prev_f[0],i
232 else:
233 comp_first_ranges.append(prev_f)
234 prev_f = (i,i)
235 if comp_last[i] is not None:
236 comp_last[i] = l
237 l += 1
238 if prev_l is None:
239 prev_l = (i,i)
240 elif prev_l[1]+1 == i:
241 prev_l = prev_l[0],i
242 else:
243 comp_last_ranges.append(prev_l)
244 prev_l = (i,i)
245 comp_first_ranges.append(prev_f)
246 comp_last_ranges.append(prev_l)
247 total_first = f
248 total_last = l
249
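# comp_data is a total_first x total_last matrix, flattened row-major
# and indexed by the re-numbered first/last characters; an entry holds
# the composed character for that pair, or 0 if the pair does not compose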
250 comp_data = [0]*(total_first*total_last)
251 for f,l,char in comp_pairs:
252 f = comp_first[f]
253 l = comp_last[l]
254 comp_data[f*total_last+l] = char
255
256 print(len(table), "unique properties")
257 print(len(decomp_prefix), "unique decomposition prefixes")
258 print(len(decomp_data), "unique decomposition entries:", end=' ')
259 print(decomp_size, "bytes")
260 print(total_first, "first characters in NFC")
261 print(total_last, "last characters in NFC")
262 print(len(comp_pairs), "NFC pairs")
263
264 print("--- Writing", FILE, "...")
265
266 with open(FILE, "w") as fp:
267 fprint = partial(print, file=fp)
268
269 fprint("/* this file was generated by %s %s */" % (SCRIPT, VERSION))
270 fprint()
271 fprint('#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION)
272 fprint("/* a list of unique database records */")
273 fprint("const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {")
274 for item in table:
275 fprint(" {%d, %d, %d, %d, %d, %d}," % item)
276 fprint("};")
277 fprint()
278
279 fprint("/* Reindexing of NFC first characters. */")
280 fprint("#define TOTAL_FIRST",total_first)
281 fprint("#define TOTAL_LAST",total_last)
282 fprint("struct reindex{int start;short count,index;};")
283 fprint("static struct reindex nfc_first[] = {")
284 for start,end in comp_first_ranges:
285 fprint(" { %d, %d, %d}," % (start,end-start,comp_first[start]))
286 fprint(" {0,0,0}")
287 fprint("};\n")
288 fprint("static struct reindex nfc_last[] = {")
289 for start,end in comp_last_ranges:
290 fprint(" { %d, %d, %d}," % (start,end-start,comp_last[start]))
291 fprint(" {0,0,0}")
292 fprint("};\n")
293
294 # FIXME: <fl> the following tables could be made static, and
295 # the support code moved into unicodedatabase.c
296
297 fprint("/* string literals */")
298 fprint("const char *_PyUnicode_CategoryNames[] = {")
299 for name in CATEGORY_NAMES:
300 fprint(" \"%s\"," % name)
301 fprint(" NULL")
302 fprint("};")
303
304 fprint("const char *_PyUnicode_BidirectionalNames[] = {")
305 for name in BIDIRECTIONAL_NAMES:
306 fprint(" \"%s\"," % name)
307 fprint(" NULL")
308 fprint("};")
309
310 fprint("const char *_PyUnicode_EastAsianWidthNames[] = {")
311 for name in EASTASIANWIDTH_NAMES:
312 fprint(" \"%s\"," % name)
313 fprint(" NULL")
314 fprint("};")
315
316 fprint("static const char *decomp_prefix[] = {")
317 for name in decomp_prefix:
318 fprint(" \"%s\"," % name)
319 fprint(" NULL")
320 fprint("};")
321
322 # split record index table
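# the two-level tables produced by splitbins() are meant to be read back
# as index2[(index1[c >> SHIFT] << SHIFT) + (c & ((1 << SHIFT) - 1))],
# mirroring the invariant documented in splitbins() below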
323 index1, index2, shift = splitbins(index, trace)
324
325 fprint("/* index tables for the database records */")
326 fprint("#define SHIFT", shift)
327 Array("index1", index1).dump(fp, trace)
328 Array("index2", index2).dump(fp, trace)
329
330 # split decomposition index table
331 index1, index2, shift = splitbins(decomp_index, trace)
332
333 fprint("/* decomposition data */")
334 Array("decomp_data", decomp_data).dump(fp, trace)
335
336 fprint("/* index tables for the decomposition data */")
337 fprint("#define DECOMP_SHIFT", shift)
338 Array("decomp_index1", index1).dump(fp, trace)
339 Array("decomp_index2", index2).dump(fp, trace)
340
341 index, index2, shift = splitbins(comp_data, trace)
342 fprint("/* NFC pairs */")
343 fprint("#define COMP_SHIFT", shift)
344 Array("comp_index", index).dump(fp, trace)
345 Array("comp_data", index2).dump(fp, trace)
346
347 # Generate delta tables for old versions
348 for version, table, normalization in unicode.changed:
349 cversion = version.replace(".","_")
350 records = [table[0]]
351 cache = {table[0]:0}
352 index = [0] * len(table)
353 for i, record in enumerate(table):
354 try:
355 index[i] = cache[record]
356 except KeyError:
357 index[i] = cache[record] = len(records)
358 records.append(record)
359 index1, index2, shift = splitbins(index, trace)
360 fprint("static const change_record change_records_%s[] = {" % cversion)
361 for record in records:
362 fprint(" { %s }," % ", ".join(map(str,record)))
363 fprint("};")
364 Array("changes_%s_index" % cversion, index1).dump(fp, trace)
365 Array("changes_%s_data" % cversion, index2).dump(fp, trace)
366 fprint("static const change_record* get_change_%s(Py_UCS4 n)" % cversion)
367 fprint("{")
368 fprint(" int index;")
369 fprint(" if (n >= 0x110000) index = 0;")
370 fprint(" else {")
371 fprint(" index = changes_%s_index[n>>%d];" % (cversion, shift))
372 fprint(" index = changes_%s_data[(index<<%d)+(n & %d)];" % \
373 (cversion, shift, ((1<<shift)-1)))
374 fprint(" }")
375 fprint(" return change_records_%s+index;" % cversion)
376 fprint("}\n")
377 fprint("static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion)
378 fprint("{")
379 fprint(" switch(n) {")
380 for k, v in normalization:
381 fprint(" case %s: return 0x%s;" % (hex(k), v))
382 fprint(" default: return 0;")
383 fprint(" }\n}\n")
384
385
386 # --------------------------------------------------------------------
387 # unicode character type tables
388
389 def makeunicodetype(unicode, trace):
390
391 FILE = "Objects/unicodetype_db.h"
392
393 print("--- Preparing", FILE, "...")
394
395 # extract unicode types
396 dummy = (0, 0, 0, 0, 0, 0)
397 table = [dummy]
398 cache = {0: dummy}
399 index = [0] * len(unicode.chars)
400 numeric = {}
401 spaces = []
402 linebreaks = []
403 extra_casing = []
404
405 for char in unicode.chars:
406 record = unicode.table[char]
407 if record:
408 # extract database properties
409 category = record.general_category
410 bidirectional = record.bidi_class
411 properties = record.binary_properties
412 flags = 0
413 if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
414 flags |= ALPHA_MASK
415 if "Lowercase" in properties:
416 flags |= LOWER_MASK
417 if 'Line_Break' in properties or bidirectional == "B":
418 flags |= LINEBREAK_MASK
419 linebreaks.append(char)
420 if category == "Zs" or bidirectional in ("WS", "B", "S"):
421 flags |= SPACE_MASK
422 spaces.append(char)
423 if category == "Lt":
424 flags |= TITLE_MASK
425 if "Uppercase" in properties:
426 flags |= UPPER_MASK
427 if char == ord(" ") or category[0] not in ("C", "Z"):
428 flags |= PRINTABLE_MASK
429 if "XID_Start" in properties:
430 flags |= XID_START_MASK
431 if "XID_Continue" in properties:
432 flags |= XID_CONTINUE_MASK
433 if "Cased" in properties:
434 flags |= CASED_MASK
435 if "Case_Ignorable" in properties:
436 flags |= CASE_IGNORABLE_MASK
437 sc = unicode.special_casing.get(char)
438 cf = unicode.case_folding.get(char, [char])
439 if record.simple_uppercase_mapping:
440 upper = int(record.simple_uppercase_mapping, 16)
441 else:
442 upper = char
443 if record.simple_lowercase_mapping:
444 lower = int(record.simple_lowercase_mapping, 16)
445 else:
446 lower = char
447 if record.simple_titlecase_mapping:
448 title = int(record.simple_titlecase_mapping, 16)
449 else:
450 title = upper
451 if sc is None and cf != [lower]:
452 sc = ([lower], [title], [upper])
453 if sc is None:
454 if upper == lower == title:
455 upper = lower = title = 0
456 else:
457 upper = upper - char
458 lower = lower - char
459 title = title - char
460 assert (abs(upper) <= 2147483647 and
461 abs(lower) <= 2147483647 and
462 abs(title) <= 2147483647)
463 else:
464 # This happens either when some character maps to more than one
465 # character in uppercase, lowercase, or titlecase or the
466 # casefolded version of the character is different from the
467 # lowercase. The extra characters are stored in a different
468 # array.
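# Layout of the packed values below: the low bits hold the starting
# index into the extra_casing array, bits 24 and up hold the length of
# the mapping, and (for the lowercase entry only) bits 20-23 hold the
# length of the case folding when it differs from the lowercase mapping.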
469 flags |= EXTENDED_CASE_MASK
470 lower = len(extra_casing) | (len(sc[0]) << 24)
471 extra_casing.extend(sc[0])
472 if cf != sc[0]:
473 lower |= len(cf) << 20
474 extra_casing.extend(cf)
475 upper = len(extra_casing) | (len(sc[2]) << 24)
476 extra_casing.extend(sc[2])
477 # Title is probably equal to upper.
478 if sc[1] == sc[2]:
479 title = upper
480 else:
481 title = len(extra_casing) | (len(sc[1]) << 24)
482 extra_casing.extend(sc[1])
483 # decimal digit, integer digit
484 decimal = 0
485 if record.decomposition_mapping:
486 flags |= DECIMAL_MASK
487 decimal = int(record.decomposition_mapping)
488 digit = 0
489 if record.numeric_type:
490 flags |= DIGIT_MASK
491 digit = int(record.numeric_type)
492 if record.numeric_value:
493 flags |= NUMERIC_MASK
494 numeric.setdefault(record.numeric_value, []).append(char)
495 item = (
496 upper, lower, title, decimal, digit, flags
497 )
498 # add entry to index and item tables
499 i = cache.get(item)
500 if i is None:
501 cache[item] = i = len(table)
502 table.append(item)
503 index[char] = i
504
505 print(len(table), "unique character type entries")
506 print(sum(map(len, numeric.values())), "numeric code points")
507 print(len(spaces), "whitespace code points")
508 print(len(linebreaks), "linebreak code points")
509 print(len(extra_casing), "extended case array")
510
511 print("--- Writing", FILE, "...")
512
513 with open(FILE, "w") as fp:
514 fprint = partial(print, file=fp)
515
516 fprint("/* this file was generated by %s %s */" % (SCRIPT, VERSION))
517 fprint()
518 fprint("/* a list of unique character type descriptors */")
519 fprint("const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {")
520 for item in table:
521 fprint(" {%d, %d, %d, %d, %d, %d}," % item)
522 fprint("};")
523 fprint()
524
525 fprint("/* extended case mappings */")
526 fprint()
527 fprint("const Py_UCS4 _PyUnicode_ExtendedCase[] = {")
528 for c in extra_casing:
529 fprint(" %d," % c)
530 fprint("};")
531 fprint()
532
533 # split decomposition index table
534 index1, index2, shift = splitbins(index, trace)
535
536 fprint("/* type indexes */")
537 fprint("#define SHIFT", shift)
538 Array("index1", index1).dump(fp, trace)
539 Array("index2", index2).dump(fp, trace)
540
541 # Generate code for _PyUnicode_ToNumeric()
542 numeric_items = sorted(numeric.items())
543 fprint('/* Returns the numeric value as double for Unicode characters')
544 fprint(' * having this property, -1.0 otherwise.')
545 fprint(' */')
546 fprint('double _PyUnicode_ToNumeric(Py_UCS4 ch)')
547 fprint('{')
548 fprint(' switch (ch) {')
549 for value, codepoints in numeric_items:
550 # Turn text into float literals
551 parts = value.split('/')
552 parts = [repr(float(part)) for part in parts]
553 value = '/'.join(parts)
554
555 codepoints.sort()
556 for codepoint in codepoints:
557 fprint(' case 0x%04X:' % (codepoint,))
558 fprint(' return (double) %s;' % (value,))
559 fprint(' }')
560 fprint(' return -1.0;')
561 fprint('}')
562 fprint()
563
564 # Generate code for _PyUnicode_IsWhitespace()
565 fprint("/* Returns 1 for Unicode characters having the bidirectional")
566 fprint(" * type 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise.")
567 fprint(" */")
568 fprint('int _PyUnicode_IsWhitespace(const Py_UCS4 ch)')
569 fprint('{')
570 fprint(' switch (ch) {')
571
572 for codepoint in sorted(spaces):
573 fprint(' case 0x%04X:' % (codepoint,))
574 fprint(' return 1;')
575
576 fprint(' }')
577 fprint(' return 0;')
578 fprint('}')
579 fprint()
580
581 # Generate code for _PyUnicode_IsLinebreak()
582 fprint("/* Returns 1 for Unicode characters having the line break")
583 fprint(" * property 'BK', 'CR', 'LF' or 'NL' or having bidirectional")
584 fprint(" * type 'B', 0 otherwise.")
585 fprint(" */")
586 fprint('int _PyUnicode_IsLinebreak(const Py_UCS4 ch)')
587 fprint('{')
588 fprint(' switch (ch) {')
589 for codepoint in sorted(linebreaks):
590 fprint(' case 0x%04X:' % (codepoint,))
591 fprint(' return 1;')
592
593 fprint(' }')
594 fprint(' return 0;')
595 fprint('}')
596 fprint()
597
598
599 # --------------------------------------------------------------------
600 # unicode name database
601
602 def makeunicodename(unicode, trace):
603
604 FILE = "Modules/unicodename_db.h"
605
606 print("--- Preparing", FILE, "...")
607
608 # collect names
609 names = [None] * len(unicode.chars)
610
611 for char in unicode.chars:
612 record = unicode.table[char]
613 if record:
614 name = record.name.strip()
615 if name and name[0] != "<":
616 names[char] = name + chr(0)
617
618 print(len([n for n in names if n is not None]), "distinct names")
619
# collect unique words from names (note that we distinguish between
# words inside a sentence and words ending a sentence; the
# latter include the trailing null byte)
623
624 words = {}
625 n = b = 0
626 for char in unicode.chars:
627 name = names[char]
628 if name:
629 w = name.split()
630 b = b + len(name)
631 n = n + len(w)
632 for w in w:
633 l = words.get(w)
634 if l:
635 l.append(None)
636 else:
637 words[w] = [len(words)]
638
639 print(n, "words in text;", b, "bytes")
640
641 wordlist = list(words.items())
642
643 # sort on falling frequency, then by name
644 def word_key(a):
645 aword, alist = a
646 return -len(alist), aword
647 wordlist.sort(key=word_key)
648
649 # figure out how many phrasebook escapes we need
650 escapes = 0
651 while escapes * 256 < len(wordlist):
652 escapes = escapes + 1
653 print(escapes, "escapes")
654
655 short = 256 - escapes
656
657 assert short > 0
658
659 print(short, "short indexes in lexicon")
660
661 # statistics
662 n = 0
663 for i in range(short):
664 n = n + len(wordlist[i][1])
665 print(n, "short indexes in phrasebook")
666
667 # pick the most commonly used words, and sort the rest on falling
668 # length (to maximize overlap)
669
670 wordlist, wordtail = wordlist[:short], wordlist[short:]
671 wordtail.sort(key=lambda a: a[0], reverse=True)
672 wordlist.extend(wordtail)
673
674 # generate lexicon from words
675
676 lexicon_offset = [0]
677 lexicon = ""
678 words = {}
679
680 # build a lexicon string
681 offset = 0
682 for w, x in wordlist:
683 # encoding: bit 7 indicates last character in word (chr(128)
684 # indicates the last character in an entire string)
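# e.g. the word "LETTER" is stored as "LETTE" followed by
# chr(ord("R") + 128)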
685 ww = w[:-1] + chr(ord(w[-1])+128)
686 # reuse string tails, when possible
687 o = lexicon.find(ww)
688 if o < 0:
689 o = offset
690 lexicon = lexicon + ww
691 offset = offset + len(w)
692 words[w] = len(lexicon_offset)
693 lexicon_offset.append(o)
694
695 lexicon = list(map(ord, lexicon))
696
697 # generate phrasebook from names and lexicon
698 phrasebook = [0]
699 phrasebook_offset = [0] * len(unicode.chars)
700 for char in unicode.chars:
701 name = names[char]
702 if name:
703 w = name.split()
704 phrasebook_offset[char] = len(phrasebook)
705 for w in w:
706 i = words[w]
707 if i < short:
708 phrasebook.append(i)
709 else:
710 # store as two bytes
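# the first byte is offset by 'short' so that it cannot collide with a
# one-byte index; e.g. if short were 190, index 1000 would be emitted
# as (1000 >> 8) + 190 == 193 followed by 1000 & 255 == 232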
711 phrasebook.append((i>>8) + short)
712 phrasebook.append(i&255)
713
714 assert getsize(phrasebook) == 1
715
716 #
717 # unicode name hash table
718
719 # extract names
720 data = []
721 for char in unicode.chars:
722 record = unicode.table[char]
723 if record:
724 name = record.name.strip()
725 if name and name[0] != "<":
726 data.append((name, char))
727
728 # the magic number 47 was chosen to minimize the number of
729 # collisions on the current data set. if you like, change it
730 # and see what happens...
731
732 codehash = Hash("code", data, 47)
733
734 print("--- Writing", FILE, "...")
735
736 with open(FILE, "w") as fp:
737 fprint = partial(print, file=fp)
738
739 fprint("/* this file was generated by %s %s */" % (SCRIPT, VERSION))
740 fprint()
741 fprint("#define NAME_MAXLEN", 256)
742 fprint()
743 fprint("/* lexicon */")
744 Array("lexicon", lexicon).dump(fp, trace)
745 Array("lexicon_offset", lexicon_offset).dump(fp, trace)
746
747 # split decomposition index table
748 offset1, offset2, shift = splitbins(phrasebook_offset, trace)
749
750 fprint("/* code->name phrasebook */")
751 fprint("#define phrasebook_shift", shift)
752 fprint("#define phrasebook_short", short)
753
754 Array("phrasebook", phrasebook).dump(fp, trace)
755 Array("phrasebook_offset1", offset1).dump(fp, trace)
756 Array("phrasebook_offset2", offset2).dump(fp, trace)
757
758 fprint("/* name->code dictionary */")
759 codehash.dump(fp, trace)
760
761 fprint()
762 fprint('static const unsigned int aliases_start = %#x;' %
763 NAME_ALIASES_START)
764 fprint('static const unsigned int aliases_end = %#x;' %
765 (NAME_ALIASES_START + len(unicode.aliases)))
766
767 fprint('static const unsigned int name_aliases[] = {')
768 for name, codepoint in unicode.aliases:
769 fprint(' 0x%04X,' % codepoint)
770 fprint('};')
771
772 # In Unicode 6.0.0, the sequences contain at most 4 BMP chars,
773 # so we are using Py_UCS2 seq[4]. This needs to be updated if longer
774 # sequences or sequences with non-BMP chars are added.
775 # unicodedata_lookup should be adapted too.
776 fprint(dedent("""
777 typedef struct NamedSequence {
778 int seqlen;
779 Py_UCS2 seq[4];
780 } named_sequence;
781 """))
782
783 fprint('static const unsigned int named_sequences_start = %#x;' %
784 NAMED_SEQUENCES_START)
785 fprint('static const unsigned int named_sequences_end = %#x;' %
786 (NAMED_SEQUENCES_START + len(unicode.named_sequences)))
787
788 fprint('static const named_sequence named_sequences[] = {')
789 for name, sequence in unicode.named_sequences:
790 seq_str = ', '.join('0x%04X' % cp for cp in sequence)
791 fprint(' {%d, {%s}},' % (len(sequence), seq_str))
792 fprint('};')
793
794
795 def merge_old_version(version, new, old):
796 # Changes to exclusion file not implemented yet
797 if old.exclusions != new.exclusions:
798 raise NotImplementedError("exclusions differ")
799
800 # In these change records, 0xFF means "no change"
801 bidir_changes = [0xFF]*0x110000
802 category_changes = [0xFF]*0x110000
803 decimal_changes = [0xFF]*0x110000
804 mirrored_changes = [0xFF]*0x110000
805 east_asian_width_changes = [0xFF]*0x110000
# In numeric data, 0 means "no change",
# -1 means "did not have a numeric value"
808 numeric_changes = [0] * 0x110000
809 # normalization_changes is a list of key-value pairs
810 normalization_changes = []
811 for i in range(0x110000):
812 if new.table[i] is None:
813 # Characters unassigned in the new version ought to
814 # be unassigned in the old one
815 assert old.table[i] is None
816 continue
817 # check characters unassigned in the old version
818 if old.table[i] is None:
819 # category 0 is "unassigned"
820 category_changes[i] = 0
821 continue
822 # check characters that differ
823 if old.table[i] != new.table[i]:
824 for k, field in enumerate(dataclasses.fields(UcdRecord)):
825 value = getattr(old.table[i], field.name)
826 new_value = getattr(new.table[i], field.name)
827 if value != new_value:
828 if k == 1 and i in PUA_15:
829 # the name is not set in the old.table, but in the
830 # new.table we are using it for aliases and named seq
831 assert value == ''
832 elif k == 2:
833 category_changes[i] = CATEGORY_NAMES.index(value)
834 elif k == 4:
835 bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
836 elif k == 5:
# We assume that all normalization changes are 1:1 mappings
838 assert " " not in value
839 normalization_changes.append((i, value))
840 elif k == 6:
841 # we only support changes where the old value is a single digit
842 assert value in "0123456789"
843 decimal_changes[i] = int(value)
844 elif k == 8:
# Since 0 encodes "no change", the old value must not be 0
846 if not value:
847 numeric_changes[i] = -1
848 else:
849 numeric_changes[i] = float(value)
850 assert numeric_changes[i] not in (0, -1)
851 elif k == 9:
852 if value == 'Y':
853 mirrored_changes[i] = '1'
854 else:
855 mirrored_changes[i] = '0'
856 elif k == 11:
857 # change to ISO comment, ignore
858 pass
859 elif k == 12:
860 # change to simple uppercase mapping; ignore
861 pass
862 elif k == 13:
863 # change to simple lowercase mapping; ignore
864 pass
865 elif k == 14:
866 # change to simple titlecase mapping; ignore
867 pass
868 elif k == 15:
869 # change to east asian width
870 east_asian_width_changes[i] = EASTASIANWIDTH_NAMES.index(value)
871 elif k == 16:
872 # derived property changes; not yet
873 pass
874 elif k == 17:
875 # normalization quickchecks are not performed
876 # for older versions
877 pass
878 else:
class Difference(Exception): pass
880 raise Difference(hex(i), k, old.table[i], new.table[i])
881 new.changed.append((version, list(zip(bidir_changes, category_changes,
882 decimal_changes, mirrored_changes,
883 east_asian_width_changes,
884 numeric_changes)),
885 normalization_changes))
886
887
888 DATA_DIR = os.path.join('Tools', 'unicode', 'data')
889
890 def open_data(template, version):
891 local = os.path.join(DATA_DIR, template % ('-'+version,))
892 if not os.path.exists(local):
893 import urllib.request
894 if version == '3.2.0':
895 # irregular url structure
896 url = ('https://www.unicode.org/Public/3.2-Update/'+template) % ('-'+version,)
897 else:
898 url = ('https://www.unicode.org/Public/%s/ucd/'+template) % (version, '')
899 os.makedirs(DATA_DIR, exist_ok=True)
900 urllib.request.urlretrieve(url, filename=local)
901 if local.endswith('.txt'):
902 return open(local, encoding='utf-8')
903 else:
904 # Unihan.zip
905 return open(local, 'rb')
906
907
908 def expand_range(char_range: str) -> Iterator[int]:
909 '''
910 Parses ranges of code points, as described in UAX #44:
911 https://www.unicode.org/reports/tr44/#Code_Point_Ranges
912 '''
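# For instance, expand_range('0030..0032') yields 0x30, 0x31, 0x32,
# while expand_range('00E9') yields just 0xE9.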
913 if '..' in char_range:
914 first, last = [int(c, 16) for c in char_range.split('..')]
915 else:
916 first = last = int(char_range, 16)
917 for char in range(first, last+1):
918 yield char
919
920
class UcdFile:
922 '''
923 A file in the standard format of the UCD.
924
925 See: https://www.unicode.org/reports/tr44/#Format_Conventions
926
927 Note that, as described there, the Unihan data files have their
928 own separate format.
929 '''
930
931 def __init__(self, template: str, version: str) -> None:
932 self.template = template
933 self.version = version
934
935 def records(self) -> Iterator[List[str]]:
936 with open_data(self.template, self.version) as file:
937 for line in file:
938 line = line.split('#', 1)[0].strip()
939 if not line:
940 continue
941 yield [field.strip() for field in line.split(';')]
942
943 def __iter__(self) -> Iterator[List[str]]:
944 return self.records()
945
946 def expanded(self) -> Iterator[Tuple[int, List[str]]]:
947 for record in self.records():
948 char_range, rest = record[0], record[1:]
949 for char in expand_range(char_range):
950 yield char, rest
951
952
953 @dataclasses.dataclass
class UcdRecord:
# 15 fields from UnicodeData.txt. See:
956 # https://www.unicode.org/reports/tr44/#UnicodeData.txt
957 codepoint: str
958 name: str
959 general_category: str
960 canonical_combining_class: str
961 bidi_class: str
962 decomposition_type: str
963 decomposition_mapping: str
964 numeric_type: str
965 numeric_value: str
966 bidi_mirrored: str
967 unicode_1_name: str # obsolete
968 iso_comment: str # obsolete
969 simple_uppercase_mapping: str
970 simple_lowercase_mapping: str
971 simple_titlecase_mapping: str
972
973 # https://www.unicode.org/reports/tr44/#EastAsianWidth.txt
974 east_asian_width: Optional[str]
975
976 # Binary properties, as a set of those that are true.
977 # Taken from multiple files:
978 # https://www.unicode.org/reports/tr44/#DerivedCoreProperties.txt
979 # https://www.unicode.org/reports/tr44/#LineBreak.txt
980 binary_properties: Set[str]
981
982 # The Quick_Check properties related to normalization:
983 # https://www.unicode.org/reports/tr44/#Decompositions_and_Normalization
984 # We store them as a bitmask.
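# Layout (as filled in by UnicodeData.__init__): two bits per
# normalization form, in the order NFD, NFKD, NFC, NFKC, with
# 0 = Yes (the default), 1 = Maybe, 2 = No.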
985 quick_check: int
986
987
988 def from_row(row: List[str]) -> UcdRecord:
989 return UcdRecord(*row, None, set(), 0)
990
991
992 # --------------------------------------------------------------------
993 # the following support code is taken from the unidb utilities
994 # Copyright (c) 1999-2000 by Secret Labs AB
995
996 # load a unicode-data file from disk
997
class UnicodeData:
999 # table: List[Optional[UcdRecord]] # index is codepoint; None means unassigned
1000
1001 def __init__(self, version, cjk_check=True):
1002 self.changed = []
1003 table = [None] * 0x110000
1004 for s in UcdFile(UNICODE_DATA, version):
1005 char = int(s[0], 16)
1006 table[char] = from_row(s)
1007
1008 cjk_ranges_found = []
1009
1010 # expand first-last ranges
1011 field = None
1012 for i in range(0, 0x110000):
1013 # The file UnicodeData.txt has its own distinct way of
1014 # expressing ranges. See:
1015 # https://www.unicode.org/reports/tr44/#Code_Point_Ranges
1016 s = table[i]
1017 if s:
1018 if s.name[-6:] == "First>":
1019 s.name = ""
1020 field = dataclasses.astuple(s)[:15]
1021 elif s.name[-5:] == "Last>":
1022 if s.name.startswith("<CJK Ideograph"):
1023 cjk_ranges_found.append((field[0],
1024 s.codepoint))
1025 s.name = ""
1026 field = None
1027 elif field:
1028 table[i] = from_row(('%X' % i,) + field[1:])
1029 if cjk_check and cjk_ranges != cjk_ranges_found:
1030 raise ValueError("CJK ranges deviate: have %r" % cjk_ranges_found)
1031
1032 # public attributes
1033 self.filename = UNICODE_DATA % ''
1034 self.table = table
1035 self.chars = list(range(0x110000)) # unicode 3.2
1036
1037 # check for name aliases and named sequences, see #12753
1038 # aliases and named sequences are not in 3.2.0
1039 if version != '3.2.0':
1040 self.aliases = []
1041 # store aliases in the Private Use Area 15, in range U+F0000..U+F00FF,
1042 # in order to take advantage of the compression and lookup
1043 # algorithms used for the other characters
1044 pua_index = NAME_ALIASES_START
1045 for char, name, abbrev in UcdFile(NAME_ALIASES, version):
1046 char = int(char, 16)
1047 self.aliases.append((name, char))
# also store the name in PUA 15
1049 self.table[pua_index].name = name
1050 pua_index += 1
1051 assert pua_index - NAME_ALIASES_START == len(self.aliases)
1052
1053 self.named_sequences = []
# store named sequences in PUA 15, in range U+F0100..,
1055 # in order to take advantage of the compression and lookup
1056 # algorithms used for the other characters.
1057
1058 assert pua_index < NAMED_SEQUENCES_START
1059 pua_index = NAMED_SEQUENCES_START
1060 for name, chars in UcdFile(NAMED_SEQUENCES, version):
1061 chars = tuple(int(char, 16) for char in chars.split())
1062 # check that the structure defined in makeunicodename is OK
1063 assert 2 <= len(chars) <= 4, "change the Py_UCS2 array size"
1064 assert all(c <= 0xFFFF for c in chars), ("use Py_UCS4 in "
1065 "the NamedSequence struct and in unicodedata_lookup")
1066 self.named_sequences.append((name, chars))
# also store these in PUA 15
1068 self.table[pua_index].name = name
1069 pua_index += 1
1070 assert pua_index - NAMED_SEQUENCES_START == len(self.named_sequences)
1071
1072 self.exclusions = {}
1073 for char, in UcdFile(COMPOSITION_EXCLUSIONS, version):
1074 char = int(char, 16)
1075 self.exclusions[char] = 1
1076
1077 widths = [None] * 0x110000
1078 for char, (width,) in UcdFile(EASTASIAN_WIDTH, version).expanded():
1079 widths[char] = width
1080
1081 for i in range(0, 0x110000):
1082 if table[i] is not None:
1083 table[i].east_asian_width = widths[i]
1084
1085 for char, (p,) in UcdFile(DERIVED_CORE_PROPERTIES, version).expanded():
1086 if table[char]:
1087 # Some properties (e.g. Default_Ignorable_Code_Point)
1088 # apply to unassigned code points; ignore them
1089 table[char].binary_properties.add(p)
1090
1091 for char_range, value in UcdFile(LINE_BREAK, version):
1092 if value not in MANDATORY_LINE_BREAKS:
1093 continue
1094 for char in expand_range(char_range):
1095 table[char].binary_properties.add('Line_Break')
1096
1097 # We only want the quickcheck properties
1098 # Format: NF?_QC; Y(es)/N(o)/M(aybe)
1099 # Yes is the default, hence only N and M occur
1100 # In 3.2.0, the format was different (NF?_NO)
1101 # The parsing will incorrectly determine these as
1102 # "yes", however, unicodedata.c will not perform quickchecks
1103 # for older versions, and no delta records will be created.
1104 quickchecks = [0] * 0x110000
1105 qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
1106 for s in UcdFile(DERIVEDNORMALIZATION_PROPS, version):
1107 if len(s) < 2 or s[1] not in qc_order:
1108 continue
1109 quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
1110 quickcheck_shift = qc_order.index(s[1])*2
1111 quickcheck <<= quickcheck_shift
1112 for char in expand_range(s[0]):
1113 assert not (quickchecks[char]>>quickcheck_shift)&3
1114 quickchecks[char] |= quickcheck
1115 for i in range(0, 0x110000):
1116 if table[i] is not None:
1117 table[i].quick_check = quickchecks[i]
1118
1119 with open_data(UNIHAN, version) as file:
1120 zip = zipfile.ZipFile(file)
1121 if version == '3.2.0':
1122 data = zip.open('Unihan-3.2.0.txt').read()
1123 else:
1124 data = zip.open('Unihan_NumericValues.txt').read()
1125 for line in data.decode("utf-8").splitlines():
1126 if not line.startswith('U+'):
1127 continue
1128 code, tag, value = line.split(None, 3)[:3]
1129 if tag not in ('kAccountingNumeric', 'kPrimaryNumeric',
1130 'kOtherNumeric'):
1131 continue
1132 value = value.strip().replace(',', '')
1133 i = int(code[2:], 16)
1134 # Patch the numeric field
1135 if table[i] is not None:
1136 table[i].numeric_value = value
1137
1138 sc = self.special_casing = {}
1139 for data in UcdFile(SPECIAL_CASING, version):
1140 if data[4]:
1141 # We ignore all conditionals (since they depend on
1142 # languages) except for one, which is hardcoded. See
1143 # handle_capital_sigma in unicodeobject.c.
1144 continue
1145 c = int(data[0], 16)
1146 lower = [int(char, 16) for char in data[1].split()]
1147 title = [int(char, 16) for char in data[2].split()]
1148 upper = [int(char, 16) for char in data[3].split()]
1149 sc[c] = (lower, title, upper)
1150
1151 cf = self.case_folding = {}
1152 if version != '3.2.0':
1153 for data in UcdFile(CASE_FOLDING, version):
1154 if data[1] in "CF":
1155 c = int(data[0], 16)
1156 cf[c] = [int(char, 16) for char in data[2].split()]
1157
1158 def uselatin1(self):
1159 # restrict character range to ISO Latin 1
1160 self.chars = list(range(256))
1161
1162
1163 # hash table tools
1164
1165 # this is a straight-forward reimplementation of Python's built-in
1166 # dictionary type, using a static data structure, and a custom string
1167 # hash algorithm.
1168
1169 def myhash(s, magic):
1170 h = 0
1171 for c in map(ord, s.upper()):
1172 h = (h * magic) + c
1173 ix = h & 0xff000000
1174 if ix:
1175 h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff
1176 return h
1177
1178
1179 SIZES = [
1180 (4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
1181 (1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
1182 (65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
1183 (2097152,5), (4194304,3), (8388608,33), (16777216,27)
1184 ]
1185
1186
class Hash:
1188 def __init__(self, name, data, magic):
1189 # turn a (key, value) list into a static hash table structure
1190
1191 # determine table size
1192 for size, poly in SIZES:
1193 if size > len(data):
1194 poly = size + poly
1195 break
1196 else:
1197 raise AssertionError("ran out of polynomials")
1198
1199 print(size, "slots in hash table")
1200
1201 table = [None] * size
1202
1203 mask = size-1
1204
1205 n = 0
1206
1207 hash = myhash
1208
1209 # initialize hash table
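# collisions are resolved by open addressing: the probe increment is
# derived from the hash and, on each further collision, stepped through
# a Galois LFSR using 'poly' (the polynomials in SIZES are presumably
# chosen so that every slot is eventually visited)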
1210 for key, value in data:
1211 h = hash(key, magic)
1212 i = (~h) & mask
1213 v = table[i]
1214 if v is None:
1215 table[i] = value
1216 continue
1217 incr = (h ^ (h >> 3)) & mask
1218 if not incr:
1219 incr = mask
1220 while 1:
1221 n = n + 1
1222 i = (i + incr) & mask
1223 v = table[i]
1224 if v is None:
1225 table[i] = value
1226 break
1227 incr = incr << 1
1228 if incr > mask:
1229 incr = incr ^ poly
1230
1231 print(n, "collisions")
1232 self.collisions = n
1233
1234 for i in range(len(table)):
1235 if table[i] is None:
1236 table[i] = 0
1237
1238 self.data = Array(name + "_hash", table)
1239 self.magic = magic
1240 self.name = name
1241 self.size = size
1242 self.poly = poly
1243
1244 def dump(self, file, trace):
1245 # write data to file, as a C array
1246 self.data.dump(file, trace)
1247 file.write("#define %s_magic %d\n" % (self.name, self.magic))
1248 file.write("#define %s_size %d\n" % (self.name, self.size))
1249 file.write("#define %s_poly %d\n" % (self.name, self.poly))
1250
1251
1252 # stuff to deal with arrays of unsigned integers
1253
class Array:
1255
1256 def __init__(self, name, data):
1257 self.name = name
1258 self.data = data
1259
1260 def dump(self, file, trace=0):
1261 # write data to file, as a C array
1262 size = getsize(self.data)
1263 if trace:
1264 print(self.name+":", size*len(self.data), "bytes", file=sys.stderr)
1265 file.write("static const ")
1266 if size == 1:
1267 file.write("unsigned char")
1268 elif size == 2:
1269 file.write("unsigned short")
1270 else:
1271 file.write("unsigned int")
1272 file.write(" " + self.name + "[] = {\n")
1273 if self.data:
1274 s = " "
1275 for item in self.data:
1276 i = str(item) + ", "
1277 if len(s) + len(i) > 78:
1278 file.write(s.rstrip() + "\n")
1279 s = " " + i
1280 else:
1281 s = s + i
1282 if s.strip():
1283 file.write(s.rstrip() + "\n")
1284 file.write("};\n\n")
1285
1286
1287 def getsize(data):
1288 # return smallest possible integer size for the given array
1289 maxdata = max(data)
1290 if maxdata < 256:
1291 return 1
1292 elif maxdata < 65536:
1293 return 2
1294 else:
1295 return 4
1296
1297
1298 def splitbins(t, trace=0):
1299 """t, trace=0 -> (t1, t2, shift). Split a table to save space.
1300
1301 t is a sequence of ints. This function can be useful to save space if
1302 many of the ints are the same. t1 and t2 are lists of ints, and shift
1303 is an int, chosen to minimize the combined size of t1 and t2 (in C
1304 code), and where for each i in range(len(t)),
1305 t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
1306 where mask is a bitmask isolating the last "shift" bits.
1307
1308 If optional arg trace is non-zero (default zero), progress info
1309 is printed to sys.stderr. The higher the value, the more info
1310 you'll get.
1311 """
1312
1313 if trace:
1314 def dump(t1, t2, shift, bytes):
1315 print("%d+%d bins at shift %d; %d bytes" % (
1316 len(t1), len(t2), shift, bytes), file=sys.stderr)
1317 print("Size of original table:", len(t)*getsize(t), "bytes",
1318 file=sys.stderr)
1319 n = len(t)-1 # last valid index
1320 maxshift = 0 # the most we can shift n and still have something left
1321 if n > 0:
1322 while n >> 1:
1323 n >>= 1
1324 maxshift += 1
1325 del n
1326 bytes = sys.maxsize # smallest total size so far
1327 t = tuple(t) # so slices can be dict keys
1328 for shift in range(maxshift + 1):
1329 t1 = []
1330 t2 = []
1331 size = 2**shift
1332 bincache = {}
1333 for i in range(0, len(t), size):
1334 bin = t[i:i+size]
1335 index = bincache.get(bin)
1336 if index is None:
1337 index = len(t2)
1338 bincache[bin] = index
1339 t2.extend(bin)
1340 t1.append(index >> shift)
1341 # determine memory size
1342 b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
1343 if trace > 1:
1344 dump(t1, t2, shift, b)
1345 if b < bytes:
1346 best = t1, t2, shift
1347 bytes = b
1348 t1, t2, shift = best
1349 if trace:
1350 print("Best:", end=' ', file=sys.stderr)
1351 dump(t1, t2, shift, bytes)
1352 if __debug__:
1353 # exhaustively verify that the decomposition is correct
1354 mask = ~((~0) << shift) # i.e., low-bit mask of shift bits
1355 for i in range(len(t)):
1356 assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
1357 return best
1358
1359
1360 if __name__ == "__main__":
1361 maketables(1)