Python-3.12.0/Tools/unicode/makeunicodedata.py
       1  #
       2  # (re)generate unicode property and type databases
       3  #
       4  # This script converts Unicode database files to Modules/unicodedata_db.h,
       5  # Modules/unicodename_db.h, and Objects/unicodetype_db.h
       6  #
       7  # history:
       8  # 2000-09-24 fl   created (based on bits and pieces from unidb)
       9  # 2000-09-25 fl   merged tim's splitbin fixes, separate decomposition table
      10  # 2000-09-25 fl   added character type table
      11  # 2000-09-26 fl   added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
      12  # 2000-11-03 fl   expand first/last ranges
      13  # 2001-01-19 fl   added character name tables (2.1)
      14  # 2001-01-21 fl   added decomp compression; dynamic phrasebook threshold
      15  # 2002-09-11 wd   use string methods
      16  # 2002-10-18 mvl  update to Unicode 3.2
      17  # 2002-10-22 mvl  generate NFC tables
      18  # 2002-11-24 mvl  expand all ranges, sort names version-independently
      19  # 2002-11-25 mvl  add UNIDATA_VERSION
      20  # 2004-05-29 perky add east asian width information
      21  # 2006-03-10 mvl  update to Unicode 4.1; add UCD 3.2 delta
      22  # 2008-06-11 gb   add PRINTABLE_MASK for Atsuo Ishimoto's ascii() patch
      23  # 2011-10-21 ezio add support for name aliases and named sequences
      24  # 2012-01    benjamin add full case mappings
      25  #
      26  # written by Fredrik Lundh (fredrik@pythonware.com)
      27  #
      28  
      29  import dataclasses
      30  import os
      31  import sys
      32  import zipfile
      33  
      34  from functools import partial
      35  from textwrap import dedent
      36  from typing import Iterator, List, Optional, Set, Tuple
      37  
      38  SCRIPT = sys.argv[0]
      39  VERSION = "3.3"
      40  
      41  # The Unicode Database
      42  # --------------------
      43  # When changing UCD version please update
       44  #   * Doc/library/stdtypes.rst,
       45  #   * Doc/library/unicodedata.rst, and
       46  #   * Doc/reference/lexical_analysis.rst (two occurrences)
      47  UNIDATA_VERSION = "15.0.0"
      48  UNICODE_DATA = "UnicodeData%s.txt"
      49  COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
      50  EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
      51  UNIHAN = "Unihan%s.zip"
      52  DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"
      53  DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
      54  LINE_BREAK = "LineBreak%s.txt"
      55  NAME_ALIASES = "NameAliases%s.txt"
      56  NAMED_SEQUENCES = "NamedSequences%s.txt"
      57  SPECIAL_CASING = "SpecialCasing%s.txt"
      58  CASE_FOLDING = "CaseFolding%s.txt"
      59  
       60  # Private Use Areas -- in the BMP (plane 0) and in planes 15 and 16
      61  PUA_1 = range(0xE000, 0xF900)
      62  PUA_15 = range(0xF0000, 0xFFFFE)
      63  PUA_16 = range(0x100000, 0x10FFFE)
      64  
       65  # we use these ranges of PUA_15 to store name aliases and named sequences
      66  NAME_ALIASES_START = 0xF0000
      67  NAMED_SEQUENCES_START = 0xF0200
      68  
      69  old_versions = ["3.2.0"]
      70  
      71  CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
      72      "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
      73      "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
      74      "So" ]
      75  
      76  BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
      77      "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
      78      "ON", "LRI", "RLI", "FSI", "PDI" ]
      79  
       80  # "N" needs to be the first entry; see the comment in makeunicodedata
      81  EASTASIANWIDTH_NAMES = [ "N", "H", "W", "Na", "A", "F" ]
      82  
      83  MANDATORY_LINE_BREAKS = [ "BK", "CR", "LF", "NL" ]
      84  
      85  # note: should match definitions in Objects/unicodectype.c
      86  ALPHA_MASK = 0x01
      87  DECIMAL_MASK = 0x02
      88  DIGIT_MASK = 0x04
      89  LOWER_MASK = 0x08
      90  LINEBREAK_MASK = 0x10
      91  SPACE_MASK = 0x20
      92  TITLE_MASK = 0x40
      93  UPPER_MASK = 0x80
      94  XID_START_MASK = 0x100
      95  XID_CONTINUE_MASK = 0x200
      96  PRINTABLE_MASK = 0x400
      97  NUMERIC_MASK = 0x800
      98  CASE_IGNORABLE_MASK = 0x1000
      99  CASED_MASK = 0x2000
     100  EXTENDED_CASE_MASK = 0x4000
     101  
     102  # these ranges need to match unicodedata.c:is_unified_ideograph
     103  cjk_ranges = [
     104      ('3400', '4DBF'),
     105      ('4E00', '9FFF'),
     106      ('20000', '2A6DF'),
     107      ('2A700', '2B739'),
     108      ('2B740', '2B81D'),
     109      ('2B820', '2CEA1'),
     110      ('2CEB0', '2EBE0'),
     111      ('30000', '3134A'),
     112      ('31350', '323AF'),
     113  ]
     114  
     115  
     116  def maketables(trace=0):
     117  
     118      print("--- Reading", UNICODE_DATA % "", "...")
     119  
     120      unicode = UnicodeData(UNIDATA_VERSION)
     121  
     122      print(len(list(filter(None, unicode.table))), "characters")
     123  
     124      for version in old_versions:
     125          print("--- Reading", UNICODE_DATA % ("-"+version), "...")
     126          old_unicode = UnicodeData(version, cjk_check=False)
     127          print(len(list(filter(None, old_unicode.table))), "characters")
     128          merge_old_version(version, unicode, old_unicode)
     129  
     130      makeunicodename(unicode, trace)
     131      makeunicodedata(unicode, trace)
     132      makeunicodetype(unicode, trace)
     133  
     134  
     135  # --------------------------------------------------------------------
     136  # unicode character properties
     137  
     138  def makeunicodedata(unicode, trace):
     139  
      140      # The default east_asian_width is "N", for unassigned code points
      141      # not mentioned in EastAsianWidth.txt.
      142      # In addition, some reserved but unassigned code points in the CJK
      143      # ranges are classified as "W", and code points in the private use
      144      # areas have a width of "A"; both of these do have entries in
      145      # EastAsianWidth.txt.
      146      # See https://unicode.org/reports/tr11/#Unassigned
     147      assert EASTASIANWIDTH_NAMES[0] == "N"
     148      dummy = (0, 0, 0, 0, 0, 0)
     149      table = [dummy]
     150      cache = {0: dummy}
     151      index = [0] * len(unicode.chars)
     152  
     153      FILE = "Modules/unicodedata_db.h"
     154  
     155      print("--- Preparing", FILE, "...")
     156  
     157      # 1) database properties
     158  
     159      for char in unicode.chars:
     160          record = unicode.table[char]
     161          if record:
     162              # extract database properties
     163              category = CATEGORY_NAMES.index(record.general_category)
     164              combining = int(record.canonical_combining_class)
     165              bidirectional = BIDIRECTIONAL_NAMES.index(record.bidi_class)
     166              mirrored = record.bidi_mirrored == "Y"
     167              eastasianwidth = EASTASIANWIDTH_NAMES.index(record.east_asian_width)
     168              normalizationquickcheck = record.quick_check
     169              item = (
     170                  category, combining, bidirectional, mirrored, eastasianwidth,
     171                  normalizationquickcheck
     172                  )
     173          elif unicode.widths[char] is not None:
     174              # an unassigned but reserved character, with a known
     175              # east_asian_width
     176              eastasianwidth = EASTASIANWIDTH_NAMES.index(unicode.widths[char])
     177              item = (0, 0, 0, 0, eastasianwidth, 0)
     178          else:
     179              continue
     180  
     181          # add entry to index and item tables
     182          i = cache.get(item)
     183          if i is None:
     184              cache[item] = i = len(table)
     185              table.append(item)
     186          index[char] = i
     187  
     188      # 2) decomposition data
     189  
     190      decomp_data_cache = {}
     191      decomp_data = [0]
     192      decomp_prefix = [""]
     193      decomp_index = [0] * len(unicode.chars)
     194      decomp_size = 0
     195  
     196      comp_pairs = []
     197      comp_first = [None] * len(unicode.chars)
     198      comp_last = [None] * len(unicode.chars)
     199  
     200      for char in unicode.chars:
     201          record = unicode.table[char]
     202          if record:
     203              if record.decomposition_type:
     204                  decomp = record.decomposition_type.split()
     205                  if len(decomp) > 19:
     206                      raise Exception("character %x has a decomposition too large for nfd_nfkd" % char)
     207                  # prefix
     208                  if decomp[0][0] == "<":
     209                      prefix = decomp.pop(0)
     210                  else:
     211                      prefix = ""
     212                  try:
     213                      i = decomp_prefix.index(prefix)
     214                  except ValueError:
     215                      i = len(decomp_prefix)
     216                      decomp_prefix.append(prefix)
     217                  prefix = i
     218                  assert prefix < 256
     219                  # content
     220                  decomp = [prefix + (len(decomp)<<8)] + [int(s, 16) for s in decomp]
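                           # (The first entry therefore packs the prefix index in its
                           #  low byte and the decomposition length in the bits above
                           #  it; the remaining entries are the decomposed code points.)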
     221                  # Collect NFC pairs
     222                  if not prefix and len(decomp) == 3 and \
     223                     char not in unicode.exclusions and \
     224                     unicode.table[decomp[1]].canonical_combining_class == "0":
     225                      p, l, r = decomp
     226                      comp_first[l] = 1
     227                      comp_last[r] = 1
     228                      comp_pairs.append((l,r,char))
     229                  key = tuple(decomp)
     230                  i = decomp_data_cache.get(key, -1)
     231                  if i == -1:
     232                      i = len(decomp_data)
     233                      decomp_data.extend(decomp)
     234                      decomp_size = decomp_size + len(decomp) * 2
     235                      decomp_data_cache[key] = i
     236                  else:
     237                      assert decomp_data[i:i+len(decomp)] == decomp
     238              else:
     239                  i = 0
     240              decomp_index[char] = i
     241  
     242      f = l = 0
     243      comp_first_ranges = []
     244      comp_last_ranges = []
     245      prev_f = prev_l = None
     246      for i in unicode.chars:
     247          if comp_first[i] is not None:
     248              comp_first[i] = f
     249              f += 1
     250              if prev_f is None:
     251                  prev_f = (i,i)
     252              elif prev_f[1]+1 == i:
     253                  prev_f = prev_f[0],i
     254              else:
     255                  comp_first_ranges.append(prev_f)
     256                  prev_f = (i,i)
     257          if comp_last[i] is not None:
     258              comp_last[i] = l
     259              l += 1
     260              if prev_l is None:
     261                  prev_l = (i,i)
     262              elif prev_l[1]+1 == i:
     263                  prev_l = prev_l[0],i
     264              else:
     265                  comp_last_ranges.append(prev_l)
     266                  prev_l = (i,i)
     267      comp_first_ranges.append(prev_f)
     268      comp_last_ranges.append(prev_l)
     269      total_first = f
     270      total_last = l
     271  
     272      comp_data = [0]*(total_first*total_last)
     273      for f,l,char in comp_pairs:
     274          f = comp_first[f]
     275          l = comp_last[l]
     276          comp_data[f*total_last+l] = char
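               # comp_data is a (total_first x total_last) matrix flattened row by
               # row: the row is the reindexed first character, the column is the
               # reindexed last character, and the value is the composed code point
               # (0 if the pair does not compose).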
     277  
     278      print(len(table), "unique properties")
     279      print(len(decomp_prefix), "unique decomposition prefixes")
     280      print(len(decomp_data), "unique decomposition entries:", end=' ')
     281      print(decomp_size, "bytes")
     282      print(total_first, "first characters in NFC")
     283      print(total_last, "last characters in NFC")
     284      print(len(comp_pairs), "NFC pairs")
     285  
     286      print("--- Writing", FILE, "...")
     287  
     288      with open(FILE, "w") as fp:
     289          fprint = partial(print, file=fp)
     290  
     291          fprint("/* this file was generated by %s %s */" % (SCRIPT, VERSION))
     292          fprint()
     293          fprint('#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION)
     294          fprint("/* a list of unique database records */")
     295          fprint("const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {")
     296          for item in table:
     297              fprint("    {%d, %d, %d, %d, %d, %d}," % item)
     298          fprint("};")
     299          fprint()
     300  
     301          fprint("/* Reindexing of NFC first characters. */")
     302          fprint("#define TOTAL_FIRST",total_first)
     303          fprint("#define TOTAL_LAST",total_last)
     304          fprint("struct reindex{int start;short count,index;};")
     305          fprint("static struct reindex nfc_first[] = {")
     306          for start,end in comp_first_ranges:
     307              fprint("    { %d, %d, %d}," % (start,end-start,comp_first[start]))
     308          fprint("    {0,0,0}")
     309          fprint("};\n")
     310          fprint("static struct reindex nfc_last[] = {")
     311          for start,end in comp_last_ranges:
     312              fprint("  { %d, %d, %d}," % (start,end-start,comp_last[start]))
     313          fprint("  {0,0,0}")
     314          fprint("};\n")
     315  
     316          # FIXME: <fl> the following tables could be made static, and
     317          # the support code moved into unicodedatabase.c
     318  
     319          fprint("/* string literals */")
     320          fprint("const char *_PyUnicode_CategoryNames[] = {")
     321          for name in CATEGORY_NAMES:
     322              fprint("    \"%s\"," % name)
     323          fprint("    NULL")
     324          fprint("};")
     325  
     326          fprint("const char *_PyUnicode_BidirectionalNames[] = {")
     327          for name in BIDIRECTIONAL_NAMES:
     328              fprint("    \"%s\"," % name)
     329          fprint("    NULL")
     330          fprint("};")
     331  
     332          fprint("const char *_PyUnicode_EastAsianWidthNames[] = {")
     333          for name in EASTASIANWIDTH_NAMES:
     334              fprint("    \"%s\"," % name)
     335          fprint("    NULL")
     336          fprint("};")
     337  
     338          fprint("static const char *decomp_prefix[] = {")
     339          for name in decomp_prefix:
     340              fprint("    \"%s\"," % name)
     341          fprint("    NULL")
     342          fprint("};")
     343  
     344          # split record index table
     345          index1, index2, shift = splitbins(index, trace)
     346  
     347          fprint("/* index tables for the database records */")
     348          fprint("#define SHIFT", shift)
     349          Array("index1", index1).dump(fp, trace)
     350          Array("index2", index2).dump(fp, trace)
     351  
     352          # split decomposition index table
     353          index1, index2, shift = splitbins(decomp_index, trace)
     354  
     355          fprint("/* decomposition data */")
     356          Array("decomp_data", decomp_data).dump(fp, trace)
     357  
     358          fprint("/* index tables for the decomposition data */")
     359          fprint("#define DECOMP_SHIFT", shift)
     360          Array("decomp_index1", index1).dump(fp, trace)
     361          Array("decomp_index2", index2).dump(fp, trace)
     362  
     363          index, index2, shift = splitbins(comp_data, trace)
     364          fprint("/* NFC pairs */")
     365          fprint("#define COMP_SHIFT", shift)
     366          Array("comp_index", index).dump(fp, trace)
     367          Array("comp_data", index2).dump(fp, trace)
     368  
     369          # Generate delta tables for old versions
     370          for version, table, normalization in unicode.changed:
     371              cversion = version.replace(".","_")
     372              records = [table[0]]
     373              cache = {table[0]:0}
     374              index = [0] * len(table)
     375              for i, record in enumerate(table):
     376                  try:
     377                      index[i] = cache[record]
     378                  except KeyError:
     379                      index[i] = cache[record] = len(records)
     380                      records.append(record)
     381              index1, index2, shift = splitbins(index, trace)
     382              fprint("static const change_record change_records_%s[] = {" % cversion)
     383              for record in records:
     384                  fprint("    { %s }," % ", ".join(map(str,record)))
     385              fprint("};")
     386              Array("changes_%s_index" % cversion, index1).dump(fp, trace)
     387              Array("changes_%s_data" % cversion, index2).dump(fp, trace)
     388              fprint("static const change_record* get_change_%s(Py_UCS4 n)" % cversion)
     389              fprint("{")
     390              fprint("    int index;")
     391              fprint("    if (n >= 0x110000) index = 0;")
     392              fprint("    else {")
     393              fprint("        index = changes_%s_index[n>>%d];" % (cversion, shift))
     394              fprint("        index = changes_%s_data[(index<<%d)+(n & %d)];" % \
     395                     (cversion, shift, ((1<<shift)-1)))
     396              fprint("    }")
     397              fprint("    return change_records_%s+index;" % cversion)
     398              fprint("}\n")
     399              fprint("static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion)
     400              fprint("{")
     401              fprint("    switch(n) {")
     402              for k, v in normalization:
     403                  fprint("    case %s: return 0x%s;" % (hex(k), v))
     404              fprint("    default: return 0;")
     405              fprint("    }\n}\n")
     406  
     407  
     408  # --------------------------------------------------------------------
     409  # unicode character type tables
     410  
     411  def makeunicodetype(unicode, trace):
     412  
     413      FILE = "Objects/unicodetype_db.h"
     414  
     415      print("--- Preparing", FILE, "...")
     416  
     417      # extract unicode types
     418      dummy = (0, 0, 0, 0, 0, 0)
     419      table = [dummy]
     420      cache = {dummy: 0}
     421      index = [0] * len(unicode.chars)
     422      numeric = {}
     423      spaces = []
     424      linebreaks = []
     425      extra_casing = []
     426  
     427      for char in unicode.chars:
     428          record = unicode.table[char]
     429          if record:
     430              # extract database properties
     431              category = record.general_category
     432              bidirectional = record.bidi_class
     433              properties = record.binary_properties
     434              flags = 0
     435              if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
     436                  flags |= ALPHA_MASK
     437              if "Lowercase" in properties:
     438                  flags |= LOWER_MASK
     439              if 'Line_Break' in properties or bidirectional == "B":
     440                  flags |= LINEBREAK_MASK
     441                  linebreaks.append(char)
     442              if category == "Zs" or bidirectional in ("WS", "B", "S"):
     443                  flags |= SPACE_MASK
     444                  spaces.append(char)
     445              if category == "Lt":
     446                  flags |= TITLE_MASK
     447              if "Uppercase" in properties:
     448                  flags |= UPPER_MASK
     449              if char == ord(" ") or category[0] not in ("C", "Z"):
     450                  flags |= PRINTABLE_MASK
     451              if "XID_Start" in properties:
     452                  flags |= XID_START_MASK
     453              if "XID_Continue" in properties:
     454                  flags |= XID_CONTINUE_MASK
     455              if "Cased" in properties:
     456                  flags |= CASED_MASK
     457              if "Case_Ignorable" in properties:
     458                  flags |= CASE_IGNORABLE_MASK
     459              sc = unicode.special_casing.get(char)
     460              cf = unicode.case_folding.get(char, [char])
     461              if record.simple_uppercase_mapping:
     462                  upper = int(record.simple_uppercase_mapping, 16)
     463              else:
     464                  upper = char
     465              if record.simple_lowercase_mapping:
     466                  lower = int(record.simple_lowercase_mapping, 16)
     467              else:
     468                  lower = char
     469              if record.simple_titlecase_mapping:
     470                  title = int(record.simple_titlecase_mapping, 16)
     471              else:
     472                  title = upper
     473              if sc is None and cf != [lower]:
     474                  sc = ([lower], [title], [upper])
     475              if sc is None:
     476                  if upper == lower == title:
     477                      upper = lower = title = 0
     478                  else:
     479                      upper = upper - char
     480                      lower = lower - char
     481                      title = title - char
     482                      assert (abs(upper) <= 2147483647 and
     483                              abs(lower) <= 2147483647 and
     484                              abs(title) <= 2147483647)
     485              else:
     486                  # This happens either when some character maps to more than one
     487                  # character in uppercase, lowercase, or titlecase or the
     488                  # casefolded version of the character is different from the
     489                  # lowercase. The extra characters are stored in a different
     490                  # array.
     491                  flags |= EXTENDED_CASE_MASK
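                           # (Packing sketch: the low bits hold the offset into the
                           #  _PyUnicode_ExtendedCase array, bits 20-23 hold the length
                           #  of the case folding when it differs from the lowercase
                           #  mapping, and bits 24 and up hold the length of the
                           #  mapping itself.)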
     492                  lower = len(extra_casing) | (len(sc[0]) << 24)
     493                  extra_casing.extend(sc[0])
     494                  if cf != sc[0]:
     495                      lower |= len(cf) << 20
     496                      extra_casing.extend(cf)
     497                  upper = len(extra_casing) | (len(sc[2]) << 24)
     498                  extra_casing.extend(sc[2])
     499                  # Title is probably equal to upper.
     500                  if sc[1] == sc[2]:
     501                      title = upper
     502                  else:
     503                      title = len(extra_casing) | (len(sc[1]) << 24)
     504                      extra_casing.extend(sc[1])
     505              # decimal digit, integer digit
     506              decimal = 0
     507              if record.decomposition_mapping:
     508                  flags |= DECIMAL_MASK
     509                  decimal = int(record.decomposition_mapping)
     510              digit = 0
     511              if record.numeric_type:
     512                  flags |= DIGIT_MASK
     513                  digit = int(record.numeric_type)
     514              if record.numeric_value:
     515                  flags |= NUMERIC_MASK
     516                  numeric.setdefault(record.numeric_value, []).append(char)
     517              item = (
     518                  upper, lower, title, decimal, digit, flags
     519                  )
     520              # add entry to index and item tables
     521              i = cache.get(item)
     522              if i is None:
     523                  cache[item] = i = len(table)
     524                  table.append(item)
     525              index[char] = i
     526  
     527      print(len(table), "unique character type entries")
     528      print(sum(map(len, numeric.values())), "numeric code points")
     529      print(len(spaces), "whitespace code points")
     530      print(len(linebreaks), "linebreak code points")
     531      print(len(extra_casing), "extended case array")
     532  
     533      print("--- Writing", FILE, "...")
     534  
     535      with open(FILE, "w") as fp:
     536          fprint = partial(print, file=fp)
     537  
     538          fprint("/* this file was generated by %s %s */" % (SCRIPT, VERSION))
     539          fprint()
     540          fprint("/* a list of unique character type descriptors */")
     541          fprint("const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {")
     542          for item in table:
     543              fprint("    {%d, %d, %d, %d, %d, %d}," % item)
     544          fprint("};")
     545          fprint()
     546  
     547          fprint("/* extended case mappings */")
     548          fprint()
     549          fprint("const Py_UCS4 _PyUnicode_ExtendedCase[] = {")
     550          for c in extra_casing:
     551              fprint("    %d," % c)
     552          fprint("};")
     553          fprint()
     554  
      555          # split the type record index table
     556          index1, index2, shift = splitbins(index, trace)
     557  
     558          fprint("/* type indexes */")
     559          fprint("#define SHIFT", shift)
     560          Array("index1", index1).dump(fp, trace)
     561          Array("index2", index2).dump(fp, trace)
     562  
     563          # Generate code for _PyUnicode_ToNumeric()
     564          numeric_items = sorted(numeric.items())
     565          fprint('/* Returns the numeric value as double for Unicode characters')
     566          fprint(' * having this property, -1.0 otherwise.')
     567          fprint(' */')
     568          fprint('double _PyUnicode_ToNumeric(Py_UCS4 ch)')
     569          fprint('{')
     570          fprint('    switch (ch) {')
     571          for value, codepoints in numeric_items:
     572              # Turn text into float literals
     573              parts = value.split('/')
     574              parts = [repr(float(part)) for part in parts]
     575              value = '/'.join(parts)
     576  
     577              codepoints.sort()
     578              for codepoint in codepoints:
     579                  fprint('    case 0x%04X:' % (codepoint,))
     580              fprint('        return (double) %s;' % (value,))
     581          fprint('    }')
     582          fprint('    return -1.0;')
     583          fprint('}')
     584          fprint()
     585  
     586          # Generate code for _PyUnicode_IsWhitespace()
     587          fprint("/* Returns 1 for Unicode characters having the bidirectional")
     588          fprint(" * type 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise.")
     589          fprint(" */")
     590          fprint('int _PyUnicode_IsWhitespace(const Py_UCS4 ch)')
     591          fprint('{')
     592          fprint('    switch (ch) {')
     593  
     594          for codepoint in sorted(spaces):
     595              fprint('    case 0x%04X:' % (codepoint,))
     596          fprint('        return 1;')
     597  
     598          fprint('    }')
     599          fprint('    return 0;')
     600          fprint('}')
     601          fprint()
     602  
     603          # Generate code for _PyUnicode_IsLinebreak()
     604          fprint("/* Returns 1 for Unicode characters having the line break")
     605          fprint(" * property 'BK', 'CR', 'LF' or 'NL' or having bidirectional")
     606          fprint(" * type 'B', 0 otherwise.")
     607          fprint(" */")
     608          fprint('int _PyUnicode_IsLinebreak(const Py_UCS4 ch)')
     609          fprint('{')
     610          fprint('    switch (ch) {')
     611          for codepoint in sorted(linebreaks):
     612              fprint('    case 0x%04X:' % (codepoint,))
     613          fprint('        return 1;')
     614  
     615          fprint('    }')
     616          fprint('    return 0;')
     617          fprint('}')
     618          fprint()
     619  
     620  
     621  # --------------------------------------------------------------------
     622  # unicode name database
     623  
     624  def makeunicodename(unicode, trace):
     625  
     626      FILE = "Modules/unicodename_db.h"
     627  
     628      print("--- Preparing", FILE, "...")
     629  
     630      # collect names
     631      names = [None] * len(unicode.chars)
     632  
     633      for char in unicode.chars:
     634          record = unicode.table[char]
     635          if record:
     636              name = record.name.strip()
     637              if name and name[0] != "<":
     638                  names[char] = name + chr(0)
     639  
     640      print(len([n for n in names if n is not None]), "distinct names")
     641  
      642      # collect unique words from names (note that we distinguish between
      643      # words inside a sentence and words ending a sentence; the
      644      # latter include the trailing null byte)
     645  
     646      words = {}
     647      n = b = 0
     648      for char in unicode.chars:
     649          name = names[char]
     650          if name:
     651              w = name.split()
     652              b = b + len(name)
     653              n = n + len(w)
     654              for w in w:
     655                  l = words.get(w)
     656                  if l:
     657                      l.append(None)
     658                  else:
     659                      words[w] = [len(words)]
     660  
     661      print(n, "words in text;", b, "bytes")
     662  
     663      wordlist = list(words.items())
     664  
     665      # sort on falling frequency, then by name
     666      def word_key(a):
     667          aword, alist = a
     668          return -len(alist), aword
     669      wordlist.sort(key=word_key)
     670  
     671      # figure out how many phrasebook escapes we need
     672      escapes = 0
     673      while escapes * 256 < len(wordlist):
     674          escapes = escapes + 1
     675      print(escapes, "escapes")
     676  
     677      short = 256 - escapes
     678  
     679      assert short > 0
     680  
     681      print(short, "short indexes in lexicon")
     682  
     683      # statistics
     684      n = 0
     685      for i in range(short):
     686          n = n + len(wordlist[i][1])
     687      print(n, "short indexes in phrasebook")
     688  
     689      # pick the most commonly used words, and sort the rest on falling
     690      # length (to maximize overlap)
     691  
     692      wordlist, wordtail = wordlist[:short], wordlist[short:]
     693      wordtail.sort(key=lambda a: a[0], reverse=True)
     694      wordlist.extend(wordtail)
     695  
     696      # generate lexicon from words
     697  
     698      lexicon_offset = [0]
     699      lexicon = ""
     700      words = {}
     701  
     702      # build a lexicon string
     703      offset = 0
     704      for w, x in wordlist:
     705          # encoding: bit 7 indicates last character in word (chr(128)
     706          # indicates the last character in an entire string)
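                   # (so a reader of the lexicon recovers each character as byte & 0x7F
                   #  and detects the end of a word by testing bit 7)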
     707          ww = w[:-1] + chr(ord(w[-1])+128)
     708          # reuse string tails, when possible
     709          o = lexicon.find(ww)
     710          if o < 0:
     711              o = offset
     712              lexicon = lexicon + ww
     713              offset = offset + len(w)
     714          words[w] = len(lexicon_offset)
     715          lexicon_offset.append(o)
     716  
     717      lexicon = list(map(ord, lexicon))
     718  
     719      # generate phrasebook from names and lexicon
     720      phrasebook = [0]
     721      phrasebook_offset = [0] * len(unicode.chars)
     722      for char in unicode.chars:
     723          name = names[char]
     724          if name:
     725              w = name.split()
     726              phrasebook_offset[char] = len(phrasebook)
     727              for w in w:
     728                  i = words[w]
     729                  if i < short:
     730                      phrasebook.append(i)
     731                  else:
     732                      # store as two bytes
     733                      phrasebook.append((i>>8) + short)
     734                      phrasebook.append(i&255)
     735  
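               # Word indexes below `short` were stored above as a single byte; larger
               # indexes were stored as an escape byte (i >> 8) + short followed by the
               # low byte i & 255, so every phrasebook entry fits in an unsigned char
               # (which the assert below verifies).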
     736      assert getsize(phrasebook) == 1
     737  
     738      #
     739      # unicode name hash table
     740  
     741      # extract names
     742      data = []
     743      for char in unicode.chars:
     744          record = unicode.table[char]
     745          if record:
     746              name = record.name.strip()
     747              if name and name[0] != "<":
     748                  data.append((name, char))
     749  
     750      # the magic number 47 was chosen to minimize the number of
     751      # collisions on the current data set.  if you like, change it
     752      # and see what happens...
     753  
     754      codehash = Hash("code", data, 47)
     755  
     756      print("--- Writing", FILE, "...")
     757  
     758      with open(FILE, "w") as fp:
     759          fprint = partial(print, file=fp)
     760  
     761          fprint("/* this file was generated by %s %s */" % (SCRIPT, VERSION))
     762          fprint()
     763          fprint("#define NAME_MAXLEN", 256)
     764          fprint()
     765          fprint("/* lexicon */")
     766          Array("lexicon", lexicon).dump(fp, trace)
     767          Array("lexicon_offset", lexicon_offset).dump(fp, trace)
     768  
      769          # split the phrasebook offset table
     770          offset1, offset2, shift = splitbins(phrasebook_offset, trace)
     771  
     772          fprint("/* code->name phrasebook */")
     773          fprint("#define phrasebook_shift", shift)
     774          fprint("#define phrasebook_short", short)
     775  
     776          Array("phrasebook", phrasebook).dump(fp, trace)
     777          Array("phrasebook_offset1", offset1).dump(fp, trace)
     778          Array("phrasebook_offset2", offset2).dump(fp, trace)
     779  
     780          fprint("/* name->code dictionary */")
     781          codehash.dump(fp, trace)
     782  
     783          fprint()
     784          fprint('static const unsigned int aliases_start = %#x;' %
     785                 NAME_ALIASES_START)
     786          fprint('static const unsigned int aliases_end = %#x;' %
     787                 (NAME_ALIASES_START + len(unicode.aliases)))
     788  
     789          fprint('static const unsigned int name_aliases[] = {')
     790          for name, codepoint in unicode.aliases:
     791              fprint('    0x%04X,' % codepoint)
     792          fprint('};')
     793  
     794          # In Unicode 6.0.0, the sequences contain at most 4 BMP chars,
     795          # so we are using Py_UCS2 seq[4].  This needs to be updated if longer
     796          # sequences or sequences with non-BMP chars are added.
     797          # unicodedata_lookup should be adapted too.
     798          fprint(dedent("""
     799              typedef struct NamedSequence {
     800                  int seqlen;
     801                  Py_UCS2 seq[4];
     802              } named_sequence;
     803              """))
     804  
     805          fprint('static const unsigned int named_sequences_start = %#x;' %
     806                 NAMED_SEQUENCES_START)
     807          fprint('static const unsigned int named_sequences_end = %#x;' %
     808                 (NAMED_SEQUENCES_START + len(unicode.named_sequences)))
     809  
     810          fprint('static const named_sequence named_sequences[] = {')
     811          for name, sequence in unicode.named_sequences:
     812              seq_str = ', '.join('0x%04X' % cp for cp in sequence)
     813              fprint('    {%d, {%s}},' % (len(sequence), seq_str))
     814          fprint('};')
     815  
     816  
     817  def merge_old_version(version, new, old):
     818      # Changes to exclusion file not implemented yet
     819      if old.exclusions != new.exclusions:
     820          raise NotImplementedError("exclusions differ")
     821  
     822      # In these change records, 0xFF means "no change"
     823      bidir_changes = [0xFF]*0x110000
     824      category_changes = [0xFF]*0x110000
     825      decimal_changes = [0xFF]*0x110000
     826      mirrored_changes = [0xFF]*0x110000
     827      east_asian_width_changes = [0xFF]*0x110000
     828      # In numeric data, 0 means "no change",
      829      # -1 means "did not have a numeric value"
     830      numeric_changes = [0] * 0x110000
     831      # normalization_changes is a list of key-value pairs
     832      normalization_changes = []
     833      for i in range(0x110000):
     834          if new.table[i] is None:
     835              # Characters unassigned in the new version ought to
     836              # be unassigned in the old one
     837              assert old.table[i] is None
     838              continue
     839          # check characters unassigned in the old version
     840          if old.table[i] is None:
     841              # category 0 is "unassigned"
     842              category_changes[i] = 0
     843              continue
     844          # check characters that differ
     845          if old.table[i] != new.table[i]:
     846              for k, field in enumerate(dataclasses.fields(UcdRecord)):
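                           # k is the positional index of the UcdRecord field being
                           # compared (0 = codepoint, 1 = name, 2 = general_category, ...)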
     847                  value = getattr(old.table[i], field.name)
     848                  new_value = getattr(new.table[i], field.name)
     849                  if value != new_value:
     850                      if k == 1 and i in PUA_15:
     851                          # the name is not set in the old.table, but in the
     852                          # new.table we are using it for aliases and named seq
     853                          assert value == ''
     854                      elif k == 2:
     855                          category_changes[i] = CATEGORY_NAMES.index(value)
     856                      elif k == 4:
     857                          bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
     858                      elif k == 5:
     859                          # We assume that all normalization changes are in 1:1 mappings
     860                          assert " " not in value
     861                          normalization_changes.append((i, value))
     862                      elif k == 6:
     863                          # we only support changes where the old value is a single digit
     864                          assert value in "0123456789"
     865                          decimal_changes[i] = int(value)
     866                      elif k == 8:
      867                          # Since 0 encodes "no change", the old value had better not be 0
     868                          if not value:
     869                              numeric_changes[i] = -1
     870                          else:
     871                              numeric_changes[i] = float(value)
     872                              assert numeric_changes[i] not in (0, -1)
     873                      elif k == 9:
     874                          if value == 'Y':
     875                              mirrored_changes[i] = '1'
     876                          else:
     877                              mirrored_changes[i] = '0'
     878                      elif k == 11:
     879                          # change to ISO comment, ignore
     880                          pass
     881                      elif k == 12:
     882                          # change to simple uppercase mapping; ignore
     883                          pass
     884                      elif k == 13:
     885                          # change to simple lowercase mapping; ignore
     886                          pass
     887                      elif k == 14:
     888                          # change to simple titlecase mapping; ignore
     889                          pass
     890                      elif k == 15:
     891                          # change to east asian width
     892                          east_asian_width_changes[i] = EASTASIANWIDTH_NAMES.index(value)
     893                      elif k == 16:
     894                          # derived property changes; not yet
     895                          pass
     896                      elif k == 17:
     897                          # normalization quickchecks are not performed
     898                          # for older versions
     899                          pass
     900                      else:
      901                          class Difference(Exception):pass
     902                          raise Difference(hex(i), k, old.table[i], new.table[i])
     903      new.changed.append((version, list(zip(bidir_changes, category_changes,
     904                                            decimal_changes, mirrored_changes,
     905                                            east_asian_width_changes,
     906                                            numeric_changes)),
     907                          normalization_changes))
     908  
     909  
     910  DATA_DIR = os.path.join('Tools', 'unicode', 'data')
     911  
     912  def open_data(template, version):
     913      local = os.path.join(DATA_DIR, template % ('-'+version,))
     914      if not os.path.exists(local):
     915          import urllib.request
     916          if version == '3.2.0':
     917              # irregular url structure
     918              url = ('https://www.unicode.org/Public/3.2-Update/'+template) % ('-'+version,)
     919          else:
     920              url = ('https://www.unicode.org/Public/%s/ucd/'+template) % (version, '')
     921          os.makedirs(DATA_DIR, exist_ok=True)
     922          urllib.request.urlretrieve(url, filename=local)
     923      if local.endswith('.txt'):
     924          return open(local, encoding='utf-8')
     925      else:
     926          # Unihan.zip
     927          return open(local, 'rb')
     928  
     929  
     930  def expand_range(char_range: str) -> Iterator[int]:
     931      '''
     932      Parses ranges of code points, as described in UAX #44:
     933        https://www.unicode.org/reports/tr44/#Code_Point_Ranges
     934      '''
     935      if '..' in char_range:
     936          first, last = [int(c, 16) for c in char_range.split('..')]
     937      else:
     938          first = last = int(char_range, 16)
     939      for char in range(first, last+1):
     940          yield char
     941  
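           # For example (illustrative): list(expand_range('0041..0043')) yields
           # [0x41, 0x42, 0x43], while list(expand_range('0041')) yields just [0x41].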
     942  
      943  class UcdFile:
     944      '''
     945      A file in the standard format of the UCD.
     946  
     947      See: https://www.unicode.org/reports/tr44/#Format_Conventions
     948  
     949      Note that, as described there, the Unihan data files have their
     950      own separate format.
     951      '''
     952  
     953      def __init__(self, template: str, version: str) -> None:
     954          self.template = template
     955          self.version = version
     956  
     957      def records(self) -> Iterator[List[str]]:
     958          with open_data(self.template, self.version) as file:
     959              for line in file:
     960                  line = line.split('#', 1)[0].strip()
     961                  if not line:
     962                      continue
     963                  yield [field.strip() for field in line.split(';')]
     964  
     965      def __iter__(self) -> Iterator[List[str]]:
     966          return self.records()
     967  
     968      def expanded(self) -> Iterator[Tuple[int, List[str]]]:
     969          for record in self.records():
     970              char_range, rest = record[0], record[1:]
     971              for char in expand_range(char_range):
     972                  yield char, rest
     973  
     974  
     975  @dataclasses.dataclass
      976  class UcdRecord:
     977      # 15 fields from UnicodeData.txt .  See:
     978      #   https://www.unicode.org/reports/tr44/#UnicodeData.txt
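               # Note: the names below are positional.  Raw field 5 (the full
               # decomposition, type prefix included) lands in decomposition_type, so
               # the next three names are shifted by one relative to the raw fields:
               # decomposition_mapping holds the decimal digit value, numeric_type the
               # digit value, and numeric_value the numeric value.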
     979      codepoint: str
     980      name: str
     981      general_category: str
     982      canonical_combining_class: str
     983      bidi_class: str
     984      decomposition_type: str
     985      decomposition_mapping: str
     986      numeric_type: str
     987      numeric_value: str
     988      bidi_mirrored: str
     989      unicode_1_name: str  # obsolete
     990      iso_comment: str  # obsolete
     991      simple_uppercase_mapping: str
     992      simple_lowercase_mapping: str
     993      simple_titlecase_mapping: str
     994  
     995      # https://www.unicode.org/reports/tr44/#EastAsianWidth.txt
     996      east_asian_width: Optional[str]
     997  
     998      # Binary properties, as a set of those that are true.
     999      # Taken from multiple files:
    1000      #   https://www.unicode.org/reports/tr44/#DerivedCoreProperties.txt
    1001      #   https://www.unicode.org/reports/tr44/#LineBreak.txt
    1002      binary_properties: Set[str]
    1003  
    1004      # The Quick_Check properties related to normalization:
    1005      #   https://www.unicode.org/reports/tr44/#Decompositions_and_Normalization
    1006      # We store them as a bitmask.
    1007      quick_check: int
    1008  
    1009  
    1010  def from_row(row: List[str]) -> UcdRecord:
    1011      return UcdRecord(*row, None, set(), 0)
    1012  
    1013  
    1014  # --------------------------------------------------------------------
    1015  # the following support code is taken from the unidb utilities
    1016  # Copyright (c) 1999-2000 by Secret Labs AB
    1017  
    1018  # load a unicode-data file from disk
    1019  
     1020  class UnicodeData:
    1021      # table: List[Optional[UcdRecord]]  # index is codepoint; None means unassigned
    1022  
    1023      def __init__(self, version, cjk_check=True):
    1024          self.changed = []
    1025          table = [None] * 0x110000
    1026          for s in UcdFile(UNICODE_DATA, version):
    1027              char = int(s[0], 16)
    1028              table[char] = from_row(s)
    1029  
    1030          cjk_ranges_found = []
    1031  
    1032          # expand first-last ranges
    1033          field = None
    1034          for i in range(0, 0x110000):
    1035              # The file UnicodeData.txt has its own distinct way of
    1036              # expressing ranges.  See:
    1037              #   https://www.unicode.org/reports/tr44/#Code_Point_Ranges
    1038              s = table[i]
    1039              if s:
    1040                  if s.name[-6:] == "First>":
    1041                      s.name = ""
    1042                      field = dataclasses.astuple(s)[:15]
    1043                  elif s.name[-5:] == "Last>":
    1044                      if s.name.startswith("<CJK Ideograph"):
    1045                          cjk_ranges_found.append((field[0],
    1046                                                   s.codepoint))
    1047                      s.name = ""
    1048                      field = None
    1049              elif field:
    1050                  table[i] = from_row(('%X' % i,) + field[1:])
    1051          if cjk_check and cjk_ranges != cjk_ranges_found:
    1052              raise ValueError("CJK ranges deviate: have %r" % cjk_ranges_found)
    1053  
    1054          # public attributes
    1055          self.filename = UNICODE_DATA % ''
    1056          self.table = table
    1057          self.chars = list(range(0x110000)) # unicode 3.2
    1058  
    1059          # check for name aliases and named sequences, see #12753
    1060          # aliases and named sequences are not in 3.2.0
    1061          if version != '3.2.0':
    1062              self.aliases = []
    1063              # store aliases in the Private Use Area 15, in range U+F0000..U+F00FF,
    1064              # in order to take advantage of the compression and lookup
    1065              # algorithms used for the other characters
    1066              pua_index = NAME_ALIASES_START
    1067              for char, name, abbrev in UcdFile(NAME_ALIASES, version):
    1068                  char = int(char, 16)
    1069                  self.aliases.append((name, char))
     1070                  # also store the name in PUA 15
    1071                  self.table[pua_index].name = name
    1072                  pua_index += 1
    1073              assert pua_index - NAME_ALIASES_START == len(self.aliases)
    1074  
    1075              self.named_sequences = []
     1076              # store named sequences in PUA 15, in range U+F0100..,
    1077              # in order to take advantage of the compression and lookup
    1078              # algorithms used for the other characters.
    1079  
    1080              assert pua_index < NAMED_SEQUENCES_START
    1081              pua_index = NAMED_SEQUENCES_START
    1082              for name, chars in UcdFile(NAMED_SEQUENCES, version):
    1083                  chars = tuple(int(char, 16) for char in chars.split())
    1084                  # check that the structure defined in makeunicodename is OK
    1085                  assert 2 <= len(chars) <= 4, "change the Py_UCS2 array size"
    1086                  assert all(c <= 0xFFFF for c in chars), ("use Py_UCS4 in "
    1087                      "the NamedSequence struct and in unicodedata_lookup")
    1088                  self.named_sequences.append((name, chars))
     1089                  # also store these in PUA 15
    1090                  self.table[pua_index].name = name
    1091                  pua_index += 1
    1092              assert pua_index - NAMED_SEQUENCES_START == len(self.named_sequences)
    1093  
    1094          self.exclusions = {}
    1095          for char, in UcdFile(COMPOSITION_EXCLUSIONS, version):
    1096              char = int(char, 16)
    1097              self.exclusions[char] = 1
    1098  
    1099          widths = [None] * 0x110000
    1100          for char, (width,) in UcdFile(EASTASIAN_WIDTH, version).expanded():
    1101              widths[char] = width
    1102  
    1103          for i in range(0, 0x110000):
    1104              if table[i] is not None:
    1105                  table[i].east_asian_width = widths[i]
    1106          self.widths = widths
    1107  
    1108          for char, (p,) in UcdFile(DERIVED_CORE_PROPERTIES, version).expanded():
    1109              if table[char]:
    1110                  # Some properties (e.g. Default_Ignorable_Code_Point)
    1111                  # apply to unassigned code points; ignore them
    1112                  table[char].binary_properties.add(p)
    1113  
    1114          for char_range, value in UcdFile(LINE_BREAK, version):
    1115              if value not in MANDATORY_LINE_BREAKS:
    1116                  continue
    1117              for char in expand_range(char_range):
    1118                  table[char].binary_properties.add('Line_Break')
    1119  
    1120          # We only want the quickcheck properties
    1121          # Format: NF?_QC; Y(es)/N(o)/M(aybe)
    1122          # Yes is the default, hence only N and M occur
    1123          # In 3.2.0, the format was different (NF?_NO)
     1124          # The parsing below will incorrectly determine these as
     1125          # "yes"; however, unicodedata.c does not perform quickchecks
     1126          # for older versions, and no delta records will be created.
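                   # Bit layout of the resulting mask: two bits per property, in the
                   # order NFD_QC, NFKD_QC, NFC_QC, NFKC_QC (0 = Yes, 1 = Maybe, 2 = No).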
    1127          quickchecks = [0] * 0x110000
    1128          qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
    1129          for s in UcdFile(DERIVEDNORMALIZATION_PROPS, version):
    1130              if len(s) < 2 or s[1] not in qc_order:
    1131                  continue
    1132              quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
    1133              quickcheck_shift = qc_order.index(s[1])*2
    1134              quickcheck <<= quickcheck_shift
    1135              for char in expand_range(s[0]):
    1136                  assert not (quickchecks[char]>>quickcheck_shift)&3
    1137                  quickchecks[char] |= quickcheck
    1138          for i in range(0, 0x110000):
    1139              if table[i] is not None:
    1140                  table[i].quick_check = quickchecks[i]
    1141  
    1142          with open_data(UNIHAN, version) as file:
    1143              zip = zipfile.ZipFile(file)
    1144              if version == '3.2.0':
    1145                  data = zip.open('Unihan-3.2.0.txt').read()
    1146              else:
    1147                  data = zip.open('Unihan_NumericValues.txt').read()
    1148          for line in data.decode("utf-8").splitlines():
    1149              if not line.startswith('U+'):
    1150                  continue
    1151              code, tag, value = line.split(None, 3)[:3]
    1152              if tag not in ('kAccountingNumeric', 'kPrimaryNumeric',
    1153                             'kOtherNumeric'):
    1154                  continue
    1155              value = value.strip().replace(',', '')
    1156              i = int(code[2:], 16)
    1157              # Patch the numeric field
    1158              if table[i] is not None:
    1159                  table[i].numeric_value = value
    1160  
    1161          sc = self.special_casing = {}
    1162          for data in UcdFile(SPECIAL_CASING, version):
    1163              if data[4]:
    1164                  # We ignore all conditionals (since they depend on
    1165                  # languages) except for one, which is hardcoded. See
    1166                  # handle_capital_sigma in unicodeobject.c.
    1167                  continue
    1168              c = int(data[0], 16)
    1169              lower = [int(char, 16) for char in data[1].split()]
    1170              title = [int(char, 16) for char in data[2].split()]
    1171              upper = [int(char, 16) for char in data[3].split()]
    1172              sc[c] = (lower, title, upper)
    1173  
    1174          cf = self.case_folding = {}
    1175          if version != '3.2.0':
    1176              for data in UcdFile(CASE_FOLDING, version):
    1177                  if data[1] in "CF":
    1178                      c = int(data[0], 16)
    1179                      cf[c] = [int(char, 16) for char in data[2].split()]
    1180  
    1181      def uselatin1(self):
    1182          # restrict character range to ISO Latin 1
    1183          self.chars = list(range(256))
    1184  
    1185  
    1186  # hash table tools
    1187  
     1188  # this is a straightforward reimplementation of Python's built-in
    1189  # dictionary type, using a static data structure, and a custom string
    1190  # hash algorithm.
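           # Lookup sketch (illustrative): a consumer of the generated table probes the
           # same sequence of slots that Hash.__init__ below uses when inserting:
           #     h = myhash(name, magic)
           #     slot = (~h) & mask
           #     step = ((h ^ (h >> 3)) & mask) or mask
           #     # on a miss: slot = (slot + step) & mask; step <<= 1;
           #     #            if step > mask: step ^= poly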
    1191  
    1192  def myhash(s, magic):
    1193      h = 0
    1194      for c in map(ord, s.upper()):
    1195          h = (h * magic) + c
    1196          ix = h & 0xff000000
    1197          if ix:
    1198              h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff
    1199      return h
    1200  
    1201  
    1202  SIZES = [
    1203      (4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
    1204      (1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
    1205      (65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
    1206      (2097152,5), (4194304,3), (8388608,33), (16777216,27)
    1207  ]
    1208  
    1209  
     1210  class Hash:
    1211      def __init__(self, name, data, magic):
    1212          # turn a (key, value) list into a static hash table structure
    1213  
    1214          # determine table size
    1215          for size, poly in SIZES:
    1216              if size > len(data):
    1217                  poly = size + poly
    1218                  break
    1219          else:
    1220              raise AssertionError("ran out of polynomials")
    1221  
    1222          print(size, "slots in hash table")
    1223  
    1224          table = [None] * size
    1225  
    1226          mask = size-1
    1227  
    1228          n = 0
    1229  
    1230          hash = myhash
    1231  
    1232          # initialize hash table
    1233          for key, value in data:
    1234              h = hash(key, magic)
    1235              i = (~h) & mask
    1236              v = table[i]
    1237              if v is None:
    1238                  table[i] = value
    1239                  continue
    1240              incr = (h ^ (h >> 3)) & mask
    1241              if not incr:
    1242                  incr = mask
    1243              while 1:
    1244                  n = n + 1
    1245                  i = (i + incr) & mask
    1246                  v = table[i]
    1247                  if v is None:
    1248                      table[i] = value
    1249                      break
    1250                  incr = incr << 1
    1251                  if incr > mask:
    1252                      incr = incr ^ poly
    1253  
    1254          print(n, "collisions")
    1255          self.collisions = n
    1256  
    1257          for i in range(len(table)):
    1258              if table[i] is None:
    1259                  table[i] = 0
    1260  
    1261          self.data = Array(name + "_hash", table)
    1262          self.magic = magic
    1263          self.name = name
    1264          self.size = size
    1265          self.poly = poly
    1266  
    1267      def dump(self, file, trace):
    1268          # write data to file, as a C array
    1269          self.data.dump(file, trace)
    1270          file.write("#define %s_magic %d\n" % (self.name, self.magic))
    1271          file.write("#define %s_size %d\n" % (self.name, self.size))
    1272          file.write("#define %s_poly %d\n" % (self.name, self.poly))
    1273  
    1274  
    1275  # stuff to deal with arrays of unsigned integers
    1276  
     1277  class Array:
    1278  
    1279      def __init__(self, name, data):
    1280          self.name = name
    1281          self.data = data
    1282  
    1283      def dump(self, file, trace=0):
    1284          # write data to file, as a C array
    1285          size = getsize(self.data)
    1286          if trace:
    1287              print(self.name+":", size*len(self.data), "bytes", file=sys.stderr)
    1288          file.write("static const ")
    1289          if size == 1:
    1290              file.write("unsigned char")
    1291          elif size == 2:
    1292              file.write("unsigned short")
    1293          else:
    1294              file.write("unsigned int")
    1295          file.write(" " + self.name + "[] = {\n")
    1296          if self.data:
    1297              s = "    "
    1298              for item in self.data:
    1299                  i = str(item) + ", "
    1300                  if len(s) + len(i) > 78:
    1301                      file.write(s.rstrip() + "\n")
    1302                      s = "    " + i
    1303                  else:
    1304                      s = s + i
    1305              if s.strip():
    1306                  file.write(s.rstrip() + "\n")
    1307          file.write("};\n\n")
    1308  
    1309  
    1310  def getsize(data):
    1311      # return smallest possible integer size for the given array
    1312      maxdata = max(data)
    1313      if maxdata < 256:
    1314          return 1
    1315      elif maxdata < 65536:
    1316          return 2
    1317      else:
    1318          return 4
    1319  
    1320  
    1321  def splitbins(t, trace=0):
    1322      """t, trace=0 -> (t1, t2, shift).  Split a table to save space.
    1323  
    1324      t is a sequence of ints.  This function can be useful to save space if
    1325      many of the ints are the same.  t1 and t2 are lists of ints, and shift
    1326      is an int, chosen to minimize the combined size of t1 and t2 (in C
    1327      code), and where for each i in range(len(t)),
    1328          t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    1329      where mask is a bitmask isolating the last "shift" bits.
    1330  
    1331      If optional arg trace is non-zero (default zero), progress info
    1332      is printed to sys.stderr.  The higher the value, the more info
    1333      you'll get.
    1334      """
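               # Usage sketch (illustrative): for a non-empty list of non-negative ints
               # `data`,
               #     t1, t2, shift = splitbins(data)
               #     mask = (1 << shift) - 1
               #     assert all(data[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
               #                for i in range(len(data)))
               # holds; the same check is performed exhaustively below under __debug__.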
    1335  
    1336      if trace:
    1337          def dump(t1, t2, shift, bytes):
    1338              print("%d+%d bins at shift %d; %d bytes" % (
    1339                  len(t1), len(t2), shift, bytes), file=sys.stderr)
    1340          print("Size of original table:", len(t)*getsize(t), "bytes",
    1341                file=sys.stderr)
    1342      n = len(t)-1    # last valid index
    1343      maxshift = 0    # the most we can shift n and still have something left
    1344      if n > 0:
    1345          while n >> 1:
    1346              n >>= 1
    1347              maxshift += 1
    1348      del n
    1349      bytes = sys.maxsize  # smallest total size so far
    1350      t = tuple(t)    # so slices can be dict keys
    1351      for shift in range(maxshift + 1):
    1352          t1 = []
    1353          t2 = []
    1354          size = 2**shift
    1355          bincache = {}
    1356          for i in range(0, len(t), size):
    1357              bin = t[i:i+size]
    1358              index = bincache.get(bin)
    1359              if index is None:
    1360                  index = len(t2)
    1361                  bincache[bin] = index
    1362                  t2.extend(bin)
    1363              t1.append(index >> shift)
    1364          # determine memory size
    1365          b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
    1366          if trace > 1:
    1367              dump(t1, t2, shift, b)
    1368          if b < bytes:
    1369              best = t1, t2, shift
    1370              bytes = b
    1371      t1, t2, shift = best
    1372      if trace:
    1373          print("Best:", end=' ', file=sys.stderr)
    1374          dump(t1, t2, shift, bytes)
    1375      if __debug__:
    1376          # exhaustively verify that the decomposition is correct
    1377          mask = ~((~0) << shift) # i.e., low-bit mask of shift bits
    1378          for i in range(len(t)):
    1379              assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    1380      return best
    1381  
    1382  
    1383  if __name__ == "__main__":
    1384      maketables(1)