1  import re
       2  from unicodedata import ucd_3_2_0 as unicodedata
       3  
       4  def gen_category(cats):
       5      for i in range(0, 0x110000):
       6          if unicodedata.category(chr(i)) in cats:
       7              yield(i)
       8  
       9  def gen_bidirectional(cats):
      10      for i in range(0, 0x110000):
      11          if unicodedata.bidirectional(chr(i)) in cats:
      12              yield(i)
      13  
      14  def compact_set(l):
      15      single = []
      16      tuple = []
      17      prev = None
      18      span = 0
      19      for e in l:
      20          if prev is None:
      21              prev = e
      22              span = 0
      23              continue
      24          if prev+span+1 != e:
      25              if span > 2:
      26                  tuple.append((prev,prev+span+1))
      27              else:
      28                  for i in range(prev, prev+span+1):
      29                      single.append(i)
      30              prev = e
      31              span = 0
      32          else:
      33              span += 1
      34      if span:
      35          tuple.append((prev,prev+span+1))
      36      else:
      37          single.append(prev)
      38      if not single and len(tuple) == 1:
      39          tuple = "range(%d,%d)" % tuple[0]
      40      else:
      41          tuple = " + ".join("list(range(%d,%d))" % t for t in tuple)
      42      if not single:
      43          return "set(%s)" % tuple
      44      if not tuple:
      45          return "set(%r)" % (single,)
      46      return "set(%r + %s)" % (single, tuple)
      47  
      48  ############## Read the tables in the RFC #######################
      49  
      50  with open("rfc3454.txt") as f:
      51      data = f.readlines()
      52  
      53  tables = []
      54  curname = None
      55  for l in data:
      56      l = l.strip()
      57      if not l:
      58          continue
      59      # Skip RFC page breaks
      60      if l.startswith(("Hoffman & Blanchet", "RFC 3454")):
      61          continue
      62      # Find start/end lines
      63      m = re.match("----- (Start|End) Table ([A-Z](.[0-9])+) -----", l)
      64      if m:
      65          if m.group(1) == "Start":
      66              if curname:
      67                  raise RuntimeError("Double Start", (curname, l))
      68              curname = m.group(2)
      69              table = {}
      70              tables.append((curname, table))
      71              continue
      72          else:
      73              if not curname:
      74                  raise RuntimeError("End without start", l)
      75              if curname != m.group(2):
      76                  raise RuntimeError("Unexpected end", l)
      77              curname = None
      78              continue
      79      if not curname:
      80          continue
      81      # Now we are in a table
      82      fields = l.split(";")
      83      if len(fields) > 1:
      84          # Drop comment field
      85          fields = fields[:-1]
      86      if len(fields) == 1:
      87          fields = fields[0].split("-")
      88          if len(fields) > 1:
      89              # range
      90              try:
      91                  start, end = fields
      92              except ValueError:
      93                  raise RuntimeError("Unpacking problem", l)
      94          else:
      95              start = end = fields[0]
      96          start = int(start, 16)
      97          end = int(end, 16)
      98          for i in range(start, end+1):
      99              table[i] = i
     100      else:
     101          code, value = fields
     102          value = value.strip()
     103          if value:
     104              value = [int(v, 16) for v in value.split(" ")]
     105          else:
     106              # table B.1
     107              value = None
     108          table[int(code, 16)] = value
     109  
     110  ########### Generate compact Python versions of the tables #############
     111  
     112  print("""# This file is generated by mkstringprep.py. DO NOT EDIT.
     113  \"\"\"Library that exposes various tables found in the StringPrep RFC 3454.
     114  
     115  There are two kinds of tables: sets, for which a member test is provided,
     116  and mappings, for which a mapping function is provided.
     117  \"\"\"
     118  
     119  from unicodedata import ucd_3_2_0 as unicodedata
     120  """)
     121  
     122  print("assert unicodedata.unidata_version == %r" % (unicodedata.unidata_version,))
     123  
     124  # A.1 is the table of unassigned characters
     125  # XXX Plane 15 PUA is listed as unassigned in Python.
     126  name, table = tables[0]
     127  del tables[0]
     128  assert name == "A.1"
     129  table = set(table.keys())
     130  Cn = set(gen_category(["Cn"]))
     131  
     132  # FDD0..FDEF are process internal codes
     133  Cn -= set(range(0xFDD0, 0xFDF0))
     134  # not a character
     135  Cn -= set(range(0xFFFE, 0x110000, 0x10000))
     136  Cn -= set(range(0xFFFF, 0x110000, 0x10000))
     137  
     138  # assert table == Cn
     139  
     140  print("""
     141  def in_table_a1(code):
     142      if unicodedata.category(code) != 'Cn': return False
     143      c = ord(code)
     144      if 0xFDD0 <= c < 0xFDF0: return False
     145      return (c & 0xFFFF) not in (0xFFFE, 0xFFFF)
     146  """)
     147  
     148  # B.1 cannot easily be derived
     149  name, table = tables[0]
     150  del tables[0]
     151  assert name == "B.1"
     152  table = sorted(table.keys())
     153  print("""
     154  b1_set = """ + compact_set(table) + """
     155  def in_table_b1(code):
     156      return ord(code) in b1_set
     157  """)
     158  
     159  # B.2 and B.3 is case folding.
     160  # It takes CaseFolding.txt into account, which is
     161  # not available in the Python database. Since
     162  # B.2 is derived from B.3, we process B.3 first.
     163  # B.3 supposedly *is* CaseFolding-3.2.0.txt.
     164  
     165  name, table_b2 = tables[0]
     166  del tables[0]
     167  assert name == "B.2"
     168  
     169  name, table_b3 = tables[0]
     170  del tables[0]
     171  assert name == "B.3"
     172  
     173  # B.3 is mostly Python's .lower, except for a number
     174  # of special cases, e.g. considering canonical forms.
     175  
     176  b3_exceptions = {}
     177  
     178  for k,v in table_b2.items():
     179      if list(map(ord, chr(k).lower())) != v:
     180          b3_exceptions[k] = "".join(map(chr,v))
     181  
     182  b3 = sorted(b3_exceptions.items())
     183  
     184  print("""
     185  b3_exceptions = {""")
     186  for i, kv in enumerate(b3):
     187      print("0x%x:%a," % kv, end=' ')
     188      if i % 4 == 3:
     189          print()
     190  print("}")
     191  
     192  print("""
     193  def map_table_b3(code):
     194      r = b3_exceptions.get(ord(code))
     195      if r is not None: return r
     196      return code.lower()
     197  """)
     198  
     199  def map_table_b3(code):
     200      r = b3_exceptions.get(ord(code))
     201      if r is not None: return r
     202      return code.lower()
     203  
     204  # B.2 is case folding for NFKC. This is the same as B.3,
     205  # except where NormalizeWithKC(Fold(a)) !=
     206  # NormalizeWithKC(Fold(NormalizeWithKC(Fold(a))))
     207  
     208  def map_table_b2(a):
     209      al = map_table_b3(a)
     210      b = unicodedata.normalize("NFKC", al)
     211      bl = "".join([map_table_b3(ch) for ch in b])
     212      c = unicodedata.normalize("NFKC", bl)
     213      if b != c:
     214          return c
     215      else:
     216          return al
     217  
     218  specials = {}
     219  for k,v in table_b2.items():
     220      if list(map(ord, map_table_b2(chr(k)))) != v:
     221          specials[k] = v
     222  
     223  # B.3 should not add any additional special cases
     224  assert specials == {}
     225  
     226  print("""
     227  def map_table_b2(a):
     228      al = map_table_b3(a)
     229      b = unicodedata.normalize("NFKC", al)
     230      bl = "".join([map_table_b3(ch) for ch in b])
     231      c = unicodedata.normalize("NFKC", bl)
     232      if b != c:
     233          return c
     234      else:
     235          return al
     236  """)
     237  
     238  # C.1.1 is a table with a single character
     239  name, table = tables[0]
     240  del tables[0]
     241  assert name == "C.1.1"
     242  assert table == {0x20:0x20}
     243  
     244  print("""
     245  def in_table_c11(code):
     246      return code == " "
     247  """)
     248  
     249  # C.1.2 is the rest of all space characters
     250  name, table = tables[0]
     251  del tables[0]
     252  assert name == "C.1.2"
     253  
     254  # table = set(table.keys())
     255  # Zs = set(gen_category(["Zs"])) - {0x20}
     256  # assert Zs == table
     257  
     258  print("""
     259  def in_table_c12(code):
     260      return unicodedata.category(code) == "Zs" and code != " "
     261  
     262  def in_table_c11_c12(code):
     263      return unicodedata.category(code) == "Zs"
     264  """)
     265  
     266  # C.2.1 ASCII control characters
     267  name, table_c21 = tables[0]
     268  del tables[0]
     269  assert name == "C.2.1"
     270  
     271  Cc = set(gen_category(["Cc"]))
     272  Cc_ascii = Cc & set(range(128))
     273  table_c21 = set(table_c21.keys())
     274  assert Cc_ascii == table_c21
     275  
     276  print("""
     277  def in_table_c21(code):
     278      return ord(code) < 128 and unicodedata.category(code) == "Cc"
     279  """)
     280  
     281  # C.2.2 Non-ASCII control characters. It also includes
     282  # a number of characters in category Cf.
     283  name, table_c22 = tables[0]
     284  del tables[0]
     285  assert name == "C.2.2"
     286  
     287  Cc_nonascii = Cc - Cc_ascii
     288  table_c22 = set(table_c22.keys())
     289  assert len(Cc_nonascii - table_c22) == 0
     290  
     291  specials = list(table_c22 - Cc_nonascii)
     292  specials.sort()
     293  
     294  print("""c22_specials = """ + compact_set(specials) + """
     295  def in_table_c22(code):
     296      c = ord(code)
     297      if c < 128: return False
     298      if unicodedata.category(code) == "Cc": return True
     299      return c in c22_specials
     300  
     301  def in_table_c21_c22(code):
     302      return unicodedata.category(code) == "Cc" or \\
     303             ord(code) in c22_specials
     304  """)
     305  
     306  # C.3 Private use
     307  name, table = tables[0]
     308  del tables[0]
     309  assert name == "C.3"
     310  
     311  Co = set(gen_category(["Co"]))
     312  assert set(table.keys()) == Co
     313  
     314  print("""
     315  def in_table_c3(code):
     316      return unicodedata.category(code) == "Co"
     317  """)
     318  
     319  # C.4 Non-character code points, xFFFE, xFFFF
     320  # plus process internal codes
     321  name, table = tables[0]
     322  del tables[0]
     323  assert name == "C.4"
     324  
     325  nonchar = set(range(0xFDD0,0xFDF0))
     326  nonchar.update(range(0xFFFE,0x110000,0x10000))
     327  nonchar.update(range(0xFFFF,0x110000,0x10000))
     328  table = set(table.keys())
     329  assert table == nonchar
     330  
     331  print("""
     332  def in_table_c4(code):
     333      c = ord(code)
     334      if c < 0xFDD0: return False
     335      if c < 0xFDF0: return True
     336      return (ord(code) & 0xFFFF) in (0xFFFE, 0xFFFF)
     337  """)
     338  
     339  # C.5 Surrogate codes
     340  name, table = tables[0]
     341  del tables[0]
     342  assert name == "C.5"
     343  
     344  Cs = set(gen_category(["Cs"]))
     345  assert set(table.keys()) == Cs
     346  
     347  print("""
     348  def in_table_c5(code):
     349      return unicodedata.category(code) == "Cs"
     350  """)
     351  
     352  # C.6 Inappropriate for plain text
     353  name, table = tables[0]
     354  del tables[0]
     355  assert name == "C.6"
     356  
     357  table = sorted(table.keys())
     358  
     359  print("""
     360  c6_set = """ + compact_set(table) + """
     361  def in_table_c6(code):
     362      return ord(code) in c6_set
     363  """)
     364  
     365  # C.7 Inappropriate for canonical representation
     366  name, table = tables[0]
     367  del tables[0]
     368  assert name == "C.7"
     369  
     370  table = sorted(table.keys())
     371  
     372  print("""
     373  c7_set = """ + compact_set(table) + """
     374  def in_table_c7(code):
     375      return ord(code) in c7_set
     376  """)
     377  
     378  # C.8 Change display properties or are deprecated
     379  name, table = tables[0]
     380  del tables[0]
     381  assert name == "C.8"
     382  
     383  table = sorted(table.keys())
     384  
     385  print("""
     386  c8_set = """ + compact_set(table) + """
     387  def in_table_c8(code):
     388      return ord(code) in c8_set
     389  """)
     390  
     391  # C.9 Tagging characters
     392  name, table = tables[0]
     393  del tables[0]
     394  assert name == "C.9"
     395  
     396  table = sorted(table.keys())
     397  
     398  print("""
     399  c9_set = """ + compact_set(table) + """
     400  def in_table_c9(code):
     401      return ord(code) in c9_set
     402  """)
     403  
     404  # D.1 Characters with bidirectional property "R" or "AL"
     405  name, table = tables[0]
     406  del tables[0]
     407  assert name == "D.1"
     408  
     409  RandAL = set(gen_bidirectional(["R","AL"]))
     410  assert set(table.keys()) == RandAL
     411  
     412  print("""
     413  def in_table_d1(code):
     414      return unicodedata.bidirectional(code) in ("R","AL")
     415  """)
     416  
     417  # D.2 Characters with bidirectional property "L"
     418  name, table = tables[0]
     419  del tables[0]
     420  assert name == "D.2"
     421  
     422  L = set(gen_bidirectional(["L"]))
     423  assert set(table.keys()) == L
     424  
     425  print("""
     426  def in_table_d2(code):
     427      return unicodedata.bidirectional(code) == "L"
     428  """)