(root)/
Python-3.12.0/
Tools/
unicode/
gencodec.py
       1  """ Unicode Mapping Parser and Codec Generator.
       2  
       3  This script parses Unicode mapping files as available from the Unicode
       4  site (ftp://ftp.unicode.org/Public/MAPPINGS/) and creates Python codec
       5  modules from them. The codecs use the standard character mapping codec
       6  to actually apply the mapping.
       7  
       8  Synopsis: gencodec.py dir codec_prefix
       9  
      10  All files in dir are scanned and those producing non-empty mappings
      11  will be written to <codec_prefix><mapname>.py with <mapname> being the
      12  first part of the map's filename ('a' in a.b.c.txt) converted to
      13  lowercase with hyphens replaced by underscores.
      14  
      15  The tool also writes marshalled versions of the mapping tables to the
      16  same location (with .mapping extension).
      17  
      18  Written by Marc-Andre Lemburg (mal@lemburg.com).
      19  
      20  (c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
      21  (c) Copyright Guido van Rossum, 2000.
      22  
      23  Table generation:
      24  (c) Copyright Marc-Andre Lemburg, 2005.
      25      Licensed to PSF under a Contributor Agreement.
      26  
      27  """#"
      28  
      29  import re, os, marshal, codecs
      30  
      31  # Maximum allowed size of charmap tables
      32  MAX_TABLE_SIZE = 8192
      33  
      34  # Standard undefined Unicode code point
      35  UNI_UNDEFINED = chr(0xFFFE)
      36  
      37  # Placeholder for a missing code point
      38  MISSING_CODE = -1
      39  
      40  mapRE = re.compile(r'((?:0x[0-9a-fA-F]+\+?)+)'
      41                     r'\s+'
      42                     r'((?:(?:0x[0-9a-fA-Z]+|<[A-Za-z]+>)\+?)*)'
      43                     r'\s*'
      44                     r'(#.+)?')
      45  
      46  def parsecodes(codes, len=len, range=range):
      47  
      48      """ Converts code combinations to either a single code integer
      49          or a tuple of integers.
      50  
      51          meta-codes (in angular brackets, e.g. <LR> and <RL>) are
      52          ignored.
      53  
      54          Empty codes or illegal ones are returned as None.
      55  
      56      """
      57      if not codes:
      58          return MISSING_CODE
      59      l = codes.split('+')
      60      if len(l) == 1:
      61          return int(l[0],16)
      62      for i in range(len(l)):
      63          try:
      64              l[i] = int(l[i],16)
      65          except ValueError:
      66              l[i] = MISSING_CODE
      67      l = [x for x in l if x != MISSING_CODE]
      68      if len(l) == 1:
      69          return l[0]
      70      else:
      71          return tuple(l)
      72  
      73  def readmap(filename):
      74  
      75      with open(filename) as f:
      76          lines = f.readlines()
      77      enc2uni = {}
      78      identity = []
      79      unmapped = list(range(256))
      80  
      81      # UTC mapping tables per convention don't include the identity
      82      # mappings for code points 0x00 - 0x1F and 0x7F, unless these are
      83      # explicitly mapped to different characters or undefined
      84      for i in list(range(32)) + [127]:
      85          identity.append(i)
      86          unmapped.remove(i)
      87          enc2uni[i] = (i, 'CONTROL CHARACTER')
      88  
      89      for line in lines:
      90          line = line.strip()
      91          if not line or line[0] == '#':
      92              continue
      93          m = mapRE.match(line)
      94          if not m:
      95              #print '* not matched: %s' % repr(line)
      96              continue
      97          enc,uni,comment = m.groups()
      98          enc = parsecodes(enc)
      99          uni = parsecodes(uni)
     100          if comment is None:
     101              comment = ''
     102          else:
     103              comment = comment[1:].strip()
     104          if not isinstance(enc, tuple) and enc < 256:
     105              if enc in unmapped:
     106                  unmapped.remove(enc)
     107              if enc == uni:
     108                  identity.append(enc)
     109              enc2uni[enc] = (uni,comment)
     110          else:
     111              enc2uni[enc] = (uni,comment)
     112  
     113      # If there are more identity-mapped entries than unmapped entries,
     114      # it pays to generate an identity dictionary first, and add explicit
     115      # mappings to None for the rest
     116      if len(identity) >= len(unmapped):
     117          for enc in unmapped:
     118              enc2uni[enc] = (MISSING_CODE, "")
     119          enc2uni['IDENTITY'] = 256
     120  
     121      return enc2uni
     122  
     123  def hexrepr(t, precision=4):
     124  
     125      if t is None:
     126          return 'None'
     127      try:
     128          len(t)
     129      except TypeError:
     130          return '0x%0*X' % (precision, t)
     131      try:
     132          return '(' + ', '.join(['0x%0*X' % (precision, item)
     133                                  for item in t]) + ')'
     134      except TypeError as why:
     135          print('* failed to convert %r: %s' % (t, why))
     136          raise
     137  
     138  def python_mapdef_code(varname, map, comments=1, precisions=(2, 4)):
     139  
     140      l = []
     141      append = l.append
     142      if "IDENTITY" in map:
     143          append("%s = codecs.make_identity_dict(range(%d))" %
     144                 (varname, map["IDENTITY"]))
     145          append("%s.update({" % varname)
     146          splits = 1
     147          del map["IDENTITY"]
     148          identity = 1
     149      else:
     150          append("%s = {" % varname)
     151          splits = 0
     152          identity = 0
     153  
     154      mappings = sorted(map.items())
     155      i = 0
     156      key_precision, value_precision = precisions
     157      for mapkey, mapvalue in mappings:
     158          mapcomment = ''
     159          if isinstance(mapkey, tuple):
     160              (mapkey, mapcomment) = mapkey
     161          if isinstance(mapvalue, tuple):
     162              (mapvalue, mapcomment) = mapvalue
     163          if mapkey is None:
     164              continue
     165          if (identity and
     166              mapkey == mapvalue and
     167              mapkey < 256):
     168              # No need to include identity mappings, since these
     169              # are already set for the first 256 code points.
     170              continue
     171          key = hexrepr(mapkey, key_precision)
     172          value = hexrepr(mapvalue, value_precision)
     173          if mapcomment and comments:
     174              append('    %s: %s,\t#  %s' % (key, value, mapcomment))
     175          else:
     176              append('    %s: %s,' % (key, value))
     177          i += 1
     178          if i == 4096:
     179              # Split the definition into parts to that the Python
     180              # parser doesn't dump core
     181              if splits == 0:
     182                  append('}')
     183              else:
     184                  append('})')
     185              append('%s.update({' % varname)
     186              i = 0
     187              splits = splits + 1
     188      if splits == 0:
     189          append('}')
     190      else:
     191          append('})')
     192  
     193      return l
     194  
     195  def python_tabledef_code(varname, map, comments=1, key_precision=2):
     196  
     197      l = []
     198      append = l.append
     199      append('%s = (' % varname)
     200  
     201      # Analyze map and create table dict
     202      mappings = sorted(map.items())
     203      table = {}
     204      maxkey = 255
     205      if 'IDENTITY' in map:
     206          for key in range(256):
     207              table[key] = (key, '')
     208          del map['IDENTITY']
     209      for mapkey, mapvalue in mappings:
     210          mapcomment = ''
     211          if isinstance(mapkey, tuple):
     212              (mapkey, mapcomment) = mapkey
     213          if isinstance(mapvalue, tuple):
     214              (mapvalue, mapcomment) = mapvalue
     215          if mapkey == MISSING_CODE:
     216              continue
     217          table[mapkey] = (mapvalue, mapcomment)
     218          if mapkey > maxkey:
     219              maxkey = mapkey
     220      if maxkey > MAX_TABLE_SIZE:
     221          # Table too large
     222          return None
     223  
     224      # Create table code
     225      maxchar = 0
     226      for key in range(maxkey + 1):
     227          if key not in table:
     228              mapvalue = MISSING_CODE
     229              mapcomment = 'UNDEFINED'
     230          else:
     231              mapvalue, mapcomment = table[key]
     232          if mapvalue == MISSING_CODE:
     233              mapchar = UNI_UNDEFINED
     234          else:
     235              if isinstance(mapvalue, tuple):
     236                  # 1-n mappings not supported
     237                  return None
     238              else:
     239                  mapchar = chr(mapvalue)
     240          maxchar = max(maxchar, ord(mapchar))
     241          if mapcomment and comments:
     242              append('    %a \t#  %s -> %s' % (mapchar,
     243                                              hexrepr(key, key_precision),
     244                                              mapcomment))
     245          else:
     246              append('    %a' % mapchar)
     247  
     248      if maxchar < 256:
     249          append('    %a \t## Widen to UCS2 for optimization' % UNI_UNDEFINED)
     250      append(')')
     251      return l
     252  
     253  def codegen(name, map, encodingname, comments=1):
     254  
     255      """ Returns Python source for the given map.
     256  
     257          Comments are included in the source, if comments is true (default).
     258  
     259      """
     260      # Generate code
     261      decoding_map_code = python_mapdef_code(
     262          'decoding_map',
     263          map,
     264          comments=comments)
     265      decoding_table_code = python_tabledef_code(
     266          'decoding_table',
     267          map,
     268          comments=comments)
     269      encoding_map_code = python_mapdef_code(
     270          'encoding_map',
     271          codecs.make_encoding_map(map),
     272          comments=comments,
     273          precisions=(4, 2))
     274  
     275      if decoding_table_code:
     276          suffix = 'table'
     277      else:
     278          suffix = 'map'
     279  
     280      l = [
     281          '''\
     282  """ Python Character Mapping Codec %s generated from '%s' with gencodec.py.
     283  
     284  """#"
     285  
     286  import codecs
     287  
     288  ### Codec APIs
     289  
     290  class Codec(codecs.Codec):
     291  
     292      def encode(self, input, errors='strict'):
     293          return codecs.charmap_encode(input, errors, encoding_%s)
     294  
     295      def decode(self, input, errors='strict'):
     296          return codecs.charmap_decode(input, errors, decoding_%s)
     297  ''' % (encodingname, name, suffix, suffix)]
     298      l.append('''\
     299  class IncrementalEncoder(codecs.IncrementalEncoder):
     300      def encode(self, input, final=False):
     301          return codecs.charmap_encode(input, self.errors, encoding_%s)[0]
     302  
     303  class IncrementalDecoder(codecs.IncrementalDecoder):
     304      def decode(self, input, final=False):
     305          return codecs.charmap_decode(input, self.errors, decoding_%s)[0]''' %
     306          (suffix, suffix))
     307  
     308      l.append('''
     309  class StreamWriter(Codec, codecs.StreamWriter):
     310      pass
     311  
     312  class StreamReader(Codec, codecs.StreamReader):
     313      pass
     314  
     315  ### encodings module API
     316  
     317  def getregentry():
     318      return codecs.CodecInfo(
     319          name=%r,
     320          encode=Codec().encode,
     321          decode=Codec().decode,
     322          incrementalencoder=IncrementalEncoder,
     323          incrementaldecoder=IncrementalDecoder,
     324          streamreader=StreamReader,
     325          streamwriter=StreamWriter,
     326      )
     327  ''' % encodingname.replace('_', '-'))
     328  
     329      # Add decoding table or map (with preference to the table)
     330      if not decoding_table_code:
     331          l.append('''
     332  ### Decoding Map
     333  ''')
     334          l.extend(decoding_map_code)
     335      else:
     336          l.append('''
     337  ### Decoding Table
     338  ''')
     339          l.extend(decoding_table_code)
     340  
     341      # Add encoding map
     342      if decoding_table_code:
     343          l.append('''
     344  ### Encoding table
     345  encoding_table = codecs.charmap_build(decoding_table)
     346  ''')
     347      else:
     348          l.append('''
     349  ### Encoding Map
     350  ''')
     351          l.extend(encoding_map_code)
     352  
     353      # Final new-line
     354      l.append('')
     355  
     356      return '\n'.join(l).expandtabs()
     357  
     358  def pymap(name,map,pyfile,encodingname,comments=1):
     359  
     360      code = codegen(name,map,encodingname,comments)
     361      with open(pyfile,'w') as f:
     362          f.write(code)
     363  
     364  def marshalmap(name,map,marshalfile):
     365  
     366      d = {}
     367      for e,(u,c) in map.items():
     368          d[e] = (u,c)
     369      with open(marshalfile,'wb') as f:
     370          marshal.dump(d,f)
     371  
     372  def convertdir(dir, dirprefix='', nameprefix='', comments=1):
     373  
     374      mapnames = os.listdir(dir)
     375      for mapname in mapnames:
     376          mappathname = os.path.join(dir, mapname)
     377          if not os.path.isfile(mappathname):
     378              continue
     379          name = os.path.split(mapname)[1]
     380          name = name.replace('-','_')
     381          name = name.split('.')[0]
     382          name = name.lower()
     383          name = nameprefix + name
     384          codefile = name + '.py'
     385          marshalfile = name + '.mapping'
     386          print('converting %s to %s and %s' % (mapname,
     387                                                dirprefix + codefile,
     388                                                dirprefix + marshalfile))
     389          try:
     390              map = readmap(os.path.join(dir,mapname))
     391              if not map:
     392                  print('* map is empty; skipping')
     393              else:
     394                  pymap(mappathname, map, dirprefix + codefile,name,comments)
     395                  marshalmap(mappathname, map, dirprefix + marshalfile)
     396          except ValueError as why:
     397              print('* conversion failed: %s' % why)
     398              raise
     399  
     400  def rewritepythondir(dir, dirprefix='', comments=1):
     401  
     402      mapnames = os.listdir(dir)
     403      for mapname in mapnames:
     404          if not mapname.endswith('.mapping'):
     405              continue
     406          name = mapname[:-len('.mapping')]
     407          codefile = name + '.py'
     408          print('converting %s to %s' % (mapname,
     409                                         dirprefix + codefile))
     410          try:
     411              with open(os.path.join(dir, mapname), 'rb') as f:
     412                  map = marshal.load(f)
     413              if not map:
     414                  print('* map is empty; skipping')
     415              else:
     416                  pymap(mapname, map, dirprefix + codefile,name,comments)
     417          except ValueError as why:
     418              print('* conversion failed: %s' % why)
     419  
     420  if __name__ == '__main__':
     421  
     422      import sys
     423      if 1:
     424          convertdir(*sys.argv[1:])
     425      else:
     426          rewritepythondir(*sys.argv[1:])