(root)/
Python-3.11.7/
Tools/
unicode/
genmap_schinese.py
       1  #
       2  # genmap_schinese.py: Simplified Chinese Codecs Map Generator
       3  #
       4  # Original Author:  Hye-Shik Chang <perky@FreeBSD.org>
       5  # Modified Author:  Dong-hee Na <donghee.na92@gmail.com>
       6  #
       7  import os
       8  import re
       9  
      10  from genmap_support import *
      11  
      12  
# Byte ranges handed to DecodeMapWriter.update_decode_map() in main().
# Every constant is a (low, high) pair.  The *_C1 value is passed as the
# first argument and the *_C2 value as the second; presumably they bound the
# first and second byte of a two-byte sequence, inclusive -- confirm against
# genmap_support.
GB2312_C1   = (0x21, 0x7e)
GB2312_C2   = (0x21, 0x7e)
# The "gbkext" decode map is built from two rectangles (GBKL1 and GBKL2).
GBKL1_C1    = (0x81, 0xa8)
GBKL1_C2    = (0x40, 0xfe)
GBKL2_C1    = (0xa9, 0xfe)
GBKL2_C2    = (0x40, 0xa0)
# The "gb18030ext" decode map is built from five rectangles (P1..P5).
GB18030EXTP1_C1 = (0xa1, 0xa9)
GB18030EXTP1_C2 = (0x40, 0xfe)
GB18030EXTP2_C1 = (0xaa, 0xaf)
GB18030EXTP2_C2 = (0xa1, 0xfe)
GB18030EXTP3_C1 = (0xd7, 0xd7)
GB18030EXTP3_C2 = (0xfa, 0xfe)
GB18030EXTP4_C1 = (0xf8, 0xfd)
GB18030EXTP4_C2 = (0xa1, 0xfe)
GB18030EXTP5_C1 = (0xfe, 0xfe)
GB18030EXTP5_C2 = (0x50, 0xfe)

# URLs passed to open_mapping_file() next to the local 'python-mappings/...'
# path in main(); presumably where a missing file is fetched from -- confirm
# against genmap_support.
MAPPINGS_GB2312 = 'http://people.freebsd.org/~perky/i18n/GB2312.TXT'
MAPPINGS_CP936 = 'http://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP936.TXT'
MAPPINGS_GB18030 = 'http://oss.software.ibm.com/cvs/icu/~checkout~/charset/data/xml/gb-18030-2000.xml'
      33  
      34  re_gb18030ass = re.compile('<a u="([A-F0-9]{4})" b="([0-9A-F ]+)"/>')
      35  
      36  
      37  def parse_gb18030map(fo):
      38      m, gbuni = {}, {}
      39      for i in range(65536):
      40          if i < 0xd800 or i > 0xdfff: # exclude unicode surrogate area
      41              gbuni[i] = None
      42      for uni, native in re_gb18030ass.findall(fo.read()):
      43          uni = eval('0x'+uni)
      44          native = [eval('0x'+u) for u in native.split()]
      45          if len(native) <= 2:
      46              del gbuni[uni]
      47          if len(native) == 2: # we can decode algorithmically for 1 or 4 bytes
      48              m.setdefault(native[0], {})
      49              m[native[0]][native[1]] = uni
      50      gbuni = [k for k in gbuni.keys()]
      51      gbuni.sort()
      52      return m, gbuni
      53  
      54  def main():
      55      print("Loading Mapping File...")
      56      gb2312map = open_mapping_file('python-mappings/GB2312.TXT', MAPPINGS_GB2312)
      57      cp936map = open_mapping_file('python-mappings/CP936.TXT', MAPPINGS_CP936)
      58      gb18030map = open_mapping_file('python-mappings/gb-18030-2000.xml', MAPPINGS_GB18030)
      59  
      60      gb18030decmap, gb18030unilinear = parse_gb18030map(gb18030map)
      61      gbkdecmap = loadmap(cp936map)
      62      gb2312decmap = loadmap(gb2312map)
      63      difmap = {}
      64      for c1, m in gbkdecmap.items():
      65          for c2, code in m.items():
      66              del gb18030decmap[c1][c2]
      67              if not gb18030decmap[c1]:
      68                  del gb18030decmap[c1]
      69      for c1, m in gb2312decmap.items():
      70          for c2, code in m.items():
      71              gbkc1, gbkc2 = c1 | 0x80, c2 | 0x80
      72              if gbkdecmap[gbkc1][gbkc2] == code:
      73                  del gbkdecmap[gbkc1][gbkc2]
      74                  if not gbkdecmap[gbkc1]:
      75                      del gbkdecmap[gbkc1]
      76  
      77      gb2312_gbkencmap, gb18030encmap = {}, {}
      78      for c1, m in gbkdecmap.items():
      79          for c2, code in m.items():
      80              gb2312_gbkencmap.setdefault(code >> 8, {})
      81              gb2312_gbkencmap[code >> 8][code & 0xff] = c1 << 8 | c2 # MSB set
      82      for c1, m in gb2312decmap.items():
      83          for c2, code in m.items():
      84              gb2312_gbkencmap.setdefault(code >> 8, {})
      85              gb2312_gbkencmap[code >> 8][code & 0xff] = c1 << 8 | c2 # MSB unset
      86      for c1, m in gb18030decmap.items():
      87          for c2, code in m.items():
      88              gb18030encmap.setdefault(code >> 8, {})
      89              gb18030encmap[code >> 8][code & 0xff] = c1 << 8 | c2
      90  
      91      with open('mappings_cn.h', 'w') as fp:
      92          print_autogen(fp, os.path.basename(__file__))
      93  
      94          print("Generating GB2312 decode map...")
      95          writer = DecodeMapWriter(fp, "gb2312", gb2312decmap)
      96          writer.update_decode_map(GB2312_C1, GB2312_C2)
      97          writer.generate()
      98  
      99          print("Generating GBK decode map...")
     100          writer = DecodeMapWriter(fp, "gbkext", gbkdecmap)
     101          writer.update_decode_map(GBKL1_C1, GBKL1_C2)
     102          writer.update_decode_map(GBKL2_C1, GBKL2_C2)
     103          writer.generate()
     104  
     105          print("Generating GB2312 && GBK encode map...")
     106          writer = EncodeMapWriter(fp, "gbcommon", gb2312_gbkencmap)
     107          writer.generate()
     108  
     109          print("Generating GB18030 extension decode map...")
     110          writer = DecodeMapWriter(fp, "gb18030ext", gb18030decmap)
     111          for i in range(1, 6):
     112              writer.update_decode_map(eval("GB18030EXTP%d_C1" % i), eval("GB18030EXTP%d_C2" % i))
     113  
     114          writer.generate()
     115  
     116          print("Generating GB18030 extension encode map...")
     117          writer = EncodeMapWriter(fp, "gb18030ext", gb18030encmap)
     118          writer.generate()
     119  
     120          print("Generating GB18030 Unicode BMP Mapping Ranges...")
     121          ranges = [[-1, -1, -1]]
     122          gblinnum = 0
     123          fp.write("""
     124  static const struct _gb18030_to_unibmp_ranges {
     125      Py_UCS4   first, last;
     126      DBCHAR       base;
     127  } gb18030_to_unibmp_ranges[] = {
     128  """)
     129  
     130          for uni in gb18030unilinear:
     131              if uni == ranges[-1][1] + 1:
     132                  ranges[-1][1] = uni
     133              else:
     134                  ranges.append([uni, uni, gblinnum])
     135              gblinnum += 1
     136  
     137          filler = BufferedFiller()
     138          for first, last, base in ranges[1:]:
     139              filler.write('{', str(first), ',', str(last), ',', str(base), '},')
     140  
     141          filler.write('{', '0,', '0,', str(
     142              ranges[-1][2] + ranges[-1][1] - ranges[-1][0] + 1), '}', '};')
     143          filler.printout(fp)
     144  
     145      print("Done!")
     146  
     147  
# Run the generator only when executed as a script, not when imported.
if __name__ == '__main__':
    main()