glibc-2.38/localedata/unicode-gen/utf8_gen.py
       1  #!/usr/bin/python3
       2  # -*- coding: utf-8 -*-
       3  # Copyright (C) 2014-2023 Free Software Foundation, Inc.
       4  # This file is part of the GNU C Library.
       5  #
       6  # The GNU C Library is free software; you can redistribute it and/or
       7  # modify it under the terms of the GNU Lesser General Public
       8  # License as published by the Free Software Foundation; either
       9  # version 2.1 of the License, or (at your option) any later version.
      10  #
      11  # The GNU C Library is distributed in the hope that it will be useful,
      12  # but WITHOUT ANY WARRANTY; without even the implied warranty of
      13  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
      14  # Lesser General Public License for more details.
      15  #
      16  # You should have received a copy of the GNU Lesser General Public
      17  # License along with the GNU C Library; if not, see
      18  # <https://www.gnu.org/licenses/>.
      19  
      20  '''glibc/localedata/charmaps/UTF-8 file generator script
      21  
      22  This script generates a glibc/localedata/charmaps/UTF-8 file
      23  from Unicode data.
      24  
       25  Usage: python3 utf8_gen.py --unicode_version <version> \
       26             [-u UnicodeData.txt] [-e EastAsianWidth.txt] [-p PropList.txt]
       27  It writes the generated UTF-8 file to the current directory.
      28  '''
      29  
      30  import argparse
      32  import re
      33  import unicode_utils
      34  
      35  # Auxiliary tables for Hangul syllable names, see the Unicode 3.0 book,
      36  # sections 3.11 and 4.4.
      37  
      38  JAMO_INITIAL_SHORT_NAME = (
      39      'G', 'GG', 'N', 'D', 'DD', 'R', 'M', 'B', 'BB', 'S', 'SS', '', 'J', 'JJ',
      40      'C', 'K', 'T', 'P', 'H'
      41  )
      42  
      43  JAMO_MEDIAL_SHORT_NAME = (
      44      'A', 'AE', 'YA', 'YAE', 'EO', 'E', 'YEO', 'YE', 'O', 'WA', 'WAE', 'OE',
      45      'YO', 'U', 'WEO', 'WE', 'WI', 'YU', 'EU', 'YI', 'I'
      46  )
      47  
      48  JAMO_FINAL_SHORT_NAME = (
       49      '', 'G', 'GG', 'GS', 'N', 'NJ', 'NH', 'D', 'L', 'LG', 'LM', 'LB', 'LS',
      50      'LT', 'LP', 'LH', 'M', 'B', 'BS', 'S', 'SS', 'NG', 'J', 'C', 'K', 'T',
      51      'P', 'H'
      52  )
      53  
      54  def process_range(start, end, outfile, name):
      55      '''Writes a range of code points into the CHARMAP section of the
      56      output file
      57  
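               Illustrative call (hypothetical values): process_range('3400',
               '4DB5', outfile, '<CJK Ideograph Extension A>') writes the
               64-code-point chunks shown in the comments below.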
      58      '''
      59      if 'Hangul Syllable' in name:
      60          # from glibc/localedata/ChangeLog:
      61          #
      62          #  2000-09-24  Bruno Haible  <haible@clisp.cons.org>
      63          #  * charmaps/UTF-8: Expand <Hangul Syllable> and <Private Use> ranges,
      64          #  so they become printable and carry a width. Comment out surrogate
      65          #  ranges. Add a WIDTH table
      66          #
      67          # So we expand the Hangul Syllables here:
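                   # An illustrative hand-worked case (not from the original
                   # comments): for U+AC01, i - 0xAC00 == 1; divmod(1, 28) ==
                   # (0, 1) and divmod(0, 21) == (0, 0), so the name becomes
                   # 'G' + 'A' + 'G', i.e. “HANGUL SYLLABLE GAG”.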
       68          for i in range(int(start, 16), int(end, 16)+1):
       69              index2, index3 = divmod(i - 0xAC00, 28)
      70              index1, index2 = divmod(index2, 21)
      71              hangul_syllable_name = 'HANGUL SYLLABLE ' \
      72                                     + JAMO_INITIAL_SHORT_NAME[index1] \
      73                                     + JAMO_MEDIAL_SHORT_NAME[index2] \
      74                                     + JAMO_FINAL_SHORT_NAME[index3]
      75              outfile.write('{:<11s} {:<12s} {:s}\n'.format(
      76                  unicode_utils.ucs_symbol(i), convert_to_hex(i),
      77                  hangul_syllable_name))
      78          return
       79      # The UnicodeData.txt file contains code point ranges like this:
      80      #
      81      # 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
      82      # 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
      83      #
      84      # The glibc UTF-8 file splits ranges like these into shorter
      85      # ranges of 64 code points each:
      86      #
      87      # <U3400>..<U343F>     /xe3/x90/x80         <CJK Ideograph Extension A>
      88      #      89      # <U4D80>..<U4DB5>     /xe4/xb6/x80         <CJK Ideograph Extension A>
      90      for i in range(int(start, 16), int(end, 16), 64 ):
      91          if i > (int(end, 16)-64):
      92              outfile.write('{:s}..{:s} {:<12s} {:s}\n'.format(
      93                      unicode_utils.ucs_symbol(i),
      94                      unicode_utils.ucs_symbol(int(end,16)),
      95                      convert_to_hex(i),
      96                      name))
      97              break
      98          outfile.write('{:s}..{:s} {:<12s} {:s}\n'.format(
      99                  unicode_utils.ucs_symbol(i),
     100                  unicode_utils.ucs_symbol(i+63),
     101                  convert_to_hex(i),
     102                  name))
     103  
     104  def process_charmap(flines, outfile):
      105      '''This function takes an array which contains *all* lines
      106      of UnicodeData.txt and writes lines to outfile as used in the
     107  
     108      CHARMAP
      109      …
      110      END CHARMAP
     111  
     112      section of the UTF-8 file in glibc/localedata/charmaps/UTF-8.
     113  
     114      Samples for input lines:
     115  
     116      0010;<control>;Cc;0;BN;;;;;N;DATA LINK ESCAPE;;;;
     117      3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
     118      4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
     119      D800;<Non Private Use High Surrogate, First>;Cs;0;L;;;;;N;;;;;
     120      DB7F;<Non Private Use High Surrogate, Last>;Cs;0;L;;;;;N;;;;;
     121      100000;<Plane 16 Private Use, First>;Co;0;L;;;;;N;;;;;
     122      10FFFD;<Plane 16 Private Use, Last>;Co;0;L;;;;;N;;;;;
     123  
     124      Samples for output lines (Unicode-Value UTF-8-HEX Unicode-Char-Name):
     125  
     126      <U0010>     /x10 DATA LINK ESCAPE
     127      <U3400>..<U343F>     /xe3/x90/x80 <CJK Ideograph Extension A>
     128      %<UD800>     /xed/xa0/x80 <Non Private Use High Surrogate, First>
     129      %<UDB7F>     /xed/xad/xbf <Non Private Use High Surrogate, Last>
     130      <U0010FFC0>..<U0010FFFD>     /xf4/x8f/xbf/x80 <Plane 16 Private Use>
     131  
     132      '''
     133      fields_start = []
     134      for line in flines:
     135          fields = line.split(";")
      136          # Some characters have “<control>” as their name. We try to
      137          # use the “Unicode 1.0 Name” (field index 10 in
      138          # UnicodeData.txt) for them.
      139          #
      140          # The characters U+0080, U+0081, U+0084 and U+0099 have
      141          # “<control>” as their name but do not even have a
      142          # “Unicode 1.0 Name”. We could write code to take their
      143          # alternate names from NameAliases.txt.
     144          if fields[1] == "<control>" and fields[10]:
     145              fields[1] = fields[10]
     146          # Handling code point ranges like:
     147          #
     148          # 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
     149          # 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
      150          if fields[1].endswith(', First>') and 'Surrogate,' not in fields[1]:
     151              fields_start = fields
     152              continue
      153          if fields[1].endswith(', Last>') and 'Surrogate,' not in fields[1]:
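                       # fields[1][:-7] strips the trailing ', Last>' (7
                       # characters); e.g. '<CJK Ideograph Extension A, Last>'
                       # becomes '<CJK Ideograph Extension A>' after '>' is
                       # re-appended.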
     154              process_range(fields_start[0], fields[0],
     155                            outfile, fields[1][:-7]+'>')
     156              fields_start = []
     157              continue
     158          fields_start = []
     159          if 'Surrogate,' in fields[1]:
     160              # Comment out the surrogates in the UTF-8 file.
     161              # One could of course skip them completely but
     162              # the original UTF-8 file in glibc had them as
     163              # comments, so we keep these comment lines.
     164              outfile.write('%')
     165          outfile.write('{:<11s} {:<12s} {:s}\n'.format(
     166                  unicode_utils.ucs_symbol(int(fields[0], 16)),
     167                  convert_to_hex(int(fields[0], 16)),
     168                  fields[1]))
     169  
     170  def convert_to_hex(code_point):
     171      '''Converts a code point to a hexadecimal UTF-8 representation
     172      like /x**/x**/x**.'''
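               # Illustrative expected values (hand-computed, not part of the
               # original script):
               #     convert_to_hex(0x0010) == '/x10'
               #     convert_to_hex(0x3400) == '/xe3/x90/x80'
               #     convert_to_hex(0xD800) == '/xed/xa0/x80'  (table below)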
     173      # Getting UTF8 of Unicode characters.
     174      # In Python3, .encode('UTF-8') does not work for
     175      # surrogates. Therefore, we use this conversion table
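               # (The only surrogate code points ever passed in are the six
               # First/Last boundary markers from UnicodeData.txt.)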
     176      surrogates = {
     177          0xD800: '/xed/xa0/x80',
     178          0xDB7F: '/xed/xad/xbf',
     179          0xDB80: '/xed/xae/x80',
     180          0xDBFF: '/xed/xaf/xbf',
     181          0xDC00: '/xed/xb0/x80',
     182          0xDFFF: '/xed/xbf/xbf',
     183      }
     184      if code_point in surrogates:
     185          return surrogates[code_point]
     186      return ''.join([
     187          '/x{:02x}'.format(c) for c in chr(code_point).encode('UTF-8')
     188      ])
     189  
     190  def write_header_charmap(outfile):
     191      '''Write the header on top of the CHARMAP section to the output file'''
     192      outfile.write("<code_set_name> UTF-8\n")
     193      outfile.write("<comment_char> %\n")
     194      outfile.write("<escape_char> /\n")
     195      outfile.write("<mb_cur_min> 1\n")
     196      outfile.write("<mb_cur_max> 6\n\n")
     197      outfile.write("% CHARMAP generated using utf8_gen.py\n")
     198      outfile.write("% alias ISO-10646/UTF-8\n")
     199      outfile.write("CHARMAP\n")
     200  
     201  def write_header_width(outfile, unicode_version):
     202      '''Writes the header on top of the WIDTH section to the output file'''
     203      outfile.write('% Character width according to Unicode '
     204                    + '{:s}.\n'.format(unicode_version))
     205      outfile.write('% - Default width is 1.\n')
     206      outfile.write('% - Double-width characters have width 2; generated from\n')
     207      outfile.write('%        "grep \'^[^;]*;[WF]\' EastAsianWidth.txt"\n')
     208      outfile.write('% - Non-spacing characters have width 0; '
     209                    + 'generated from PropList.txt or\n')
     210      outfile.write('%   "grep \'^[^;]*;[^;]*;[^;]*;[^;]*;NSM;\' '
     211                    + 'UnicodeData.txt"\n')
     212      outfile.write('% - Format control characters have width 0; '
     213                    + 'generated from\n')
     214      outfile.write("%   \"grep '^[^;]*;[^;]*;Cf;' UnicodeData.txt\"\n")
      215  #   Not needed, already covered by Cf:
     216  #    outfile.write("% - Zero width characters have width 0; generated from\n")
     217  #    outfile.write("%   \"grep '^[^;]*;ZERO WIDTH ' UnicodeData.txt\"\n")
     218      outfile.write("WIDTH\n")
     219  
     220  def process_width(outfile, ulines, elines, plines):
     221      '''ulines are lines from UnicodeData.txt, elines are lines from
     222      EastAsianWidth.txt containing characters with width “W” or “F”,
     223      plines are lines from PropList.txt which contain characters
     224      with the property “Prepended_Concatenation_Mark”.
     225  
     226      '''
     227      width_dict = {}
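               # elines entries look like "A015;W ..." or "3400..4DBF;W ...":
               # expand any range and mark each code point as double width.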
     228      for line in elines:
     229          fields = line.split(";")
      230          if '..' not in fields[0]:
     231              code_points = (fields[0], fields[0])
     232          else:
     233              code_points = fields[0].split("..")
     234          for key in range(int(code_points[0], 16),
     235                           int(code_points[1], 16)+1):
     236              width_dict[key] = 2
     237  
     238      for line in ulines:
     239          fields = line.split(";")
     240          if fields[4] == "NSM" or fields[2] in ("Cf", "Me", "Mn"):
     241              width_dict[int(fields[0], 16)] = 0
     242  
     243      for line in plines:
     244          # Characters with the property “Prepended_Concatenation_Mark”
     245          # should have the width 1:
     246          fields = line.split(";")
      247          if '..' not in fields[0]:
     248              code_points = (fields[0], fields[0])
     249          else:
     250              code_points = fields[0].split("..")
     251          for key in range(int(code_points[0], 16),
     252                           int(code_points[1], 16)+1):
     253              del width_dict[key] # default width is 1
     254  
     255      # handle special cases for compatibility
     256      for key in list((0x00AD,)):
     257          # https://www.cs.tut.fi/~jkorpela/shy.html
     258          if key in width_dict:
     259              del width_dict[key] # default width is 1
     260      for key in list(range(0x1160, 0x1200)):
     261          # Hangul jungseong and jongseong:
     262          if key in unicode_utils.UNICODE_ATTRIBUTES:
     263              width_dict[key] = 0
     264      for key in list(range(0xD7B0, 0xD800)):
     265          # Hangul jungseong and jongseong:
     266          if key in unicode_utils.UNICODE_ATTRIBUTES:
     267              width_dict[key] = 0
     268      for key in list(range(0x3248, 0x3250)):
     269          # These are “A” which means we can decide whether to treat them
     270          # as “W” or “N” based on context:
     271          # http://www.unicode.org/mail-arch/unicode-ml/y2017-m08/0023.html
     272          # For us, “W” seems better.
     273          width_dict[key] = 2
     274      for key in list(range(0x4DC0, 0x4E00)):
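                   # U+4DC0..U+4DFF: Yijing Hexagram Symbols; glibc opts to
                   # treat these as wide as well.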
     275          width_dict[key] = 2
     276  
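               # Group consecutive code points of equal width into runs; e.g.
               # a (hypothetical) width_dict {0x300: 0, 0x301: 0, 0x302: 0,
               # 0x1100: 2} yields [[0x300, 0x301, 0x302], [0x1100]].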
     277      same_width_lists = []
     278      current_width_list = []
     279      for key in sorted(width_dict):
     280          if not current_width_list:
     281              current_width_list = [key]
     282          elif (key == current_width_list[-1] + 1
     283                and width_dict[key] == width_dict[current_width_list[0]]):
     284              current_width_list.append(key)
     285          else:
     286              same_width_lists.append(current_width_list)
     287              current_width_list = [key]
     288      if current_width_list:
     289          same_width_lists.append(current_width_list)
     290  
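               # A single code point prints as "<UXXXX>\t<width>", a run as
               # "<UXXXX>...<UYYYY>\t<width>" (note the three dots).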
     291      for same_width_list in same_width_lists:
     292          if len(same_width_list) == 1:
     293              outfile.write('{:s}\t{:d}\n'.format(
     294                  unicode_utils.ucs_symbol(same_width_list[0]),
     295                  width_dict[same_width_list[0]]))
     296          else:
     297              outfile.write('{:s}...{:s}\t{:d}\n'.format(
     298                  unicode_utils.ucs_symbol(same_width_list[0]),
     299                  unicode_utils.ucs_symbol(same_width_list[-1]),
     300                  width_dict[same_width_list[0]]))
     301  
     302  if __name__ == "__main__":
     303      PARSER = argparse.ArgumentParser(
     304          description='''
     305          Generate a UTF-8 file from UnicodeData.txt, EastAsianWidth.txt, and PropList.txt.
     306          ''')
     307      PARSER.add_argument(
     308          '-u', '--unicode_data_file',
     309          nargs='?',
     310          type=str,
     311          default='UnicodeData.txt',
     312          help=('The UnicodeData.txt file to read, '
     313                + 'default: %(default)s'))
     314      PARSER.add_argument(
      315          '-e', '--east_asian_width_file',
     316          nargs='?',
     317          type=str,
     318          default='EastAsianWidth.txt',
     319          help=('The EastAsianWidth.txt file to read, '
     320                + 'default: %(default)s'))
     321      PARSER.add_argument(
     322          '-p', '--prop_list_file',
     323          nargs='?',
     324          type=str,
     325          default='PropList.txt',
     326          help=('The PropList.txt file to read, '
     327                + 'default: %(default)s'))
     328      PARSER.add_argument(
     329          '--unicode_version',
     330          nargs='?',
     331          required=True,
     332          type=str,
     333          help='The Unicode version of the input files used.')
     334      ARGS = PARSER.parse_args()
     335  
     336      unicode_utils.fill_attributes(ARGS.unicode_data_file)
     337      with open(ARGS.unicode_data_file, mode='r') as UNIDATA_FILE:
     338          UNICODE_DATA_LINES = UNIDATA_FILE.readlines()
      339      with open(ARGS.east_asian_width_file, mode='r') as EAST_ASIAN_WIDTH_FILE:
     340          EAST_ASIAN_WIDTH_LINES = []
     341          for LINE in EAST_ASIAN_WIDTH_FILE:
      342              # If characters from EastAsianWidth.txt that are in
      343              # reserved ranges (i.e. not yet assigned code points)
     344              # are added to the WIDTH section of the UTF-8 file, then
     345              # “make check” produces “Unknown Character” errors for
     346              # these code points because such unassigned code points
     347              # are not in the CHARMAP section of the UTF-8 file.
     348              #
     349              # Therefore, we skip all reserved code points when reading
     350              # the EastAsianWidth.txt file.
     351              if re.match(r'.*<reserved-.+>\.\.<reserved-.+>.*', LINE):
     352                  continue
     353              if re.match(r'^[^;]*;[WF]', LINE):
     354                  EAST_ASIAN_WIDTH_LINES.append(LINE.strip())
     355      with open(ARGS.prop_list_file, mode='r') as PROP_LIST_FILE:
     356          PROP_LIST_LINES = []
     357          for LINE in PROP_LIST_FILE:
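                       # Keep only lines such as
                       # "0600..0605    ; Prepended_Concatenation_Mark # Cf ..."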
     358              if re.match(r'^[^;]*;[\s]*Prepended_Concatenation_Mark', LINE):
     359                  PROP_LIST_LINES.append(LINE.strip())
     360      with open('UTF-8', mode='w') as OUTFILE:
      361          # Process UnicodeData.txt and write CHARMAP to the UTF-8 file
     362          write_header_charmap(OUTFILE)
     363          process_charmap(UNICODE_DATA_LINES, OUTFILE)
     364          OUTFILE.write("END CHARMAP\n\n")
      365          # Process EastAsianWidth.txt and write WIDTH to the UTF-8 file
     366          write_header_width(OUTFILE, ARGS.unicode_version)
     367          process_width(OUTFILE,
     368                        UNICODE_DATA_LINES,
     369                        EAST_ASIAN_WIDTH_LINES,
     370                        PROP_LIST_LINES)
     371          OUTFILE.write("END WIDTH\n")