1  #!/usr/bin/python3
       2  # -*- coding: utf-8 -*-
       3  #
       4  # Generate a translit_cjk_compat file from a UnicodeData file.
       5  # Copyright (C) 2015-2023 Free Software Foundation, Inc.
       6  # This file is part of the GNU C Library.
       7  #
       8  # The GNU C Library is free software; you can redistribute it and/or
       9  # modify it under the terms of the GNU Lesser General Public
      10  # License as published by the Free Software Foundation; either
      11  # version 2.1 of the License, or (at your option) any later version.
      12  #
      13  # The GNU C Library is distributed in the hope that it will be useful,
      14  # but WITHOUT ANY WARRANTY; without even the implied warranty of
      15  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
      16  # Lesser General Public License for more details.
      17  #
      18  # You should have received a copy of the GNU Lesser General Public
      19  # License along with the GNU C Library; if not, see
      20  # <https://www.gnu.org/licenses/>.
      21  
      22  '''
      23  Generate a translit_cjk_compat file from UnicodeData.txt
      24  
      25  To see how this script is used, call it with the “-h” option:
      26  
    $ ./gen_translit_cjk_compat.py -h
      28      … prints usage message …
      29  '''
      30  
      31  import argparse
      32  import time
      33  import sys
      34  import unicode_utils
      35  
      36  def read_input_file(filename):
      37      '''Reads the original glibc translit_cjk_compat file to get the
      38      original head and tail.
      39  
      40      We want to replace only the part of the file between
      41      “translit_start” and “translit_end”
      42      '''
      43      head = tail = ''
      44      with open(filename, mode='r') as translit_file:
      45          for line in translit_file:
      46              head = head + line
      47              if line.startswith('translit_start'):
      48                  break
      49          for line in translit_file:
      50              if line.startswith('translit_end'):
      51                  tail = line
      52                  break
      53          for line in translit_file:
      54              tail = tail + line
      55      return (head, tail)
      56  
      57  def output_head(translit_file, unicode_version, head=''):
      58      '''Write the header of the output file, i.e. the part of the file
      59      before the “translit_start” line.
      60      '''
      61      if ARGS.input_file and head:
      62          translit_file.write(head)
      63      else:
      64          translit_file.write('escape_char /\n')
      65          translit_file.write('comment_char %\n')
      66          translit_file.write(unicode_utils.COMMENT_HEADER)
      67          translit_file.write('\n')
      68          translit_file.write('% Transliterations of CJK compatibility ')
      69          translit_file.write('characters.\n')
      70          translit_file.write('% Generated automatically from UnicodeData.txt '
      71                              + 'by gen_translit_cjk_compat.py '
      72                              + 'on {:s} '.format(time.strftime('%Y-%m-%d'))
      73                              + 'for Unicode {:s}.\n'.format(unicode_version))
      74          translit_file.write('\n')
      75          translit_file.write('LC_CTYPE\n')
      76          translit_file.write('\n')
      77          translit_file.write('translit_start\n')
      78  
      79  def output_tail(translit_file, tail=''):
      80      '''Write the tail of the output file'''
      81      if ARGS.input_file and tail:
      82          translit_file.write(tail)
      83      else:
      84          translit_file.write('translit_end\n')
      85          translit_file.write('\n')
      86          translit_file.write('END LC_CTYPE\n')
      87  
      88  def special_decompose(code_point_list):
      89      '''
      90      Decompositions which are not in UnicodeData.txt at all but which
      91      were used in the original translit_cjk_compat file in glibc and
      92      which seem to make sense.  I want to keep the update of
      93      translit_cjk_compat close to the spirit of the original file,
      94      therefore I added this special decomposition rules here.
      95      '''
      96      special_decompose_dict = {
      97          (0x2215,): [0x002F], # ∕ → /
      98          (0x00B2,): [0x005E, 0x0032], # ² → ^2
      99          (0x03BC,): [0x00B5], # μ → µ (GREEK SMALL LETTER MU → MICRO SIGN)
     100          (0x2113,): [0x006C], # ℓ → l
     101          (0x00B3,): [0x005E, 0x0033], # ³ → ^3
     102          (0x00B5,): [0x0075], # µ → u
     103          (0x03BC, 0x2113): [0x03BC, 0x006C], # μℓ → μl
     104          (0x0072, 0x0061, 0x0064, 0x2215, 0x0073, 0x00B2): [
     105              0x0072, 0x0061, 0x0064, 0x002F, 0x0073, 0x00B2],
     106          (0x006D, 0x2215, 0x0073, 0x00B2): [0x006D, 0x002F, 0x0073, 0x00B2],
     107      }
     108      if tuple(code_point_list) in special_decompose_dict:
     109          return special_decompose_dict[tuple(code_point_list)]
     110      else:
     111          return code_point_list
     112  
     113  def output_transliteration(translit_file):
     114      '''Write the new transliteration to the output file'''
     115      translit_file.write('\n')
     116      for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES):
     117          name = unicode_utils.UNICODE_ATTRIBUTES[code_point]['name']
     118          decomposition = unicode_utils.UNICODE_ATTRIBUTES[
     119              code_point]['decomposition']
     120          if decomposition.startswith('<square>'):
     121              decomposition = decomposition[9:]
     122              decomposed_code_points = [[int(x, 16)
     123                                         for x in decomposition.split(' ')]]
     124              if decomposed_code_points[0]:
     125                  while True:
     126                      special_decomposed_code_points = special_decompose(
     127                          decomposed_code_points[-1])
     128                      if (special_decomposed_code_points
     129                              != decomposed_code_points[-1]):
     130                          decomposed_code_points.append(
     131                              special_decomposed_code_points)
     132                          continue
     133                      special_decomposed_code_points = []
     134                      for decomposed_code_point in decomposed_code_points[-1]:
     135                          special_decomposed_code_points += special_decompose(
     136                              [decomposed_code_point])
     137                      if (special_decomposed_code_points
     138                              == decomposed_code_points[-1]):
     139                          break
     140                      decomposed_code_points.append(
     141                          special_decomposed_code_points)
     142                  translit_file.write('% {:s}\n'.format(name))
     143                  translit_file.write('{:s} '.format(
     144                      unicode_utils.ucs_symbol(code_point)))
     145                  for index in range(0, len(decomposed_code_points)):
     146                      if index > 0:
     147                          translit_file.write(';')
     148                      if len(decomposed_code_points[index]) > 1:
     149                          translit_file.write('"')
     150                      for decomposed_code_point in decomposed_code_points[index]:
     151                          translit_file.write('{:s}'.format(
     152                              unicode_utils.ucs_symbol(decomposed_code_point)))
     153                      if len(decomposed_code_points[index]) > 1:
     154                          translit_file.write('"')
     155                  translit_file.write('\n')
     156      for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES):
     157          name = unicode_utils.UNICODE_ATTRIBUTES[code_point]['name']
     158          decomposition = unicode_utils.UNICODE_ATTRIBUTES[
     159              code_point]['decomposition']
     160          if decomposition and name.startswith('CJK COMPATIBILITY IDEOGRAPH'):
     161              decomposed_code_points = [int(x, 16)
     162                                        for x in decomposition.split(' ')]
     163              if len(decomposed_code_points) != 1:
     164                  sys.stderr.write(
     165                      'Unexpected decomposition length {:x} {:s} {:s}\n'.format(
     166                          code_point, name, decomposition))
     167                  exit(1)
     168              translit_file.write('% {:s}\n'.format(name))
     169              translit_file.write('{:s} '.format(
     170                  unicode_utils.ucs_symbol(code_point)))
     171              for decomposed_code_point in decomposed_code_points:
     172                  translit_file.write('{:s}'.format(
     173                      unicode_utils.ucs_symbol(decomposed_code_point)))
     174              translit_file.write('\n')
     175      translit_file.write('\n')
     176  
     177  if __name__ == "__main__":
     178      PARSER = argparse.ArgumentParser(
     179          description='''
     180          Generate a translit_cjk_compat file from UnicodeData.txt.
     181          ''')
     182      PARSER.add_argument(
     183          '-u', '--unicode_data_file',
     184          nargs='?',
     185          type=str,
     186          default='UnicodeData.txt',
     187          help=('The UnicodeData.txt file to read, '
     188                + 'default: %(default)s'))
     189      PARSER.add_argument(
     190          '-i', '--input_file',
     191          nargs='?',
     192          type=str,
     193          help=''' The original glibc/localedata/locales/translit_cjk_compat
     194          file.''')
     195      PARSER.add_argument(
     196          '-o', '--output_file',
     197          nargs='?',
     198          type=str,
     199          default='translit_cjk_compat.new',
     200          help='''The new translit_cjk_compat file, default: %(default)s.  If the
     201          original glibc/localedata/locales/translit_cjk_compat file has
     202          been given as an option, the header up to the
     203          “translit_start” line and the tail from the “translit_end”
     204          line to the end of the file will be copied unchanged into the
     205          output file.  ''')
     206      PARSER.add_argument(
     207          '--unicode_version',
     208          nargs='?',
     209          required=True,
     210          type=str,
     211          help='The Unicode version of the input files used.')
     212      ARGS = PARSER.parse_args()
     213  
     214      unicode_utils.fill_attributes(ARGS.unicode_data_file)
     215      HEAD = TAIL = ''
     216      if ARGS.input_file:
     217          (HEAD, TAIL) = read_input_file(ARGS.input_file)
     218      with open(ARGS.output_file, mode='w') as TRANSLIT_FILE:
     219          output_head(TRANSLIT_FILE, ARGS.unicode_version, head=HEAD)
     220          output_transliteration(TRANSLIT_FILE)
     221          output_tail(TRANSLIT_FILE, tail=TAIL)