(root)/
glibc-2.38/
localedata/
unicode-gen/
gen_translit_compat.py
       1  #!/usr/bin/python3
       2  # -*- coding: utf-8 -*-
       3  #
       4  # Generate a translit_compat file from a UnicodeData file.
       5  # Copyright (C) 2015-2023 Free Software Foundation, Inc.
       6  # This file is part of the GNU C Library.
       7  #
       8  # The GNU C Library is free software; you can redistribute it and/or
       9  # modify it under the terms of the GNU Lesser General Public
      10  # License as published by the Free Software Foundation; either
      11  # version 2.1 of the License, or (at your option) any later version.
      12  #
      13  # The GNU C Library is distributed in the hope that it will be useful,
      14  # but WITHOUT ANY WARRANTY; without even the implied warranty of
      15  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
      16  # Lesser General Public License for more details.
      17  #
      18  # You should have received a copy of the GNU Lesser General Public
      19  # License along with the GNU C Library; if not, see
      20  # <https://www.gnu.org/licenses/>.
      21  
      22  '''
      23  Generate a translit_compat file from UnicodeData.txt
      24  
      25  To see how this script is used, call it with the “-h” option:
      26  
      27      $ ./gen_translit_compat -h
      28      … prints usage message …
      29  '''
      30  
      31  import argparse
      32  import time
      33  import unicode_utils
      34  
      35  def read_input_file(filename):
      36      '''Reads the original glibc translit_compat file to get the
      37      original head and tail.
      38  
      39      We want to replace only the part of the file between
      40      “translit_start” and “translit_end”
      41      '''
      42      head = tail = ''
      43      with open(filename, mode='r') as translit_file:
      44          for line in translit_file:
      45              head = head + line
      46              if line.startswith('translit_start'):
      47                  break
      48          for line in translit_file:
      49              if line.startswith('translit_end'):
      50                  tail = line
      51                  break
      52          for line in translit_file:
      53              tail = tail + line
      54      return (head, tail)
      55  
      56  def output_head(translit_file, unicode_version, head=''):
      57      '''Write the header of the output file, i.e. the part of the file
      58      before the “translit_start” line.
      59      '''
      60      if ARGS.input_file and head:
      61          translit_file.write(head)
      62      else:
      63          translit_file.write('escape_char /\n')
      64          translit_file.write('comment_char %\n')
      65          translit_file.write(unicode_utils.COMMENT_HEADER)
      66          translit_file.write('\n')
      67          translit_file.write('% Transliterations of compatibility characters ')
      68          translit_file.write('and ligatures.\n')
      69          translit_file.write('% Generated automatically from UnicodeData.txt '
      70                              + 'by gen_translit_compat.py '
      71                              + 'on {:s} '.format(time.strftime('%Y-%m-%d'))
      72                              + 'for Unicode {:s}.\n'.format(unicode_version))
      73          translit_file.write('\n')
      74          translit_file.write('LC_CTYPE\n')
      75          translit_file.write('\n')
      76          translit_file.write('translit_start\n')
      77  
      78  def output_tail(translit_file, tail=''):
      79      '''Write the tail of the output file'''
      80      if ARGS.input_file and tail:
      81          translit_file.write(tail)
      82      else:
      83          translit_file.write('translit_end\n')
      84          translit_file.write('\n')
      85          translit_file.write('END LC_CTYPE\n')
      86  
def compatibility_decompose(code_point):
    '''Return the recursive compatibility decomposition of code_point
    as a list of code points, or the empty list if there is none.

    http://www.unicode.org/reports/tr44/#Character_Decomposition_Mappings

    “The compatibility decomposition is formed by recursively applying
    the canonical and compatibility mappings, then applying the
    Canonical Ordering Algorithm.”

    We don’t do the canonical decomposition here because this is
    done in gen_translit_combining.py to generate translit_combining.

    And we ignore some of the possible compatibility formatting tags
    here. Some of them are used in other translit_* files, not
    translit_compat:

    <font>:   translit_font
    <circle>: translit_circle
    <wide>:   translit_wide
    <narrow>: translit_narrow
    <square>: translit_cjk_compat
    <fraction>: translit_fraction

    And we ignore

    <noBreak>, <initial>, <medial>, <final>, <isolated>

    because they seem to be not useful for transliteration.
    '''
    decomposition = unicode_utils.UNICODE_ATTRIBUTES[
        code_point]['decomposition']
    # Only these formatting tags are handled here; all other tags (and
    # canonical decompositions, which have no tag) are ignored, see the
    # docstring above.
    compatibility_tags = (
        '<compat>', '<super>', '<sub>', '<vertical>')
    for compatibility_tag in compatibility_tags:
        if decomposition.startswith(compatibility_tag):
            # Strip the tag and the following space, leaving only the
            # space-separated hexadecimal code points.
            decomposition = decomposition[len(compatibility_tag)+1:]
            decomposed_code_points = [int(x, 16)
                                      for x in decomposition.split(' ')]
            if (len(decomposed_code_points) > 1
                    and decomposed_code_points[0] == 0x0020
                    and decomposed_code_points[1] >= 0x0300
                    and decomposed_code_points[1] <= 0x03FF):
                # Decomposes into a space followed by a combining character.
                # This is not useful for transliteration.
                return []
            else:
                # Recursively decompose each resulting code point;
                # code points with no further compatibility
                # decomposition are kept as they are.
                return_value = []
                for index in range(0, len(decomposed_code_points)):
                    cd_code_points = compatibility_decompose(
                        decomposed_code_points[index])
                    if cd_code_points:
                        return_value += cd_code_points
                    else:
                        return_value += [decomposed_code_points[index]]
                return return_value
    return []
     141  
     142  def special_decompose(code_point_list):
     143      '''
     144      Decompositions which are not in UnicodeData.txt at all but which
     145      were used in the original translit_compat file in glibc and
     146      which seem to make sense.  I want to keep the update of
     147      translit_compat close to the spirit of the original file,
     148      therefore I added this special decomposition rules here.
     149      '''
     150      special_decompose_dict = {
     151          (0x03BC,): [0x0075], # μ → u
     152          (0x02BC,): [0x0027], # ʼ → '
     153      }
     154      if tuple(code_point_list) in special_decompose_dict:
     155          return special_decompose_dict[tuple(code_point_list)]
     156      else:
     157          return code_point_list
     158  
     159  def special_ligature_decompose(code_point):
     160      '''
     161      Decompositions for ligatures which are not in UnicodeData.txt at
     162      all but which were used in the original translit_compat file in
     163      glibc and which seem to make sense.  I want to keep the update of
     164      translit_compat close to the spirit of the original file,
     165      therefore I added these special ligature decomposition rules here.
     166  
     167      '''
     168      special_ligature_decompose_dict = {
     169          0x00E6: [0x0061, 0x0065], # æ → ae
     170          0x00C6: [0x0041, 0x0045], # Æ → AE
     171          # These following 5 special ligature decompositions were
     172          # in the original glibc/localedata/locales/translit_compat file
     173          0x0152: [0x004F, 0x0045], # Œ → OE
     174          0x0153: [0x006F, 0x0065], # œ → oe
     175          0x05F0: [0x05D5, 0x05D5], # װ → וו
     176          0x05F1: [0x05D5, 0x05D9], # ױ → וי
     177          0x05F2: [0x05D9, 0x05D9], # ײ → יי
     178          # The following special ligature decompositions were
     179          # not in the original glibc/localedata/locales/translit_compat file
     180          # U+04A4 CYRILLIC CAPITAL LIGATURE EN GHE
     181          # → U+041D CYRILLIC CAPITAL LETTER EN,
     182          #   U+0413 CYRILLIC CAPITAL LETTER GHE
     183          0x04A4: [0x041D, 0x0413], # Ҥ → НГ
     184          # U+04A5 CYRILLIC SMALL LIGATURE EN GHE
     185          # → U+043D CYRILLIC SMALL LETTER EN,
     186          #   U+0433 CYRILLIC SMALL LETTER GHE
     187          0x04A5: [0x043D, 0x0433], # ҥ → нг
     188          # U+04B4 CYRILLIC CAPITAL LIGATURE TE TSE
     189          # → U+0422 CYRILLIC CAPITAL LETTER TE,
     190          #   U+0426 CYRILLIC CAPITAL LETTER TSE
     191          0x04B4: [0x0422, 0x0426], # Ҵ → ТЦ
     192          # U+04B5 CYRILLIC SMALL LIGATURE TE TSE
     193          # → U+0442 CYRILLIC SMALL LETTER TE,
     194          #   U+0446 CYRILLIC SMALL LETTER TSE
     195          0x04B5: [0x0442, 0x0446], # ҵ → тц
     196          # U+04d4 CYRILLIC CAPITAL LIGATURE A IE
     197          # → U+0410 CYRILLIC CAPITAL LETTER A
     198          #   U+0415;CYRILLIC CAPITAL LETTER IE
     199          0x04D4: [0x0410, 0x0415], # Ӕ → АЕ
     200          # U+04D5 CYRILLIC SMALL LIGATURE A IE
     201          # → U+0430 CYRILLIC SMALL LETTER A,
     202          #   U+0435 CYRILLIC SMALL LETTER IE
     203          0x04D5: [0x0430, 0x0435], # ӕ → ае
     204          # I am not sure what to do with the following ligatures
     205          # maybe it makes no sense to decompose them:
     206          # U+0616 ARABIC SMALL HIGH LIGATURE ALEF WITH LAM WITH YEH
     207          # U+06d6 ARABIC SMALL HIGH LIGATURE SAD WITH LAM WITH ALEF MAKSURA
     208          # U+06d7 ARABIC SMALL HIGH LIGATURE QAF WITH LAM WITH ALEF MAKSURA
     209          # U+fdfd ARABIC LIGATURE BISMILLAH AR-RAHMAN AR-RAHEEM
     210          # U+fe20 COMBINING LIGATURE LEFT HALF
     211          # U+fe21 COMBINING LIGATURE RIGHT HALF
     212          # U+fe27 COMBINING LIGATURE LEFT HALF BELOW
     213          # U+fe28 COMBINING LIGATURE RIGHT HALF BELOW
     214          # U+11176 MAHAJANI LIGATURE SHRI
     215          # U+1f670 SCRIPT LIGATURE ET ORNAMENT
     216          # U+1f671 HEAVY SCRIPT LIGATURE ET ORNAMENT
     217          # U+1f672 LIGATURE OPEN ET ORNAMENT
     218          # U+1f673 HEAVY LIGATURE OPEN ET ORNAMENT
     219      }
     220      if code_point in special_ligature_decompose_dict:
     221          return special_ligature_decompose_dict[code_point]
     222      else:
     223          return [code_point]
     224  
def output_transliteration(translit_file):
    '''Write the new transliteration to the output file.

    For every code point with a compatibility decomposition (or a
    special decomposition), one rule of the form

        <U…> "<decomposition>"[;"<further decomposition>"…]

    is written, where each “;”-separated alternative is one stage of
    successively applied special decompositions.
    '''
    translit_file.write('\n')
    for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES):
        name = unicode_utils.UNICODE_ATTRIBUTES[code_point]['name']
        # “decomposed_code_points” is a list of stages; stage 0 is the
        # recursive compatibility decomposition.
        decomposed_code_points = [compatibility_decompose(code_point)]
        if not decomposed_code_points[0]:
            # No compatibility decomposition; fall back to the special
            # decomposition rules (e.g. μ → u).
            if special_decompose([code_point]) != [code_point]:
                decomposed_code_points[0] = special_decompose([code_point])
        else:
            # Keep applying the special decompositions to the last
            # stage until a fixed point is reached, appending every
            # newly produced stage.
            special_decomposed_code_points = []
            while True:
                # First try to specially decompose the whole sequence
                # at once.
                special_decomposed_code_points = special_decompose(
                    decomposed_code_points[-1])
                if (special_decomposed_code_points
                        != decomposed_code_points[-1]):
                    decomposed_code_points.append(
                        special_decomposed_code_points)
                    continue
                # Then try to specially decompose each code point of
                # the sequence individually.
                special_decomposed_code_points = []
                for decomposed_code_point in decomposed_code_points[-1]:
                    special_decomposed_code_points += special_decompose(
                        [decomposed_code_point])
                if (special_decomposed_code_points
                        == decomposed_code_points[-1]):
                    break
                decomposed_code_points.append(
                    special_decomposed_code_points)
        if decomposed_code_points[0]:
            # Write the code point’s name as a comment, then the rule
            # with all stages as “;”-separated quoted alternatives.
            translit_file.write('% {:s}\n'.format(name))
            translit_file.write('{:s} '.format(
                unicode_utils.ucs_symbol(code_point)))
            for index in range(0, len(decomposed_code_points)):
                if index > 0:
                    translit_file.write(';')
                translit_file.write('"')
                for decomposed_code_point in decomposed_code_points[index]:
                    translit_file.write('{:s}'.format(
                        unicode_utils.ucs_symbol(decomposed_code_point)))
                translit_file.write('"')
            translit_file.write('\n')
        elif 'LIGATURE' in name and 'ARABIC' not in name:
            # Ligatures without a compatibility decomposition: use the
            # special ligature rules.  Arabic ligatures are deliberately
            # skipped, see special_ligature_decompose().
            decomposed_code_points = special_ligature_decompose(code_point)
            if decomposed_code_points[0] != code_point:
                translit_file.write('% {:s}\n'.format(name))
                translit_file.write('{:s} '.format(
                    unicode_utils.ucs_symbol(code_point)))
                translit_file.write('"')
                for decomposed_code_point in decomposed_code_points:
                    translit_file.write('{:s}'.format(
                        unicode_utils.ucs_symbol(decomposed_code_point)))
                translit_file.write('"')
                translit_file.write('\n')
            else:
                # A non-Arabic ligature with neither a compatibility
                # decomposition nor a special ligature rule: warn so it
                # can be added to special_ligature_decompose() if useful.
                print('Warning: unhandled ligature: {:x} {:s}'.format(
                    code_point, name))
    translit_file.write('\n')
     282  
if __name__ == "__main__":
    # Command line interface: “-u” selects the UnicodeData.txt file to
    # read, “-i” optionally gives the original translit_compat file
    # whose head and tail are preserved, “-o” names the output file.
    PARSER = argparse.ArgumentParser(
        description='''
        Generate a translit_compat file from UnicodeData.txt.
        ''')
    PARSER.add_argument(
        '-u', '--unicode_data_file',
        nargs='?',
        type=str,
        default='UnicodeData.txt',
        help=('The UnicodeData.txt file to read, '
              + 'default: %(default)s'))
    PARSER.add_argument(
        '-i', '--input_file',
        nargs='?',
        type=str,
        help=''' The original glibc/localedata/locales/translit_compat
        file.''')
    PARSER.add_argument(
        '-o', '--output_file',
        nargs='?',
        type=str,
        default='translit_compat.new',
        help='''The new translit_compat file, default: %(default)s.  If the
        original glibc/localedata/locales/translit_compat file has
        been given as an option, the header up to the
        “translit_start” line and the tail from the “translit_end”
        line to the end of the file will be copied unchanged into the
        output file.  ''')
    PARSER.add_argument(
        '--unicode_version',
        nargs='?',
        required=True,
        type=str,
        help='The Unicode version of the input files used.')
    ARGS = PARSER.parse_args()

    # Load the character attributes (name, decomposition, …) from
    # UnicodeData.txt into unicode_utils.UNICODE_ATTRIBUTES.
    unicode_utils.fill_attributes(ARGS.unicode_data_file)
    HEAD = TAIL = ''
    if ARGS.input_file:
        (HEAD, TAIL) = read_input_file(ARGS.input_file)
    # Write the output: preserved (or generated) head, the freshly
    # generated transliteration rules, then the preserved (or
    # generated) tail.
    with open(ARGS.output_file, mode='w') as TRANSLIT_FILE:
        output_head(TRANSLIT_FILE, ARGS.unicode_version, head=HEAD)
        output_transliteration(TRANSLIT_FILE)
        output_tail(TRANSLIT_FILE, tail=TAIL)