glibc-2.38/localedata/unicode-gen/gen_translit_combining.py

#!/usr/bin/python3
# -*- coding: utf-8 -*-
#
# Generate a translit_combining file from a UnicodeData file.
# Copyright (C) 2015-2023 Free Software Foundation, Inc.
# This file is part of the GNU C Library.
#
# The GNU C Library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# The GNU C Library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with the GNU C Library; if not, see
# <https://www.gnu.org/licenses/>.

'''
Generate a translit_combining file from UnicodeData.txt

To see how this script is used, call it with the “-h” option:

    $ ./gen_translit_combining.py -h
    … prints usage message …
'''

import argparse
import time
import unicode_utils

def read_input_file(filename):
    '''Reads the original glibc translit_combining file to get the
    original head and tail.

    We want to replace only the part of the file between
    “translit_start” and “translit_end”.
    '''
    head = tail = ''
    with open(filename, mode='r') as translit_file:
        for line in translit_file:
            head = head + line
            if line.startswith('translit_start'):
                break
        for line in translit_file:
            if line.startswith('translit_end'):
                tail = line
                break
        for line in translit_file:
            tail = tail + line
    return (head, tail)
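
# A sketch of the file layout read_input_file() expects, matching what
# output_head() and output_tail() below produce (the middle part is the
# generated content that this script replaces):
#
#     escape_char /
#     comment_char %
#     ...
#     translit_start    <- “head” is everything up to and including this line
#     ... generated entries, regenerated on each run of this script ...
#     translit_end      <- “tail” is this line and everything after it
#
#     END LC_CTYPE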

def output_head(translit_file, unicode_version, head=''):
    '''Write the header of the output file, i.e. the part of the file
    before the “translit_start” line.
    '''
    if ARGS.input_file and head:
        translit_file.write(head)
    else:
        translit_file.write('escape_char /\n')
        translit_file.write('comment_char %\n')
        translit_file.write(unicode_utils.COMMENT_HEADER)
        translit_file.write('\n')
        translit_file.write('% Transliterations that remove all ')
        translit_file.write('combining characters (accents,\n')
        translit_file.write('% pronunciation marks, etc.).\n')
        translit_file.write('% Generated automatically from UnicodeData.txt '
                            + 'by gen_translit_combining.py '
                            + 'on {:s} '.format(time.strftime('%Y-%m-%d'))
                            + 'for Unicode {:s}.\n'.format(unicode_version))
        translit_file.write('\n')
        translit_file.write('LC_CTYPE\n')
        translit_file.write('\n')
        translit_file.write('translit_start\n')

def output_tail(translit_file, tail=''):
    '''Write the tail of the output file'''
    if ARGS.input_file and tail:
        translit_file.write(tail)
    else:
        translit_file.write('translit_end\n')
        translit_file.write('\n')
        translit_file.write('END LC_CTYPE\n')

def is_combining_remove(code_point):
    '''Check whether this is a combining character which should be listed
    in the section of the translit_combining file where combining
    characters are replaced by empty strings.

    We ignore combining characters from many scripts here because
    the original translit_combining file didn’t replace the combining
    characters from these scripts either, and I am not yet sure
    whether doing this for all combining characters would be useful.
    For the moment I think it is better to keep close to the spirit
    of the original file.
    '''
    if not unicode_utils.is_combining(code_point):
        return False
    name = unicode_utils.UNICODE_ATTRIBUTES[code_point]['name']
    for substring in ('DEVANAGARI',
                      'BENGALI',
                      'CYRILLIC',
                      'SYRIAC',
                      'THAANA',
                      'NKO',
                      'GURMUKHI',
                      'TAMIL',
                      'GUJARATI',
                      'ORIYA',
                      'TELUGU',
                      'KANNADA',
                      'MALAYALAM',
                      'SINHALA',
                      'THAI',
                      'LAO',
                      'TIBETAN',
                      'MYANMAR',
                      'ETHIOPIC',
                      'TAGALOG',
                      'HANUNOO',
                      'BUHID',
                      'TAGBANWA',
                      'KHMER',
                      'MONGOLIAN',
                      'LIMBU',
                      'NEW TAI LUE',
                      'BUGINESE',
                      'BALINESE',
                      'SUNDANESE',
                      'LEPCHA',
                      'IDEOGRAPHIC',
                      'HANGUL',
                      'SYLOTI',
                      'SAURASHTRA',
                      'KAYAH',
                      'REJANG',
                      'CHAM',
                      'VARIATION SELECTOR',
                      'KHAROSHTHI',
                      'MUSICAL SYMBOL',
                      'SAMARITAN',
                      'MANDAIC',
                      'TAI THAM',
                      'BATAK',
                      'VEDIC',
                      'COPTIC',
                      'TIFINAGH',
                      'BAMUM',
                      'JAVANESE',
                      'TAI VIET',
                      'MEETEI',
                      'MANICHAEAN',
                      'BRAHMI',
                      'KAITHI',
                      'CHAKMA',
                      'MAHAJANI',
                      'SHARADA',
                      'KHOJKI',
                      'KHUDAWADI',
                      'GRANTHA',
                      'TIRHUTA',
                      'SIDDHAM',
                      'MODI VOWEL',
                      'MODI SIGN',
                      'TAKRI',
                      'BASSA VAH',
                      'PAHAWH HMONG',
                      'MIAO',
                      'DUPLOYAN',
                      'MENDE KIKAKUI',
                      'AHOM',
                      'SIGNWRITING'):
        if substring in name:
            return False
    return True
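
# For example (a sketch, assuming unicode_utils.fill_attributes() has
# already been called so that UNICODE_ATTRIBUTES is populated):
#
#     is_combining_remove(0x0301)  # True: COMBINING ACUTE ACCENT is removed
#     is_combining_remove(0x0951)  # False: DEVANAGARI STRESS SIGN UDATTA is
#                                  # kept, DEVANAGARI is excluded above
#     is_combining_remove(0x0041)  # False: A is not a combining character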

def canonical_decompose(code_point):
    '''http://www.unicode.org/reports/tr44/#Character_Decomposition_Mappings

    In some instances a canonical mapping or a compatibility mapping
    may consist of a single character. For a canonical mapping, this
    indicates that the character is a canonical equivalent of another
    single character. For a compatibility mapping, this indicates that
    the character is a compatibility equivalent of another single
    character.

    A canonical mapping may also consist of a pair of characters, but
    is never longer than two characters. When a canonical mapping
    consists of a pair of characters, the first character may itself
    be a character with a decomposition mapping, but the second
    character never has a decomposition mapping.

    We ignore the canonical decomposition for code points
    matching certain substrings because the original translit_combining
    file didn’t include these types of characters either. I am unsure
    about the usefulness of including them and want to keep close
    to the spirit of the original file for the moment.
    '''
    name = unicode_utils.UNICODE_ATTRIBUTES[code_point]['name']
    for substring in ('MUSICAL SYMBOL',
                      'CJK COMPATIBILITY IDEOGRAPH',
                      'BALINESE',
                      'KAITHI LETTER',
                      'CHAKMA VOWEL',
                      'GRANTHA VOWEL',
                      'TIRHUTA VOWEL',
                      'SIDDHAM VOWEL'):
        if substring in name:
            return []
    decomposition = unicode_utils.UNICODE_ATTRIBUTES[
        code_point]['decomposition']
    if decomposition and not decomposition.startswith('<'):
        decomposed_code_points = [int(x, 16) for x in decomposition.split(' ')]
        if decomposed_code_points:
            cd0 = canonical_decompose(decomposed_code_points[0])
            if cd0:
                decomposed_code_points = cd0 + decomposed_code_points[1:]
        return decomposed_code_points
    else:
        return []
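
# A sketch of the recursion, assuming UNICODE_ATTRIBUTES has been filled
# from UnicodeData.txt (values as in the Unicode character database):
#
#     canonical_decompose(0x00C1)  # Á -> [0x0041, 0x0301] (A + acute)
#     canonical_decompose(0x1EA4)  # Ấ -> [0x0041, 0x0302, 0x0301]:
#         # the first element of the pair “00C2 0301” is itself decomposed
#         # recursively to “0041 0302”; the second element never is
#     canonical_decompose(0x0041)  # A -> [] (no canonical decomposition)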

def special_decompose(code_point_list):
    '''
    Decompositions which are not canonical, or which are not in
    UnicodeData.txt at all. Some of these were used in the original
    translit_combining file in glibc and they seemed to make sense.
    I want to keep the update of translit_combining close to the
    spirit of the original file, therefore I added these special
    decomposition rules here.
    '''
    special_decompose_dict = {
        # Ø U+00D8 is already handled in translit_neutral. But
        # translit_combining is usually included after translit_neutral
        # and Ǿ U+01FE LATIN CAPITAL LETTER O WITH STROKE AND ACUTE
        # has a canonical decomposition to Ø U+00D8 and we want to
        # further decompose this to U+004F.
        (0x00D8,): [0x004F], # Ø → O
        # ø U+00F8 is already handled in translit_neutral. But
        # translit_combining is usually included after translit_neutral
        # and ǿ U+01FF LATIN SMALL LETTER O WITH STROKE AND ACUTE
        # has a canonical decomposition to ø U+00F8 and we want to
        # further decompose this to U+006F.
        (0x00F8,): [0x006F], # ø → o
        # æ U+00E6 is already in translit_compat because ligatures
        # are handled in translit_compat. But ǣ U+01E3 has a
        # canonical decomposition to U+00E6, U+0304 and we want to
        # further decompose this to “ae”.
        (0x00E6,): [0x0061, 0x0065], # æ → ae
        # Æ U+00C6 is already in translit_compat because ligatures
        # are handled in translit_compat. But Ǣ U+01E2 has a
        # canonical decomposition to U+00C6, U+0304 and we want to
        # further decompose this to “AE”.
        (0x00C6,): [0x0041, 0x0045], # Æ → AE
        # U+05F2 HEBREW LIGATURE YIDDISH DOUBLE YOD is already in
        # translit_compat because ligatures are handled in translit_compat.
        # But U+FB1F has a canonical decomposition to U+05F2 and
        # we want to further decompose this to U+05D9, U+05D9.
        (0x05F2,): [0x05D9, 0x05D9], # ײ → יי
        # U+2002 has a <compat> decomposition to U+0020 in UnicodeData.txt.
        # But U+2000 EN QUAD has a canonical decomposition to U+2002
        # and we want to further decompose this to U+0020.
        (0x2002,): [0x0020], # EN SPACE → SPACE
        # U+2003 has a <compat> decomposition to U+0020 in UnicodeData.txt.
        # But U+2001 EM QUAD has a canonical decomposition to U+2003
        # and we want to further decompose this to U+0020.
        (0x2003,): [0x0020], # EM SPACE → SPACE
        # U+2260 ≠ has the canonical decomposition U+003D U+0338
        # (= followed by ̸). After stripping the combining characters,
        # the result is only =, which reverses the meaning.
        # Therefore, we add special rules here for such mathematical
        # negations:
        (0x21AE,): [0x0021, 0x003C, 0x002D, 0x003E], # ↮ → !<->
        (0x21CD,): [0x0021, 0x003C, 0x003D], # ⇍ → !<=
        (0x21CE,): [0x0021, 0x003C, 0x003D, 0x003E], # ⇎ → !<=>
        (0x21CF,): [0x0021, 0x003D, 0x003E], # ⇏ → !=>
        (0x2204,): [0x0021, 0x2203], # ∄ → !∃
        (0x2209,): [0x0021, 0x2208], # ∉ → !∈
        (0x220C,): [0x0021, 0x220B], # ∌ → !∋
        (0x2224,): [0x0021, 0x2223], # ∤ → !∣
        (0x2226,): [0x0021, 0x2225], # ∦ → !∥
        (0x2241,): [0x0021, 0x007E], # ≁ → !~
        (0x2244,): [0x0021, 0x007E, 0x002D], # ≄ → !~-
        (0x2247,): [0x0021, 0x007E, 0x003D], # ≇ → !~=
        (0x2249,): [0x0021, 0x007E, 0x007E], # ≉ → !~~
        (0x2260,): [0x0021, 0x003D], # ≠ → !=
        (0x2262,): [0x0021, 0x003D, 0x003D], # ≢ → !==
        (0x226D,): [0x0021, 0x224D], # ≭ → !≍
        (0x226E,): [0x0021, 0x003C], # ≮ → !<
        (0x226F,): [0x0021, 0x003E], # ≯ → !>
        (0x2270,): [0x0021, 0x003C, 0x003D], # ≰ → !<=
        (0x2271,): [0x0021, 0x003E, 0x003D], # ≱ → !>=
        (0x2274,): [0x0021, 0x003C, 0x007E], # ≴ → !<~
        (0x2275,): [0x0021, 0x003E, 0x007E], # ≵ → !>~
        (0x2278,): [0x0021, 0x003C, 0x003E], # ≸ → !<>
        (0x2279,): [0x0021, 0x003E, 0x003C], # ≹ → !><
        (0x2280,): [0x0021, 0x227A], # ⊀ → !≺
        (0x2281,): [0x0021, 0x227B], # ⊁ → !≻
        (0x2284,): [0x0021, 0x2282], # ⊄ → !⊂
        (0x2285,): [0x0021, 0x2283], # ⊅ → !⊃
        (0x2288,): [0x0021, 0x2282, 0x003D], # ⊈ → !⊂=
        (0x2289,): [0x0021, 0x2283, 0x003D], # ⊉ → !⊃=
        (0x22AC,): [0x0021, 0x22A2], # ⊬ → !⊢
        (0x22AD,): [0x0021, 0x22A8], # ⊭ → !⊨
        (0x22AE,): [0x0021, 0x22A9], # ⊮ → !⊩
        (0x22AF,): [0x0021, 0x22AB], # ⊯ → !⊫
        (0x22E0,): [0x0021, 0x227C], # ⋠ → !≼
        (0x22E1,): [0x0021, 0x227D], # ⋡ → !≽
        (0x22E2,): [0x0021, 0x2291], # ⋢ → !⊑
        (0x22E3,): [0x0021, 0x2292], # ⋣ → !⊒
        (0x22EA,): [0x0021, 0x22B2], # ⋪ → !⊲
        (0x22EB,): [0x0021, 0x22B3], # ⋫ → !⊳
        (0x22EC,): [0x0021, 0x22B4], # ⋬ → !⊴
        (0x22ED,): [0x0021, 0x22B5], # ⋭ → !⊵
        (0x2ADC,): [0x0021, 0x2ADD], # ⫝̸ → !⫝
        # Special rule for 〈 U+3008 is added
        # because 〈 U+2329 has the canonical decomposition U+3008
        # and we want to further decompose this to < U+003C.
        (0x3008,): [0x003C], # 〈 → <
        # Special rule for 〉 U+3009 is added
        # because 〉 U+232A has the canonical decomposition U+3009
        # and we want to further decompose this to > U+003E.
        (0x3009,): [0x003E], # 〉 → >
    }
    # Return the special decomposition if one exists, otherwise the
    # input unchanged:
    return special_decompose_dict.get(tuple(code_point_list),
                                      code_point_list)
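
# Example (a sketch; the values come from the table above):
#
#     special_decompose([0x00D8])  # -> [0x004F], Ø becomes O
#     special_decompose([0x2260])  # -> [0x0021, 0x003D], ≠ becomes !=
#     special_decompose([0x0041])  # -> [0x0041], unchanged: no special rule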

def output_combining_remove(translit_file):
    '''Write the section of the translit_combining file where combining
    characters are replaced by empty strings.
    '''
    translit_file.write('\n')
    for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES):
        name = unicode_utils.UNICODE_ATTRIBUTES[code_point]['name']
        if is_combining_remove(code_point):
            translit_file.write('% {:s}\n'.format(name))
            translit_file.write('{:s} ""\n'.format(
                unicode_utils.ucs_symbol(code_point)))
    translit_file.write('\n')
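
# The emitted lines look like this (a sketch; the exact set of entries
# depends on the UnicodeData.txt in use):
#
#     % COMBINING GRAVE ACCENT
#     <U0300> ""
#     % COMBINING ACUTE ACCENT
#     <U0301> ""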

def output_decompositions(translit_file):
    '''Write the section of the translit_combining file where characters
    are decomposed and the combining characters are stripped from
    the decompositions.
    '''
    for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES):
        # Start from the special decomposition if one exists,
        # otherwise from the canonical decomposition:
        if special_decompose([code_point]) != [code_point]:
            decomposed_code_points = [special_decompose([code_point])]
        else:
            decomposed_code_points = [canonical_decompose(code_point)]
        if decomposed_code_points[0]:
            # Keep applying special decompositions, first to the whole
            # sequence and then code point by code point, recording each
            # intermediate step, until a fixed point is reached:
            while True:
                special_decomposed_code_points = special_decompose(
                    decomposed_code_points[-1])
                if (special_decomposed_code_points
                        != decomposed_code_points[-1]):
                    decomposed_code_points.append(
                        special_decomposed_code_points)
                    continue
                special_decomposed_code_points = []
                for decomposed_code_point in decomposed_code_points[-1]:
                    special_decomposed_code_points += special_decompose(
                        [decomposed_code_point])
                if (special_decomposed_code_points
                        == decomposed_code_points[-1]):
                    break
                decomposed_code_points.append(
                    special_decomposed_code_points)
            # Strip the combining characters from every recorded step:
            for index in range(0, len(decomposed_code_points)):
                decomposed_code_points[index] = [
                    x for x in decomposed_code_points[index]
                    if not is_combining_remove(x)]
        if decomposed_code_points[0]:
            # Write all steps as “;”-separated alternatives;
            # multi-character replacements are double-quoted:
            translit_file.write('% {:s}\n'.format(
                unicode_utils.UNICODE_ATTRIBUTES[code_point]['name']))
            translit_file.write('{:s} '.format(
                unicode_utils.ucs_symbol(code_point)))
            for index in range(0, len(decomposed_code_points)):
                if index > 0:
                    translit_file.write(';')
                if len(decomposed_code_points[index]) > 1:
                    translit_file.write('"')
                for decomposed_code_point in decomposed_code_points[index]:
                    translit_file.write('{:s}'.format(
                        unicode_utils.ucs_symbol(decomposed_code_point)))
                if len(decomposed_code_points[index]) > 1:
                    translit_file.write('"')
            translit_file.write('\n')
    translit_file.write('\n')
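
# A sample of the generated lines (a sketch, derived from the rules above):
#
#     % LATIN CAPITAL LETTER A WITH ACUTE
#     <U00C1> <U0041>
#     % LATIN CAPITAL LETTER AE WITH MACRON
#     <U01E2> <U00C6>;"<U0041><U0045>"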

def output_transliteration(translit_file):
    '''Write the new transliteration to the output file'''
    output_combining_remove(translit_file)
    output_decompositions(translit_file)

if __name__ == "__main__":
    PARSER = argparse.ArgumentParser(
        description='''
        Generate a translit_combining file from UnicodeData.txt.
        ''')
    PARSER.add_argument(
        '-u', '--unicode_data_file',
        nargs='?',
        type=str,
        default='UnicodeData.txt',
        help=('The UnicodeData.txt file to read, '
              + 'default: %(default)s'))
    PARSER.add_argument(
        '-i', '--input_file',
        nargs='?',
        type=str,
        help='''The original glibc/localedata/locales/translit_combining
        file.''')
    PARSER.add_argument(
        '-o', '--output_file',
        nargs='?',
        type=str,
        default='translit_combining.new',
        help='''The new translit_combining file, default: %(default)s.  If the
        original glibc/localedata/locales/translit_combining file has
        been given as an option, the header up to the
        “translit_start” line and the tail from the “translit_end”
        line to the end of the file will be copied unchanged into the
        output file.''')
    PARSER.add_argument(
        '--unicode_version',
        nargs='?',
        required=True,
        type=str,
        help='The Unicode version of the input files used.')
    ARGS = PARSER.parse_args()

    unicode_utils.fill_attributes(ARGS.unicode_data_file)
    HEAD = TAIL = ''
    if ARGS.input_file:
        (HEAD, TAIL) = read_input_file(ARGS.input_file)
    with open(ARGS.output_file, mode='w') as TRANSLIT_FILE:
        output_head(TRANSLIT_FILE, ARGS.unicode_version, head=HEAD)
        output_transliteration(TRANSLIT_FILE)
        output_tail(TRANSLIT_FILE, tail=TAIL)
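
# A typical invocation (a sketch; the input path is hypothetical and the
# version string must match the UnicodeData.txt actually used, e.g.
# 15.0.0 for glibc 2.38):
#
#     ./gen_translit_combining.py \
#         -u UnicodeData.txt \
#         -i ../locales/translit_combining \
#         -o translit_combining.new \
#         --unicode_version 15.0.0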