glibc-2.38/localedata/unicode-gen/utf8_compatibility.py
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# Copyright (C) 2014-2023 Free Software Foundation, Inc.
# This file is part of the GNU C Library.
#
# The GNU C Library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# The GNU C Library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with the GNU C Library; if not, see
# <https://www.gnu.org/licenses/>.

'''
This script is useful for checking the backward compatibility of a newly
generated UTF-8 file from the utf8_gen.py script.

To see how this script is used, call it with the “-h” option:

    $ ./utf8_compatibility.py -h
    … prints usage message …
'''

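# A typical invocation might look like this (the file names below are
# only examples, not fixed names):
#
#     $ ./utf8_compatibility.py -o UTF-8.old -n UTF-8 \
#           -u UnicodeData.txt -e EastAsianWidth.txt -a -m -c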
import sys
import re
import argparse
import unicode_utils

def create_charmap_dictionary(file_name):
    '''Create a dictionary for all code points found in the CHARMAP
    section of a file
    '''
    with open(file_name, mode='r') as utf8_file:
        charmap_dictionary = {}
        for line in utf8_file:
            if line.startswith('CHARMAP'):
                break
        for line in utf8_file:
            if line.startswith('END CHARMAP'):
                return charmap_dictionary
            if line.startswith('%'):
                continue
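            # Lines in the CHARMAP section look roughly like this
            # (illustrative samples, not exact quotes from the file):
            #     <U0041>     /x41 LATIN CAPITAL LETTER A
            #     <U4E00>..<U4E0F>     /xe4/xb8/x80 <CJK Ideograph>
            # i.e. a single code point or a range of code points,
            # followed by /x-escaped UTF-8 bytes.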
            match = re.match(
                r'^<U(?P<codepoint1>[0-9A-F]{4,8})>'
                +r'(?:\.\.<U(?P<codepoint2>[0-9A-F]{4,8})>)?'
                +r'\s+(?P<hexutf8>(/x[0-9a-f]{2}){1,4})',
                line)
            if not match:
                continue
            codepoint1 = match.group('codepoint1')
            codepoint2 = match.group('codepoint2')
            if not codepoint2:
                codepoint2 = codepoint1
            for i in range(int(codepoint1, 16),
                           int(codepoint2, 16) + 1):
                charmap_dictionary[i] = match.group('hexutf8')
        sys.stderr.write('No “CHARMAP” or no “END CHARMAP” found in %s\n'
                         %file_name)
        sys.exit(1)

def check_charmap(original_file_name, new_file_name):
    '''Report differences in the CHARMAP section between the old and the
    new file
    '''
    print('************************************************************')
    print('Report on CHARMAP:')
    ocharmap = create_charmap_dictionary(original_file_name)
    ncharmap = create_charmap_dictionary(new_file_name)
    print('------------------------------------------------------------')
    print('Total removed characters in newly generated CHARMAP: %d'
          %len(set(ocharmap)-set(ncharmap)))
    if ARGS.show_missing_characters:
        for key in sorted(set(ocharmap)-set(ncharmap)):
            print('removed: {:s}     {:s} {:s}'.format(
                unicode_utils.ucs_symbol(key),
                ocharmap[key],
                unicode_utils.UNICODE_ATTRIBUTES[key]['name'] \
                if key in unicode_utils.UNICODE_ATTRIBUTES else 'None'))
    print('------------------------------------------------------------')
    changed_charmap = {}
    for key in set(ocharmap).intersection(set(ncharmap)):
        if ocharmap[key] != ncharmap[key]:
            changed_charmap[key] = (ocharmap[key], ncharmap[key])
    print('Total changed characters in newly generated CHARMAP: %d'
          %len(changed_charmap))
    if ARGS.show_changed_characters:
        for key in sorted(changed_charmap):
            print('changed: {:s}     {:s}->{:s} {:s}'.format(
                unicode_utils.ucs_symbol(key),
                changed_charmap[key][0],
                changed_charmap[key][1],
                unicode_utils.UNICODE_ATTRIBUTES[key]['name'] \
                if key in unicode_utils.UNICODE_ATTRIBUTES else 'None'))
    print('------------------------------------------------------------')
    print('Total added characters in newly generated CHARMAP: %d'
          %len(set(ncharmap)-set(ocharmap)))
    if ARGS.show_added_characters:
        for key in sorted(set(ncharmap)-set(ocharmap)):
            print('added: {:s}     {:s} {:s}'.format(
                unicode_utils.ucs_symbol(key),
                ncharmap[key],
                unicode_utils.UNICODE_ATTRIBUTES[key]['name'] \
                if key in unicode_utils.UNICODE_ATTRIBUTES else 'None'))

def create_width_dictionary(file_name):
    '''Create a dictionary for all code points found in the WIDTH
    section of a file
    '''
    with open(file_name, mode='r') as utf8_file:
        width_dictionary = {}
        for line in utf8_file:
            if line.startswith('WIDTH'):
                break
        for line in utf8_file:
            if line.startswith('END WIDTH'):
                return width_dictionary
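            # Lines in the WIDTH section look roughly like this
            # (illustrative samples, not exact quotes from the file):
            #     <U0300> 0
            #     <U1100>...<U115F> 2
            # i.e. a single code point or a range (three dots here),
            # followed by a width of 0 or 2.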
            match = re.match(
                r'^<U(?P<codepoint1>[0-9A-F]{4,8})>'
                +r'(?:\.\.\.<U(?P<codepoint2>[0-9A-F]{4,8})>)?'
                +r'\s+(?P<width>[02])',
                line)
            if not match:
                continue
            codepoint1 = match.group('codepoint1')
            codepoint2 = match.group('codepoint2')
            if not codepoint2:
                codepoint2 = codepoint1
            for i in range(int(codepoint1, 16),
                           int(codepoint2, 16) + 1):
                width_dictionary[i] = int(match.group('width'))
        sys.stderr.write('No “WIDTH” or no “END WIDTH” found in %s\n'
                         %file_name)
        sys.exit(1)

def check_width(original_file_name, new_file_name):
    '''Report differences in the WIDTH section between the old and the new
    file
    '''
    print('************************************************************')
    print('Report on WIDTH:')
    owidth = create_width_dictionary(original_file_name)
    nwidth = create_width_dictionary(new_file_name)
    print('------------------------------------------------------------')
    print('Total removed characters in newly generated WIDTH: %d'
          %len(set(owidth)-set(nwidth)))
    print('(Characters not in WIDTH get width 1 by default, '
          + 'i.e. these have width 1 now.)')
    if ARGS.show_missing_characters:
        for key in sorted(set(owidth)-set(nwidth)):
            print('removed: {:s} '.format(unicode_utils.ucs_symbol(key))
                  + '{:d} : '.format(owidth[key])
                  + 'eaw={:s} '.format(
                      unicode_utils.EAST_ASIAN_WIDTHS[key]
                      if key in unicode_utils.EAST_ASIAN_WIDTHS else 'None')
                  + 'category={:2s} '.format(
                      unicode_utils.UNICODE_ATTRIBUTES[key]['category']
                      if key in unicode_utils.UNICODE_ATTRIBUTES else 'None')
                  + 'bidi={:3s} '.format(
                      unicode_utils.UNICODE_ATTRIBUTES[key]['bidi']
                      if key in unicode_utils.UNICODE_ATTRIBUTES else 'None')
                  + 'name={:s}'.format(
                      unicode_utils.UNICODE_ATTRIBUTES[key]['name']
                      if key in unicode_utils.UNICODE_ATTRIBUTES else 'None'))
    print('------------------------------------------------------------')
    changed_width = {}
    for key in set(owidth).intersection(set(nwidth)):
        if owidth[key] != nwidth[key]:
            changed_width[key] = (owidth[key], nwidth[key])
    print('Total changed characters in newly generated WIDTH: %d'
          %len(changed_width))
    if ARGS.show_changed_characters:
        for key in sorted(changed_width):
            print('changed width: {:s} '.format(unicode_utils.ucs_symbol(key))
                  + '{:d}->{:d} : '.format(changed_width[key][0],
                                           changed_width[key][1])
                  + 'eaw={:s} '.format(
                      unicode_utils.EAST_ASIAN_WIDTHS[key]
                      if key in unicode_utils.EAST_ASIAN_WIDTHS else 'None')
                  + 'category={:2s} '.format(
                      unicode_utils.UNICODE_ATTRIBUTES[key]['category']
                      if key in unicode_utils.UNICODE_ATTRIBUTES else 'None')
                  + 'bidi={:3s} '.format(
                      unicode_utils.UNICODE_ATTRIBUTES[key]['bidi']
                      if key in unicode_utils.UNICODE_ATTRIBUTES else 'None')
                  + 'name={:s}'.format(
                      unicode_utils.UNICODE_ATTRIBUTES[key]['name']
                      if key in unicode_utils.UNICODE_ATTRIBUTES else 'None'))
    print('------------------------------------------------------------')
    print('Total added characters in newly generated WIDTH: %d'
          %len(set(nwidth)-set(owidth)))
    print('(Characters not in WIDTH get width 1 by default, '
          + 'i.e. these had width 1 before.)')
    if ARGS.show_added_characters:
        for key in sorted(set(nwidth)-set(owidth)):
            print('added: {:s} '.format(unicode_utils.ucs_symbol(key))
                  + '{:d} : '.format(nwidth[key])
                  + 'eaw={:s} '.format(
                      unicode_utils.EAST_ASIAN_WIDTHS[key]
                      if key in unicode_utils.EAST_ASIAN_WIDTHS else 'None')
                  + 'category={:2s} '.format(
                      unicode_utils.UNICODE_ATTRIBUTES[key]['category']
                      if key in unicode_utils.UNICODE_ATTRIBUTES else 'None')
                  + 'bidi={:3s} '.format(
                      unicode_utils.UNICODE_ATTRIBUTES[key]['bidi']
                      if key in unicode_utils.UNICODE_ATTRIBUTES else 'None')
                  + 'name={:s}'.format(
                      unicode_utils.UNICODE_ATTRIBUTES[key]['name']
                      if key in unicode_utils.UNICODE_ATTRIBUTES else 'None'))

if __name__ == "__main__":
    PARSER = argparse.ArgumentParser(
        description='''
        Compare the CHARMAP and WIDTH sections of two UTF-8 files and
        report the differences.
        ''')
    PARSER.add_argument(
        '-o', '--old_utf8_file',
        nargs='?',
        required=True,
        type=str,
        help='The old UTF-8 file.')
    PARSER.add_argument(
        '-n', '--new_utf8_file',
        nargs='?',
        required=True,
        type=str,
        help='The new UTF-8 file.')
    PARSER.add_argument(
        '-u', '--unicode_data_file',
        nargs='?',
        type=str,
        help='The UnicodeData.txt file to read.')
    PARSER.add_argument(
        '-e', '--east_asian_width_file',
        nargs='?',
        type=str,
        help='The EastAsianWidth.txt file to read.')
    PARSER.add_argument(
        '-a', '--show_added_characters',
        action='store_true',
        help='Show characters which were added in detail.')
    PARSER.add_argument(
        '-m', '--show_missing_characters',
        action='store_true',
        help='Show characters which were removed in detail.')
    PARSER.add_argument(
        '-c', '--show_changed_characters',
        action='store_true',
        help='Show characters which were changed in detail.')
    ARGS = PARSER.parse_args()

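    # UnicodeData.txt and EastAsianWidth.txt are optional; without them
    # the detailed listings print 'None' instead of character names,
    # categories, bidi classes, and East Asian widths.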
    if ARGS.unicode_data_file:
        unicode_utils.fill_attributes(ARGS.unicode_data_file)
    if ARGS.east_asian_width_file:
        unicode_utils.fill_east_asian_widths(ARGS.east_asian_width_file)
    check_charmap(ARGS.old_utf8_file, ARGS.new_utf8_file)
    check_width(ARGS.old_utf8_file, ARGS.new_utf8_file)