1  #!/usr/bin/python3
       2  # -*- coding: utf-8 -*-
       3  # Copyright (C) 2014-2023 Free Software Foundation, Inc.
       4  # This file is part of the GNU C Library.
       5  #
       6  # The GNU C Library is free software; you can redistribute it and/or
       7  # modify it under the terms of the GNU Lesser General Public
       8  # License as published by the Free Software Foundation; either
       9  # version 2.1 of the License, or (at your option) any later version.
      10  #
      11  # The GNU C Library is distributed in the hope that it will be useful,
      12  # but WITHOUT ANY WARRANTY; without even the implied warranty of
      13  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
      14  # Lesser General Public License for more details.
      15  #
      16  # You should have received a copy of the GNU Lesser General Public
      17  # License along with the GNU C Library; if not, see
      18  # <https://www.gnu.org/licenses/>.
      19  
      20  '''
      21  This script is useful for checking the differences between
an old LC_CTYPE file /usr/share/i18n/locales/i18n and a
      23  new one generated by gen_unicode_ctype.py
      24  
      25  To see how it is used, call it with the “-h” option:
      26  
      27      $ ./ctype_compatibility.py -h
      28      … prints usage message …
      29  '''
      30  
      31  import sys
      32  import re
      33  import unicodedata
      34  import argparse
      35  
      36  from ctype_compatibility_test_cases import TEST_CASES
      37  
      38  def get_lines_from_file(filename):
      39      '''Get all non-comment lines from a i18n file
      40  
      41      Also merge all lines which are continued on the next line because
      42      they end in “/” into a single line.
      43      '''
      44      with open(filename) as i18n_file:
      45          current_line = ''
      46          for line in i18n_file:
      47              line = line.strip('\n')
      48              if '%' in line:
      49                  if line.endswith('/'):
      50                      line = line[0:line.find('%')] + '/'
      51                  else:
      52                      line = line[0:line.find('%')]
      53              line = line.strip()
      54              if line.endswith('/'):
      55                  current_line += line[:-1]
      56              else:
      57                  yield current_line + line
      58                  current_line = ''
      59      if current_line: # file ends with a continuation line
      60          yield current_line
      61  
      62  def extract_character_classes(filename):
      63      '''Get all Unicode code points for each character class from a file
      64  
      65      Store these code points in a dictionary using the character classes
      66      as keys and the list of code points in this character class as values.
      67  
      68      In case  of the character classes “toupper”, “tolower”, and “totitle”,
      69      these area actually pairs of code points
      70      '''
      71      ctype_dict = {}
      72      for line in get_lines_from_file(filename):
      73          for char_class in [
      74                  'upper',
      75                  'lower',
      76                  'alpha',
      77                  'digit',
      78                  'outdigit',
      79                  'space',
      80                  'cntrl',
      81                  'punct',
      82                  'graph',
      83                  'print',
      84                  'xdigit',
      85                  'blank',
      86                  'combining',
      87                  'combining_level3',
      88                  'toupper',
      89                  'tolower',
      90                  'totitle']:
      91              match = re.match(r'^('
      92                               +'(?:(?:class|map)\s+")'
      93                               +re.escape(char_class)+
      94                               '(?:";)\s+'
      95                               +'|'
      96                               +re.escape(char_class)+'\s+'
      97                               +')', line)
      98              if match:
      99                  if char_class not in ctype_dict:
     100                      ctype_dict[char_class] = []
     101                  process_chars(
     102                      ctype_dict[char_class],
     103                      line[match.end():])
     104      return ctype_dict
     105  
     106  def process_chars(char_class_list, code_point_line):
     107      '''
     108      Extract Unicode values from code_point_line
     109      and add to the list of code points in a character class
     110      '''
     111      for code_points in code_point_line.split(';'):
     112          code_points = code_points.strip()
     113          match = re.match(r'^<U(?P<codepoint>[0-9A-F]{4,8})>$', code_points)
     114          if match: # <Uxxxx>
     115              char_class_list.append(
     116                  int(match.group('codepoint'), 16))
     117              continue
     118          match = re.match(
     119              r'^<U(?P<codepoint1>[0-9A-F]{4,8})>'
     120              +'\.\.'+
     121              '<U(?P<codepoint2>[0-9A-F]{4,8})>$',
     122              code_points)
     123          if match: # <Uxxxx>..<Uxxxx>
     124              for codepoint in range(
     125                      int(match.group('codepoint1'), 16),
     126                      int(match.group('codepoint2'), 16) + 1):
     127                  char_class_list.append(codepoint)
     128              continue
     129          match = re.match(
     130              r'^<U(?P<codepoint1>[0-9A-F]{4,8})>'
     131              +'\.\.\(2\)\.\.'+
     132              '<U(?P<codepoint2>[0-9A-F]{4,8})>$',
     133              code_points)
     134          if match: # <Uxxxx>..(2)..<Uxxxx>
     135              for codepoint in range(
     136                      int(match.group('codepoint1'), 16),
     137                      int(match.group('codepoint2'), 16) + 1,
     138                      2):
     139                  char_class_list.append(codepoint)
     140              continue
     141          match = re.match(
     142              r'^\('
     143              +'<U(?P<codepoint1>[0-9A-F]{4,8})>'
     144              +','+
     145              '<U(?P<codepoint2>[0-9A-F]{4,8})>'
     146              +'\)$',
     147              code_points)
     148          if match: # (<Uxxxx>,<Uxxxx>)
     149              char_class_list.append((
     150                  int(match.group('codepoint1'), 16),
     151                  int(match.group('codepoint2'), 16)))
     152              continue
     153          sys.stderr.write(
     154              ('None of the regexps matched '
     155               + 'code_points=%(cp)s in code_point_line=%(cpl)s\n') %{
     156              'cp': code_points,
     157              'cpl': code_point_line
     158          })
     159          exit(1)
     160  
     161  def compare_lists(old_ctype_dict, new_ctype_dict):
     162      '''Compare character classes in the old and the new LC_CTYPE'''
     163      print('****************************************************')
     164      print('Character classes which are only in the new '
     165            + 'or only in the old file:')
     166      for char_class in sorted(old_ctype_dict):
     167          if char_class not in new_ctype_dict:
     168              print('Character class %s is in old ctype but not in new ctype'
     169                    %char_class)
     170      for char_class in sorted(new_ctype_dict):
     171          if char_class not in old_ctype_dict:
     172              print('Character class %s is in new ctype but not in old ctype'
     173                    %char_class)
     174      for char_class in sorted(old_ctype_dict):
     175          print("****************************************************")
     176          print("%s: %d chars in old ctype and %d chars in new ctype" %(
     177              char_class,
     178              len(old_ctype_dict[char_class]),
     179              len(new_ctype_dict[char_class])))
     180          print("----------------------------------------------------")
     181          report(char_class,
     182                 old_ctype_dict[char_class],
     183                 new_ctype_dict[char_class])
     184  
     185  def report_code_points(char_class, code_point_list, text=''):
     186      '''Report all code points which have been added to or removed from a
     187      character class.
     188      '''
     189      for code_point in sorted(code_point_list):
     190          if type(code_point) == type(int()):
     191              print('%(char_class)s: %(text)s: %(char)s %(code_point)s %(name)s'
     192                    %{'text': text,
     193                      'char': chr(code_point),
     194                      'char_class': char_class,
     195                      'code_point': hex(code_point),
     196                      'name': unicodedata.name(chr(code_point), 'name unknown')})
     197          else:
     198              print(('%(char_class)s: %(text)s: '
     199                     + '%(char0)s → %(char1)s '
     200                     + '%(code_point0)s → %(code_point1)s '
     201                     + '%(name0)s → %(name1)s') %{
     202                  'text': text,
     203                  'char_class': char_class,
     204                  'char0': chr(code_point[0]),
     205                  'code_point0': hex(code_point[0]),
     206                  'name0': unicodedata.name(chr(code_point[0]), 'name unknown'),
     207                  'char1': chr(code_point[1]),
     208                  'code_point1': hex(code_point[1]),
     209                  'name1': unicodedata.name(chr(code_point[1]), 'name unknown')
     210              })
     211  
     212  def report(char_class, old_list, new_list):
     213      '''Report the differences for a certain LC_CTYPE character class
     214      between the old and the newly generated state
     215      '''
     216      missing_chars = list(set(old_list)-set(new_list))
     217      print(('%(char_class)s: Missing %(number)d characters '
     218             + 'of old ctype in new ctype ')
     219            %{'char_class': char_class, 'number': len(missing_chars)})
     220      if ARGS.show_missing_characters:
     221          report_code_points(char_class, missing_chars, 'Missing')
     222      added_chars = list(set(new_list)-set(old_list))
     223      print(('%(char_class)s: Added %(number)d characters '
     224             + 'in new ctype which were not in old ctype')
     225            %{'char_class': char_class, 'number': len(added_chars)})
     226      if ARGS.show_added_characters:
     227          report_code_points(char_class, added_chars, 'Added')
     228  
     229  
     230  def cperror(error_message, errorcounter=0):
     231      '''Increase number of errors by one and print an error message'''
     232      print(error_message)
     233      return errorcounter + 1
     234  
     235  def cpcheck(ctype_dict, code_point_list_with_ranges, char_classes, reason='',
     236              errorcounter=0):
     237      '''The parameter “code_point_list_with_ranges” is a list of
     238      integers or pairs of integers, for example:
     239  
     240      [0x0E31, (0x0E34, 0x0E3A), (0x0E47, 0x0E4E)]
     241  
     242      where the pairs of integers stand for all the code points in the range
     243      of the two integers given, including the two integers of the pair.
     244  
     245      '''
     246      for code_point_range in code_point_list_with_ranges:
     247          for code_point in ([code_point_range]
     248                             if type(code_point_range) == type(int())
     249                             else range(code_point_range[0],
     250                                        code_point_range[1]+1)):
     251              for char_class_tuple in char_classes:
     252                  char_class = char_class_tuple[0]
     253                  in_char_class = char_class_tuple[1]
     254                  if (code_point in ctype_dict[char_class]) != in_char_class:
     255                      errorcounter = cperror(
     256                          ('error: %(code_point)s %(char)s '
     257                           + '%(char_class)s %(in)s: %(reason)s') %{
     258                               'code_point': hex(code_point),
     259                               'char': chr(code_point),
     260                               'char_class': char_class,
     261                               'in': not in_char_class,
     262                               'reason': reason},
     263                          errorcounter)
     264      return errorcounter
     265  
     266  def tests(ctype_dict, errorcounter = 0):
     267      '''Test a LC_CTYPE character class dictionary for known errors'''
     268      # copy the information from ctype_dict (which contains lists) in
     269      # a new dictionary ctype_dict2 (which contains dictionaries).
     270      # The checks below are easier with that type of data structure.
     271  
     272      ctype_dict2 = {}
     273      for key in ctype_dict:
     274          ctype_dict2[key] = {}
     275          if ctype_dict[key]:
     276              if type(ctype_dict[key][0]) == type(int()):
     277                  for value in ctype_dict[key]:
     278                      ctype_dict2[key][value] = 1
     279              else: # key is 'toupper', 'tolower', or 'totitle'
     280                  for value in ctype_dict[key]:
     281                      ctype_dict2[key][value[0]] = value[1]
     282  
     283      for test_case in TEST_CASES:
     284          errorcounter = cpcheck(ctype_dict2,
     285                                 test_case[0],
     286                                 test_case[1],
     287                                 test_case[2],
     288                                 errorcounter = errorcounter)
     289  
     290      for code_point in range(0, 0x110000):
     291          # toupper restriction: "Only characters specified for the keywords
     292  	# lower and upper shall be specified.
     293          if (code_point in ctype_dict2['toupper']
     294              and code_point != ctype_dict2['toupper'][code_point]
     295              and not (code_point in ctype_dict2['lower']
     296                       or code_point in ctype_dict2['upper'])):
     297              errorcounter = cperror(
     298                  ('error: %(char1)s is not upper|lower '
     299                   + 'but toupper(%(cp1)s)=%(cp2)s (%(char2)s)') %{
     300                       'char1': chr(code_point),
     301                       'cp1': hex(code_point),
     302                       'cp2': hex(ctype_dict2['toupper'][code_point]),
     303                       'char2': chr(ctype_dict2['toupper'][code_point])
     304                   },
     305                  errorcounter)
     306          # tolower restriction: "Only characters specified for the keywords
     307  	# lower and upper shall be specified.
     308          if (code_point in ctype_dict2['tolower']
     309              and code_point != ctype_dict2['tolower'][code_point]
     310              and not (code_point in ctype_dict2['lower']
     311                       or code_point in ctype_dict2['upper'])):
     312              errorcounter = cperror(
     313                  ('error: %(char1)s is not upper|lower '
     314                   + 'but tolower(%(cp1)s)=%(cp2)s (%(char2)s)') %{
     315                       'char1': chr(code_point),
     316                       'cp1': hex(code_point),
     317                       'cp2': hex(ctype_dict2['tolower'][code_point]),
     318                       'char2': chr(ctype_dict2['tolower'][code_point])
     319                   },
     320                  errorcounter)
     321          # alpha restriction: "Characters classified as either upper or lower
     322  	# shall automatically belong to this class.
     323          if ((code_point in ctype_dict2['lower']
     324               or code_point in ctype_dict2['upper'])
     325              and code_point not in ctype_dict2['alpha']):
     326              errorcounter = cperror(
     327                  'error: %(char)s %(cp)s is upper|lower but not alpha' %{
     328                      'char': chr(code_point),
     329                      'cp': hex(code_point)
     330                  },
     331                  errorcounter)
     332          # alpha restriction: "No character specified for the keywords cntrl,
     333  	# digit, punct or space shall be specified."
     334          if (code_point in ctype_dict2['alpha']
     335              and code_point in ctype_dict2['cntrl']):
     336              errorcounter = cperror(
     337                  'error: %(char)s %(cp)s is alpha and cntrl' %{
     338                      'char': chr(code_point),
     339                      'cp': hex(code_point)
     340                  },
     341                  errorcounter)
     342          if (code_point in ctype_dict2['alpha']
     343              and code_point in ctype_dict2['digit']):
     344              errorcounter = cperror(
     345                  'error: %(char)s %(cp)s is alpha and digit' %{
     346                      'char': chr(code_point),
     347                      'cp': hex(code_point)
     348                  },
     349                  errorcounter)
     350          if (code_point in ctype_dict2['alpha']
     351              and code_point in ctype_dict2['punct']):
     352              errorcounter = cperror(
     353                  'error: %(char)s %(cp)s is alpha and punct' %{
     354                      'char': chr(code_point),
     355                      'cp': hex(code_point)
     356                  },
     357                  errorcounter)
     358          if (code_point in ctype_dict2['alpha']
     359              and code_point in ctype_dict2['space']):
     360              errorcounter = cperror(
     361                  'error: %(char)s %(cp)s is alpha and space' %{
     362                      'char': chr(code_point),
     363                      'cp': hex(code_point)
     364                  },
     365                  errorcounter)
     366          # space restriction: "No character specified for the keywords upper,
     367  	# lower, alpha, digit, graph or xdigit shall be specified."
     368  	# upper, lower, alpha already checked above.
     369          if (code_point in ctype_dict2['space']
     370              and code_point in ctype_dict2['digit']):
     371              errorcounter = cperror(
     372                  'error: %(char)s %(cp)s is space and digit' %{
     373                      'char': chr(code_point),
     374                      'cp': hex(code_point)
     375                  },
     376                  errorcounter)
     377          if (code_point in ctype_dict2['space']
     378              and code_point in ctype_dict2['graph']):
     379              errorcounter = cperror(
     380                  'error: %(char)s %(cp)s is space and graph' %{
     381                      'char': chr(code_point),
     382                      'cp': hex(code_point)
     383                  },
     384                  errorcounter)
     385          if (code_point in ctype_dict2['space']
     386              and code_point in ctype_dict2['xdigit']):
     387              errorcounter = cperror(
     388                  'error: %(char)s %(cp)s is space and xdigit' %{
     389                      'char': chr(code_point),
     390                      'cp': hex(code_point)
     391                  },
     392                  errorcounter)
     393          # cntrl restriction: "No character specified for the keywords upper,
     394  	# lower, alpha, digit, punct, graph, print or xdigit shall be
     395  	# specified."  upper, lower, alpha already checked above.
     396          if (code_point in ctype_dict2['cntrl']
     397              and code_point in ctype_dict2['digit']):
     398              errorcounter = cperror(
     399                  'error: %(char)s %(cp)s is cntrl and digit' %{
     400                      'char': chr(code_point),
     401                      'cp': hex(code_point)
     402                  },
     403                  errorcounter)
     404          if (code_point in ctype_dict2['cntrl']
     405              and code_point in ctype_dict2['punct']):
     406              errorcounter = cperror(
     407                  'error: %(char)s %(cp)s is cntrl and punct' %{
     408                      'char': chr(code_point),
     409                      'cp': hex(code_point)
     410                  },
     411                  errorcounter)
     412          if (code_point in ctype_dict2['cntrl']
     413              and code_point in ctype_dict2['graph']):
     414              errorcounter = cperror(
     415                  'error: %(char)s %(cp)s is cntrl and graph' %{
     416                      'char': chr(code_point),
     417                      'cp': hex(code_point)
     418                  },
     419                  errorcounter)
     420          if (code_point in ctype_dict2['cntrl']
     421              and code_point in ctype_dict2['print']):
     422              errorcounter = cperror(
     423                  'error: %(char)s %(cp)s is cntrl and print' %{
     424                      'char': chr(code_point),
     425                      'cp': hex(code_point)
     426                  },
     427                  errorcounter)
     428          if (code_point in ctype_dict2['cntrl']
     429              and code_point in ctype_dict2['xdigit']):
     430              errorcounter = cperror(
     431                  'error: %(char)s %(cp)s is cntrl and xdigit' %{
     432                      'char': chr(code_point),
     433                      'cp': hex(code_point)
     434                  },
     435                  errorcounter)
     436          # punct restriction: "No character specified for the keywords upper,
     437  	# lower, alpha, digit, cntrl, xdigit or as the <space> character shall
     438  	# be specified."  upper, lower, alpha, cntrl already checked above.
     439          if (code_point in ctype_dict2['punct']
     440              and code_point in ctype_dict2['digit']):
     441              errorcounter = cperror(
     442                  'error: %(char)s %(cp)s is punct and digit' %{
     443                      'char': chr(code_point),
     444                      'cp': hex(code_point)
     445                  },
     446                  errorcounter)
     447          if (code_point in ctype_dict2['punct']
     448              and code_point in ctype_dict2['xdigit']):
     449              errorcounter = cperror(
     450                  'error: %(char)s %(cp)s is punct and xdigit' %{
     451                      'char': chr(code_point),
     452                      'cp': hex(code_point)
     453                  },
     454                  errorcounter)
     455          if (code_point in ctype_dict2['punct']
     456              and code_point == 0x0020):
     457              errorcounter = cperror(
     458                  'error: %(char)s %(cp)s is punct.' %{
     459                      'char': chr(code_point),
     460                      'cp': hex(code_point)
     461                  },
     462                  errorcounter)
     463          # graph restriction: "No character specified for the keyword cntrl
     464  	# shall be specified."  Already checked above.
     465  
     466          # print restriction: "No character specified for the keyword cntrl
     467  	# shall be specified."  Already checked above.
     468  
     469          # graph - print relation: differ only in the <space> character.
     470  	# How is this possible if there are more than one space character?!
     471  	# I think susv2/xbd/locale.html should speak of "space characters",
     472  	# not "space character".
     473          if (code_point in ctype_dict2['print']
     474              and not (code_point in ctype_dict2['graph']
     475                       or code_point in ctype_dict2['space'])):
     476              errorcounter = cperror(
     477                  'error: %(char)s %(cp)s is print but not graph|space' %{
     478                      'char': chr(code_point),
     479                      'cp': hex(code_point)
     480                  },
     481                  errorcounter)
     482          if (code_point not in ctype_dict2['print']
     483              and (code_point in ctype_dict2['graph']
     484                   or code_point ==  0x0020)):
     485              errorcounter = cperror(
     486                  'error: %(char)s %(cp)s graph|space but not print' %{
     487                      'char': chr(code_point),
     488                      'cp': hex(code_point)
     489                  },
     490                  errorcounter)
     491      return errorcounter
     492  
     493  if __name__ == "__main__":
     494      PARSER = argparse.ArgumentParser(
     495          description='''
     496          Compare the contents of LC_CTYPE in two files and check for errors.
     497          ''')
     498      PARSER.add_argument(
     499          '-o', '--old_ctype_file',
     500          nargs='?',
     501          type=str,
     502          default='i18n',
     503          help='The old ctype file, default: %(default)s')
     504      PARSER.add_argument(
     505          '-n', '--new_ctype_file',
     506          nargs='?',
     507          type=str,
     508          default='unicode-ctype',
     509          help='The new ctype file, default: %(default)s')
     510      PARSER.add_argument(
     511          '-a', '--show_added_characters',
     512          action='store_true',
     513          help=('Show characters which were added to each '
     514                + 'character class in detail.'))
     515      PARSER.add_argument(
     516          '-m', '--show_missing_characters',
     517          action='store_true',
     518          help=('Show characters which were removed from each '
     519                + 'character class in detail.'))
     520      ARGS = PARSER.parse_args()
     521  
     522      OLD_CTYPE_DICT = extract_character_classes(
     523          ARGS.old_ctype_file)
     524      NEW_CTYPE_DICT = extract_character_classes(
     525          ARGS.new_ctype_file)
     526      compare_lists(OLD_CTYPE_DICT, NEW_CTYPE_DICT)
     527      print('============================================================')
     528      print('Checking for errors in old ctype file: %s' %ARGS.old_ctype_file)
     529      print('------------------------------------------------------------')
     530      NUMBER_OF_ERRORS_IN_OLD_FILE = tests(OLD_CTYPE_DICT, errorcounter = 0)
     531      print('------------------------------------------------------------')
     532      print('Old file = %s' %ARGS.old_ctype_file)
     533      print('Number of errors in old file = %s' %NUMBER_OF_ERRORS_IN_OLD_FILE)
     534      print('------------------------------------------------------------')
     535      print('============================================================')
     536      print('Checking for errors in new ctype file: %s' %ARGS.new_ctype_file)
     537      print('------------------------------------------------------------')
     538      NUMBER_OF_ERRORS_IN_NEW_FILE = tests(NEW_CTYPE_DICT, errorcounter = 0)
     539      print('------------------------------------------------------------')
     540      print('New file = %s' %ARGS.new_ctype_file)
     541      print('Number of errors in new file = %s' %NUMBER_OF_ERRORS_IN_NEW_FILE)
     542      print('------------------------------------------------------------')
     543      if NUMBER_OF_ERRORS_IN_NEW_FILE > 0:
     544          exit(1)
     545      else:
     546          exit(0)