1  # Utilities to generate Unicode data for glibc from upstream Unicode data.
       2  #
       3  # Copyright (C) 2014-2023 Free Software Foundation, Inc.
       4  # This file is part of the GNU C Library.
       5  #
       6  # The GNU C Library is free software; you can redistribute it and/or
       7  # modify it under the terms of the GNU Lesser General Public
       8  # License as published by the Free Software Foundation; either
       9  # version 2.1 of the License, or (at your option) any later version.
      10  #
      11  # The GNU C Library is distributed in the hope that it will be useful,
      12  # but WITHOUT ANY WARRANTY; without even the implied warranty of
      13  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
      14  # Lesser General Public License for more details.
      15  #
      16  # You should have received a copy of the GNU Lesser General Public
      17  # License along with the GNU C Library; if not, see
      18  # <https://www.gnu.org/licenses/>.
      19  
      20  '''
      21  This module contains utilities used by the scripts to generate
      22  Unicode data for glibc from upstream Unicode data files.
      23  '''
      24  
      25  import sys
      26  import re
      27  
      28  
# Common locale header.
#
# A block of text in which every line starts with the “%” character
# and which states that no copyright interest is claimed in the locale
# data.  The string starts with a newline and ends with one.  It is only
# defined in this module; presumably the generator scripts importing this
# module write it at the top of the files they produce (TODO: confirm
# against the callers).
COMMENT_HEADER = """
% This file is part of the GNU C Library and contains locale data.
% The Free Software Foundation does not claim any copyright interest
% in the locale data contained in this file.  The foregoing does not
% affect the license of the GNU C Library as a whole.  It does not
% exempt you from the conditions of the license if your use would
% otherwise be governed by that license.
"""
      38  
# Dictionary holding the entire contents of the UnicodeData.txt file
#
# It is empty until fill_attributes() has been called.  The keys are
# the code points as integers.  The values of 'upper', 'lower' and
# 'title' are integers (or None if there is no such mapping), all
# other values are the strings found in the respective field of the
# file.
#
# Contents of this dictionary look like this:
#
# {0: {'category': 'Cc',
#      'title': None,
#      'digit': '',
#      'name': '<control>',
#      'bidi': 'BN',
#      'combining': '0',
#      'comment': '',
#      'oldname': 'NULL',
#      'decomposition': '',
#      'upper': None,
#      'mirrored': 'N',
#      'lower': None,
#      'decdigit': '',
#      'numeric': ''},
#      …
# }
UNICODE_ATTRIBUTES = {}
      60  
# Dictionary holding the entire contents of the DerivedCoreProperties.txt file
#
# It is empty until fill_derived_core_properties() has been called.
# The keys are the code points as integers, the values are lists of
# the names of all properties the file assigns to that code point, in
# the order in which they appear in the file.  Code points without
# any property have no entry at all.
#
# Contents of this dictionary look like this:
#
# {917504: ['Default_Ignorable_Code_Point'],
#  917505: ['Case_Ignorable', 'Default_Ignorable_Code_Point'],
#  …
# }
DERIVED_CORE_PROPERTIES = {}
      70  
# Dictionary holding the entire contents of the EastAsianWidth.txt file
#
# It is empty until fill_east_asian_widths() has been called.  The
# keys are the code points as integers, the values are the width
# property strings found in the file.
#
# Contents of this dictionary look like this:
#
# {0: 'N', … , 45430: 'W', …}
EAST_ASIAN_WIDTHS = {}
      77  
      78  def fill_attribute(code_point, fields):
      79      '''Stores in UNICODE_ATTRIBUTES[code_point] the values from the fields.
      80  
      81      One entry in the UNICODE_ATTRIBUTES dictionary represents one line
      82      in the UnicodeData.txt file.
      83  
      84      '''
      85      UNICODE_ATTRIBUTES[code_point] =  {
      86          'name': fields[1],          # Character name
      87          'category': fields[2],      # General category
      88          'combining': fields[3],     # Canonical combining classes
      89          'bidi': fields[4],          # Bidirectional category
      90          'decomposition': fields[5], # Character decomposition mapping
      91          'decdigit': fields[6],      # Decimal digit value
      92          'digit': fields[7],         # Digit value
      93          'numeric': fields[8],       # Numeric value
      94          'mirrored': fields[9],      # mirrored
      95          'oldname': fields[10],      # Old Unicode 1.0 name
      96          'comment': fields[11],      # comment
      97          # Uppercase mapping
      98          'upper': int(fields[12], 16) if fields[12] else None,
      99          # Lowercase mapping
     100          'lower': int(fields[13], 16) if fields[13] else None,
     101          # Titlecase mapping
     102          'title': int(fields[14], 16) if fields[14] else None,
     103      }
     104  
     105  def fill_attributes(filename):
     106      '''Stores the entire contents of the UnicodeData.txt file
     107      in the UNICODE_ATTRIBUTES dictionary.
     108  
     109      A typical line for a single code point in UnicodeData.txt looks
     110      like this:
     111  
     112      0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;
     113  
     114      Code point ranges are indicated by pairs of lines like this:
     115  
     116      4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
     117      9FCC;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;
     118      '''
     119      with open(filename, mode='r') as unicode_data_file:
     120          fields_start = []
     121          for line in unicode_data_file:
     122              fields = line.strip().split(';')
     123              if len(fields) != 15:
     124                  sys.stderr.write(
     125                      'short line in file "%(f)s": %(l)s\n' %{
     126                      'f': filename, 'l': line})
     127                  exit(1)
     128              if fields[2] == 'Cs':
     129                  # Surrogates are UTF-16 artefacts,
     130                  # not real characters. Ignore them.
     131                  fields_start = []
     132                  continue
     133              if fields[1].endswith(', First>'):
     134                  fields_start = fields
     135                  fields_start[1] = fields_start[1].split(',')[0][1:]
     136                  continue
     137              if fields[1].endswith(', Last>'):
     138                  fields[1] = fields[1].split(',')[0][1:]
     139                  if fields[1:] != fields_start[1:]:
     140                      sys.stderr.write(
     141                          'broken code point range in file "%(f)s": %(l)s\n' %{
     142                              'f': filename, 'l': line})
     143                      exit(1)
     144                  for code_point in range(
     145                          int(fields_start[0], 16),
     146                          int(fields[0], 16)+1):
     147                      fill_attribute(code_point, fields)
     148                  fields_start = []
     149                  continue
     150              fill_attribute(int(fields[0], 16), fields)
     151              fields_start = []
     152  
     153  def fill_derived_core_properties(filename):
     154      '''Stores the entire contents of the DerivedCoreProperties.txt file
     155      in the DERIVED_CORE_PROPERTIES dictionary.
     156  
     157      Lines in DerivedCoreProperties.txt are either a code point range like
     158      this:
     159  
     160      0061..007A    ; Lowercase # L&  [26] LATIN SMALL LETTER A..LATIN SMALL LETTER Z
     161  
     162      or a single code point like this:
     163  
     164      00AA          ; Lowercase # Lo       FEMININE ORDINAL INDICATOR
     165  
     166      '''
     167      with open(filename, mode='r') as derived_core_properties_file:
     168          for line in derived_core_properties_file:
     169              match = re.match(
     170                  r'^(?P<codepoint1>[0-9A-F]{4,6})'
     171                  + r'(?:\.\.(?P<codepoint2>[0-9A-F]{4,6}))?'
     172                  + r'\s*;\s*(?P<property>[a-zA-Z_]+)',
     173                  line)
     174              if not match:
     175                  continue
     176              start = match.group('codepoint1')
     177              end = match.group('codepoint2')
     178              if not end:
     179                  end = start
     180              for code_point in range(int(start, 16), int(end, 16)+1):
     181                  prop = match.group('property')
     182                  if code_point in DERIVED_CORE_PROPERTIES:
     183                      DERIVED_CORE_PROPERTIES[code_point].append(prop)
     184                  else:
     185                      DERIVED_CORE_PROPERTIES[code_point] = [prop]
     186  
     187  def fill_east_asian_widths(filename):
     188      '''Stores the entire contents of the EastAsianWidths.txt file
     189      in the EAST_ASIAN_WIDTHS dictionary.
     190  
     191      Lines in EastAsianWidths.txt are either a code point range like
     192      this:
     193  
     194      9FCD..9FFF;W     # Cn    [51] <reserved-9FCD>..<reserved-9FFF>
     195  
     196      or a single code point like this:
     197  
     198      A015;W           # Lm         YI SYLLABLE WU
     199      '''
     200      with open(filename, mode='r') as east_asian_widths_file:
     201          for line in east_asian_widths_file:
     202              match = re.match(
     203                  r'^(?P<codepoint1>[0-9A-F]{4,6})'
     204                  +r'(?:\.\.(?P<codepoint2>[0-9A-F]{4,6}))?'
     205                  +r'\s*;\s*(?P<property>[a-zA-Z]+)',
     206                  line)
     207              if not match:
     208                  continue
     209              start = match.group('codepoint1')
     210              end = match.group('codepoint2')
     211              if not end:
     212                  end = start
     213              for code_point in range(int(start, 16), int(end, 16)+1):
     214                  EAST_ASIAN_WIDTHS[code_point] = match.group('property')
     215  
     216  def to_upper(code_point):
     217      '''Returns the code point of the uppercase version
     218      of the given code point'''
     219      if (UNICODE_ATTRIBUTES[code_point]['name']
     220          and UNICODE_ATTRIBUTES[code_point]['upper']):
     221          return UNICODE_ATTRIBUTES[code_point]['upper']
     222      else:
     223          return code_point
     224  
     225  def to_lower(code_point):
     226      '''Returns the code point of the lowercase version
     227      of the given code point'''
     228      if (UNICODE_ATTRIBUTES[code_point]['name']
     229          and UNICODE_ATTRIBUTES[code_point]['lower']):
     230          return UNICODE_ATTRIBUTES[code_point]['lower']
     231      else:
     232          return code_point
     233  
     234  def to_upper_turkish(code_point):
     235      '''Returns the code point of the Turkish uppercase version
     236      of the given code point'''
     237      if code_point == 0x0069:
     238          return 0x0130
     239      return to_upper(code_point)
     240  
     241  def to_lower_turkish(code_point):
     242      '''Returns the code point of the Turkish lowercase version
     243      of the given code point'''
     244      if code_point == 0x0049:
     245          return 0x0131
     246      return to_lower(code_point)
     247  
     248  def to_title(code_point):
     249      '''Returns the code point of the titlecase version
     250      of the given code point'''
     251      if (UNICODE_ATTRIBUTES[code_point]['name']
     252          and UNICODE_ATTRIBUTES[code_point]['title']):
     253          return UNICODE_ATTRIBUTES[code_point]['title']
     254      else:
     255          return code_point
     256  
     257  def is_upper(code_point):
     258      '''Checks whether the character with this code point is uppercase'''
     259      return (to_lower(code_point) != code_point
     260              or (code_point in DERIVED_CORE_PROPERTIES
     261                  and 'Uppercase' in DERIVED_CORE_PROPERTIES[code_point]))
     262  
     263  def is_lower(code_point):
     264      '''Checks whether the character with this code point is lowercase'''
     265      # Some characters are defined as “Lowercase” in
     266      # DerivedCoreProperties.txt but do not have a mapping to upper
     267      # case. For example, ꜰ U+A72F “LATIN LETTER SMALL CAPITAL F” is
     268      # one of these.
     269      return (to_upper(code_point) != code_point
     270              # <U00DF> is lowercase, but without simple to_upper mapping.
     271              or code_point == 0x00DF
     272              or (code_point in DERIVED_CORE_PROPERTIES
     273                  and 'Lowercase' in DERIVED_CORE_PROPERTIES[code_point]))
     274  
     275  def is_alpha(code_point):
     276      '''Checks whether the character with this code point is alphabetic'''
     277      return ((code_point in DERIVED_CORE_PROPERTIES
     278               and
     279               'Alphabetic' in DERIVED_CORE_PROPERTIES[code_point])
     280              or
     281              # Consider all the non-ASCII digits as alphabetic.
     282              # ISO C 99 forbids us to have them in category “digit”,
     283              # but we want iswalnum to return true on them.
     284              (UNICODE_ATTRIBUTES[code_point]['category'] == 'Nd'
     285               and not (code_point >= 0x0030 and code_point <= 0x0039)))
     286  
     287  def is_digit(code_point):
     288      '''Checks whether the character with this code point is a digit'''
     289      if False:
     290          return (UNICODE_ATTRIBUTES[code_point]['name']
     291                  and UNICODE_ATTRIBUTES[code_point]['category'] == 'Nd')
     292          # Note: U+0BE7..U+0BEF and U+1369..U+1371 are digit systems without
     293          # a zero.  Must add <0> in front of them by hand.
     294      else:
     295          # SUSV2 gives us some freedom for the "digit" category, but ISO C 99
     296          # takes it away:
     297          # 7.25.2.1.5:
     298          #    The iswdigit function tests for any wide character that
     299          #    corresponds to a decimal-digit character (as defined in 5.2.1).
     300          # 5.2.1:
     301          #    the 10 decimal digits 0 1 2 3 4 5 6 7 8 9
     302          return (code_point >= 0x0030 and code_point <= 0x0039)
     303  
     304  def is_outdigit(code_point):
     305      '''Checks whether the character with this code point is outdigit'''
     306      return (code_point >= 0x0030 and code_point <= 0x0039)
     307  
     308  def is_blank(code_point):
     309      '''Checks whether the character with this code point is blank'''
     310      return (code_point == 0x0009 # '\t'
     311              # Category Zs without mention of '<noBreak>'
     312              or (UNICODE_ATTRIBUTES[code_point]['name']
     313                  and UNICODE_ATTRIBUTES[code_point]['category'] == 'Zs'
     314                  and '<noBreak>' not in
     315                  UNICODE_ATTRIBUTES[code_point]['decomposition']))
     316  
     317  def is_space(code_point):
     318      '''Checks whether the character with this code point is a space'''
     319      # Don’t make U+00A0 a space. Non-breaking space means that all programs
     320      # should treat it like a punctuation character, not like a space.
     321      return (code_point == 0x0020 # ' '
     322              or code_point == 0x000C # '\f'
     323              or code_point == 0x000A # '\n'
     324              or code_point == 0x000D # '\r'
     325              or code_point == 0x0009 # '\t'
     326              or code_point == 0x000B # '\v'
     327              # Categories Zl, Zp, and Zs without mention of "<noBreak>"
     328              or (UNICODE_ATTRIBUTES[code_point]['name']
     329                  and
     330                  (UNICODE_ATTRIBUTES[code_point]['category'] in ['Zl', 'Zp']
     331                   or
     332                   (UNICODE_ATTRIBUTES[code_point]['category'] in ['Zs']
     333                    and
     334                    '<noBreak>' not in
     335                    UNICODE_ATTRIBUTES[code_point]['decomposition']))))
     336  
     337  def is_cntrl(code_point):
     338      '''Checks whether the character with this code point is
     339      a control character'''
     340      return (UNICODE_ATTRIBUTES[code_point]['name']
     341              and (UNICODE_ATTRIBUTES[code_point]['name'] == '<control>'
     342                   or
     343                   UNICODE_ATTRIBUTES[code_point]['category'] in ['Zl', 'Zp']))
     344  
     345  def is_xdigit(code_point):
     346      '''Checks whether the character with this code point is
     347      a hexadecimal digit'''
     348      if False:
     349          return (is_digit(code_point)
     350                  or (code_point >= 0x0041 and code_point <= 0x0046)
     351                  or (code_point >= 0x0061 and code_point <= 0x0066))
     352      else:
     353          # SUSV2 gives us some freedom for the "xdigit" category, but ISO C 99
     354          # takes it away:
     355          # 7.25.2.1.12:
     356          #    The iswxdigit function tests for any wide character that
     357          #    corresponds to a hexadecimal-digit character (as defined
     358          #    in 6.4.4.1).
     359          # 6.4.4.1:
     360          #    hexadecimal-digit: one of
     361          #    0 1 2 3 4 5 6 7 8 9 a b c d e f A B C D E F
     362          return ((code_point >= 0x0030 and code_point  <= 0x0039)
     363                  or (code_point >= 0x0041 and code_point <= 0x0046)
     364                  or (code_point >= 0x0061 and code_point <= 0x0066))
     365  
     366  def is_graph(code_point):
     367      '''Checks whether the character with this code point is
     368      a graphical character'''
     369      return (UNICODE_ATTRIBUTES[code_point]['name']
     370              and UNICODE_ATTRIBUTES[code_point]['name'] != '<control>'
     371              and not is_space(code_point))
     372  
     373  def is_print(code_point):
     374      '''Checks whether the character with this code point is printable'''
     375      return (UNICODE_ATTRIBUTES[code_point]['name']
     376              and UNICODE_ATTRIBUTES[code_point]['name'] != '<control>'
     377              and UNICODE_ATTRIBUTES[code_point]['category'] not in ['Zl', 'Zp'])
     378  
     379  def is_punct(code_point):
     380      '''Checks whether the character with this code point is punctuation'''
     381      if False:
     382          return (UNICODE_ATTRIBUTES[code_point]['name']
     383                  and UNICODE_ATTRIBUTES[code_point]['category'].startswith('P'))
     384      else:
     385          # The traditional POSIX definition of punctuation is every graphic,
     386          # non-alphanumeric character.
     387          return (is_graph(code_point)
     388                  and not is_alpha(code_point)
     389                  and not is_digit(code_point))
     390  
     391  def is_combining(code_point):
     392      '''Checks whether the character with this code point is
     393      a combining character'''
     394      # Up to Unicode 3.0.1 we took the Combining property from the PropList.txt
     395      # file. In 3.0.1 it was identical to the union of the general categories
     396      # "Mn", "Mc", "Me". In Unicode 3.1 this property has been dropped from the
     397      # PropList.txt file, so we take the latter definition.
     398      return (UNICODE_ATTRIBUTES[code_point]['name']
     399              and
     400              UNICODE_ATTRIBUTES[code_point]['category'] in ['Mn', 'Mc', 'Me'])
     401  
     402  def is_combining_level3(code_point):
     403      '''Checks whether the character with this code point is
     404      a combining level3 character'''
     405      return (is_combining(code_point)
     406              and
     407              int(UNICODE_ATTRIBUTES[code_point]['combining']) in range(0, 200))
     408  
     409  def ucs_symbol(code_point):
     410      '''Return the UCS symbol string for a Unicode character.'''
     411      if code_point < 0x10000:
     412          return '<U{:04X}>'.format(code_point)
     413      else:
     414          return '<U{:08X}>'.format(code_point)
     415  
     416  def ucs_symbol_range(code_point_low, code_point_high):
     417      '''Returns a string UCS symbol string for a code point range.
     418  
     419      Example:
     420  
     421      <U0041>..<U005A>
     422      '''
     423      return ucs_symbol(code_point_low) + '..' + ucs_symbol(code_point_high)
     424  
     425  def verifications():
     426      '''Tests whether the is_* functions observe the known restrictions'''
     427      for code_point in sorted(UNICODE_ATTRIBUTES):
     428          # toupper restriction: "Only characters specified for the keywords
     429          # lower and upper shall be specified.
     430          if (to_upper(code_point) != code_point
     431              and not (is_lower(code_point) or is_upper(code_point))):
     432              sys.stderr.write(
     433                  ('%(sym)s is not upper|lower '
     434                   + 'but toupper(0x%(c)04X) = 0x%(uc)04X\n') %{
     435                      'sym': ucs_symbol(code_point),
     436                      'c': code_point,
     437                      'uc': to_upper(code_point)})
     438          # tolower restriction: "Only characters specified for the keywords
     439          # lower and upper shall be specified.
     440          if (to_lower(code_point) != code_point
     441              and not (is_lower(code_point) or is_upper(code_point))):
     442              sys.stderr.write(
     443                  ('%(sym)s is not upper|lower '
     444                   + 'but tolower(0x%(c)04X) = 0x%(uc)04X\n') %{
     445                      'sym': ucs_symbol(code_point),
     446                      'c': code_point,
     447                      'uc': to_lower(code_point)})
     448          # alpha restriction: "Characters classified as either upper or lower
     449          # shall automatically belong to this class.
     450          if ((is_lower(code_point) or is_upper(code_point))
     451               and not is_alpha(code_point)):
     452              sys.stderr.write('%(sym)s is upper|lower but not alpha\n' %{
     453                  'sym': ucs_symbol(code_point)})
     454          # alpha restriction: “No character specified for the keywords cntrl,
     455          # digit, punct or space shall be specified.”
     456          if (is_alpha(code_point) and is_cntrl(code_point)):
     457              sys.stderr.write('%(sym)s is alpha and cntrl\n' %{
     458                  'sym': ucs_symbol(code_point)})
     459          if (is_alpha(code_point) and is_digit(code_point)):
     460              sys.stderr.write('%(sym)s is alpha and digit\n' %{
     461                  'sym': ucs_symbol(code_point)})
     462          if (is_alpha(code_point) and is_punct(code_point)):
     463              sys.stderr.write('%(sym)s is alpha and punct\n' %{
     464                  'sym': ucs_symbol(code_point)})
     465          if (is_alpha(code_point) and is_space(code_point)):
     466              sys.stderr.write('%(sym)s is alpha and space\n' %{
     467                  'sym': ucs_symbol(code_point)})
     468          # space restriction: “No character specified for the keywords upper,
     469          # lower, alpha, digit, graph or xdigit shall be specified.”
     470          # upper, lower, alpha already checked above.
     471          if (is_space(code_point) and is_digit(code_point)):
     472              sys.stderr.write('%(sym)s is space and digit\n' %{
     473                  'sym': ucs_symbol(code_point)})
     474          if (is_space(code_point) and is_graph(code_point)):
     475              sys.stderr.write('%(sym)s is space and graph\n' %{
     476                  'sym': ucs_symbol(code_point)})
     477          if (is_space(code_point) and is_xdigit(code_point)):
     478              sys.stderr.write('%(sym)s is space and xdigit\n' %{
     479                  'sym': ucs_symbol(code_point)})
     480          # cntrl restriction: “No character specified for the keywords upper,
     481          # lower, alpha, digit, punct, graph, print or xdigit shall be
     482          # specified.”  upper, lower, alpha already checked above.
     483          if (is_cntrl(code_point) and is_digit(code_point)):
     484              sys.stderr.write('%(sym)s is cntrl and digit\n' %{
     485                  'sym': ucs_symbol(code_point)})
     486          if (is_cntrl(code_point) and is_punct(code_point)):
     487              sys.stderr.write('%(sym)s is cntrl and punct\n' %{
     488                  'sym': ucs_symbol(code_point)})
     489          if (is_cntrl(code_point) and is_graph(code_point)):
     490              sys.stderr.write('%(sym)s is cntrl and graph\n' %{
     491                  'sym': ucs_symbol(code_point)})
     492          if (is_cntrl(code_point) and is_print(code_point)):
     493              sys.stderr.write('%(sym)s is cntrl and print\n' %{
     494                  'sym': ucs_symbol(code_point)})
     495          if (is_cntrl(code_point) and is_xdigit(code_point)):
     496              sys.stderr.write('%(sym)s is cntrl and xdigit\n' %{
     497                  'sym': ucs_symbol(code_point)})
     498          # punct restriction: “No character specified for the keywords upper,
     499          # lower, alpha, digit, cntrl, xdigit or as the <space> character shall
     500          # be specified.”  upper, lower, alpha, cntrl already checked above.
     501          if (is_punct(code_point) and is_digit(code_point)):
     502              sys.stderr.write('%(sym)s is punct and digit\n' %{
     503                  'sym': ucs_symbol(code_point)})
     504          if (is_punct(code_point) and is_xdigit(code_point)):
     505              sys.stderr.write('%(sym)s is punct and xdigit\n' %{
     506                  'sym': ucs_symbol(code_point)})
     507          if (is_punct(code_point) and code_point == 0x0020):
     508              sys.stderr.write('%(sym)s is punct\n' %{
     509                  'sym': ucs_symbol(code_point)})
     510          # graph restriction: “No character specified for the keyword cntrl
     511          # shall be specified.”  Already checked above.
     512  
     513          # print restriction: “No character specified for the keyword cntrl
     514          # shall be specified.”  Already checked above.
     515  
     516          # graph - print relation: differ only in the <space> character.
     517          # How is this possible if there are more than one space character?!
     518          # I think susv2/xbd/locale.html should speak of “space characters”,
     519          # not “space character”.
     520          if (is_print(code_point)
     521              and not (is_graph(code_point) or is_space(code_point))):
     522              sys.stderr.write('%(sym)s is print but not graph|<space>\n' %{
     523                  'sym': unicode_utils.ucs_symbol(code_point)})
     524          if (not is_print(code_point)
     525              and (is_graph(code_point) or code_point == 0x0020)):
     526              sys.stderr.write('%(sym)s is graph|<space> but not print\n' %{
     527                  'sym': unicode_utils.ucs_symbol(code_point)})