1  #!/usr/bin/env python3
       2  # Copyright (C) 1998, 1999 Tom Tromey
       3  # Copyright (C) 2001 Red Hat Software
       4  #
       5  # SPDX-License-Identifier: GPL-2.0-or-later
       6  #
       7  # This program is free software; you can redistribute it and/or modify
       8  # it under the terms of the GNU General Public License as published by
       9  # the Free Software Foundation; either version 2, or (at your option)
      10  # any later version.
      11  #
      12  # This program is distributed in the hope that it will be useful,
      13  # but WITHOUT ANY WARRANTY; without even the implied warranty of
      14  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      15  # GNU General Public License for more details.
      16  #
      17  # You should have received a copy of the GNU General Public License
      18  # along with this program; if not, see <http://www.gnu.org/licenses/>.
      19  
      20  """
gen-casemap-txt.py - Generate test cases for case mapping from Unicode data.
See http://www.unicode.org/Public/UNIDATA/UnicodeCharacterDatabase.html
Usage:
    gen-casemap-txt.py UNICODE-VERSION UnicodeData.txt SpecialCasing.txt

The generated test cases are written to standard output.

I consider the output of this program to be unrestricted.
Use it as you will.
      26  """
      27  
      28  import sys
      29  import argparse
      30  
      31  
      32  # Disable line length warnings as wrapping the test templates would be hard
      33  # flake8: noqa: E501
      34  
      35  
      36  def main(argv):
      37      parser = argparse.ArgumentParser(
      38          description="Generate test cases for case mapping from Unicode data"
      39      )
      40      parser.add_argument("UNICODE-VERSION")
      41      parser.add_argument("UnicodeData.txt")
      42      parser.add_argument("SpecialCasing.txt")
      43      args = parser.parse_args(argv[1:])
      44      version = getattr(args, "UNICODE-VERSION")
      45      filename_udata = getattr(args, "UnicodeData.txt")
      46      filename_casing = getattr(args, "SpecialCasing.txt")
      47  
      48      # Names of fields in Unicode data table.
      49      (
      50          CODE,
      51          NAME,
      52          CATEGORY,
      53          COMBINING_CLASSES,
      54          BIDI_CATEGORY,
      55          DECOMPOSITION,
      56          DECIMAL_VALUE,
      57          DIGIT_VALUE,
      58          NUMERIC_VALUE,
      59          MIRRORED,
      60          OLD_NAME,
      61          COMMENT,
      62          UPPER,
      63          LOWER,
      64          TITLE,
      65      ) = range(15)
      66  
      67      # Names of fields in the SpecialCasing table
      68      CASE_CODE, CASE_LOWER, CASE_TITLE, CASE_UPPER, CASE_CONDITION = range(5)
      69  
      70      upper = {}
      71      title = {}
      72      lower = {}
      73  
      74      def make_hex(codes):
      75          """Converts a string of white space separated code points encoded as
      76          hex values to a Unicode string. Any extra white space is ignored.
      77          """
      78          return "".join([chr(int(c, 16)) for c in codes.split()])
      79  
      80      def process_one(code, fields):
      81          type_ = fields[CATEGORY]
      82          if type_ == "Ll":
      83              upper[code] = make_hex(fields[UPPER])
      84              lower[code] = chr(code)
      85              title[code] = make_hex(fields[TITLE])
      86          elif type_ == "Lu":
      87              lower[code] = make_hex(fields[LOWER])
      88              upper[code] = chr(code)
      89              title[code] = make_hex(fields[TITLE])
      90          elif type_ == "Lt":
      91              upper[code] = make_hex(fields[UPPER])
      92              lower[code] = make_hex(fields[LOWER])
      93              title[code] = make_hex(fields[LOWER])
      94  
      95      with open(filename_udata, encoding="utf-8") as fileobj:
      96          last_code = -1
      97          for line in fileobj:
      98              line = line.strip()
      99              fields = [f.strip() for f in line.split(";")]
     100              if len(fields) != 15:
     101                  raise SystemExit(
     102                      "Entry for %s has wrong number of fields (%d)"
     103                      % (fields[CODE], len(fields))
     104                  )
     105  
     106              code = int(fields[CODE], 16)
     107  
     108              if code > last_code + 1:
     109                  # Found a gap
     110                  if fields[NAME].endswith("Last>"):
     111                      # Fill the gap with the last character read,
     112                      # since this was a range specified in the char database
     113                      gfields = fields
     114                  else:
     115                      # The gap represents undefined characters.  Only the type
     116                      # matters.
     117                      gfields = [
     118                          "",
     119                          "",
     120                          "Cn",
     121                          "0",
     122                          "",
     123                          "",
     124                          "",
     125                          "",
     126                          "",
     127                          "",
     128                          "",
     129                          "",
     130                          "",
     131                          "",
     132                          "",
     133                      ]
     134  
     135                  last_code += 1
     136                  while last_code < code:
     137                      gfields[CODE] = "%04x" % last_code
     138                      process_one(last_code, gfields)
     139                      last_code += 1
     140  
     141              process_one(code, fields)
     142              last_code = code
     143  
     144      with open(filename_casing, encoding="utf-8") as fileobj:
     145          last_code = -1
     146          for line in fileobj:
     147              # strip comments and skip empty lines
     148              line = line.split("#", 1)[0].strip()
     149              if not line:
     150                  continue
     151  
     152              # all lines end with ";" so just remove it
     153              line = line.rstrip(";").rstrip()
     154              fields = [f.strip() for f in line.split(";")]
     155              if len(fields) not in (4, 5):
     156                  raise SystemExit(
     157                      "Entry for %s has wrong number of fields (%d)"
     158                      % (fields[CASE_CODE], len(fields))
     159                  )
     160  
     161              if len(fields) == 5:
     162                  # Ignore conditional special cases - we'll handle them manually
     163                  continue
     164  
     165              code = int(fields[CASE_CODE], 16)
     166  
     167              upper[code] = make_hex(fields[CASE_UPPER])
     168              lower[code] = make_hex(fields[CASE_LOWER])
     169              title[code] = make_hex(fields[CASE_TITLE])
     170  
     171      print_tests(version, upper, title, lower)
     172  
     173  
     174  def print_tests(version, upper, title, lower):
     175      print(
     176          """\
     177  # Test cases generated from Unicode {} data
     178  # by gen-casemap-txt.py. Do not edit.
     179  #
     180  # Some special hand crafted tests
     181  #
     182  tr_TR\ti\ti\t\u0130\t\u0130\t# i => LATIN CAPITAL LETTER I WITH DOT ABOVE
     183  tr_TR\tI\t\u0131\tI\tI\t# I => LATIN SMALL LETTER DOTLESS I
     184  tr_TR\tI\u0307\ti\tI\u0307\tI\u0307\t# I => LATIN SMALL LETTER DOTLESS I
     185  tr_TR.UTF-8\ti\ti\t\u0130\t\u0130\t# i => LATIN CAPITAL LETTER I WITH DOT ABOVE
     186  tr_TR.UTF-8\tI\t\u0131\tI\tI\t# I => LATIN SMALL LETTER DOTLESS I
     187  tr_TR.UTF-8\tI\u0307\ti\tI\u0307\tI\u0307\t# I => LATIN SMALL LETTER DOTLESS I
     188  # Test reordering of YPOGEGRAMMENI across other accents
     189  \t\u03b1\u0345\u0314\t\u03b1\u0345\u0314\t\u0391\u0345\u0314\t\u0391\u0314\u0399\t
     190  \t\u03b1\u0314\u0345\t\u03b1\u0314\u0345\t\u0391\u0314\u0345\t\u0391\u0314\u0399\t
     191  # Handling of final and nonfinal sigma
     192  \tΜΆΙΟΣ 	μάιος 	Μάιος 	ΜΆΙΟΣ \t
     193  \tΜΆΙΟΣ	μάιος	Μάιος	ΜΆΙΟΣ\t
     194  \tΣΙΓΜΑ	σιγμα	Σιγμα	ΣΙΓΜΑ\t
     195  # Lithuanian rule of i followed by letter with dot. Not at all sure
     196  # about the titlecase part here
     197  lt_LT\ti\u0117\ti\u0117\tIe\tIE\t
     198  lt_LT\tie\u0307\tie\u0307\tIe\tIE\t
     199  lt_LT\t\u00cc\ti\u0307\u0300\t\u00cc\t\u00cc\t # LATIN CAPITAL LETTER I WITH GRAVE
     200  lt_LT\t\u00CD\ti\u0307\u0301\t\u00CD\t\u00CD\t # LATIN CAPITAL LETTER I WITH ACUTE
     201  lt_LT\t\u0128\ti\u0307\u0303\t\u0128\t\u0128\t # LATIN CAPITAL LETTER I WITH TILDE
     202  lt_LT\tI\u0301\ti\u0307\u0301\tI\u0301\tI\u0301\t # LATIN CAPITAL LETTER I (with acute accent)
     203  lt_LT\tI\u0300\ti\u0307\u0300\tI\u0300\tI\u0300\t # LATIN CAPITAL LETTER I (with grave accent)
     204  lt_LT\tI\u0303\ti\u0307\u0303\tI\u0303\tI\u0303\t # LATIN CAPITAL LETTER I (with tilde above)
     205  lt_LT\tI\u0328\u0301\ti\u0307\u0328\u0301\tI\u0328\u0301\tI\u0328\u0301\t # LATIN CAPITAL LETTER I (with ogonek and acute accent)
     206  lt_LT\tJ\u0301\tj\u0307\u0301\tJ\u0301\tJ\u0301\t # LATIN CAPITAL LETTER J (with acute accent)
     207  lt_LT\t\u012e\u0301\t\u012f\u0307\u0301\t\u012e\u0301\t\u012e\u0301\t # LATIN CAPITAL LETTER I WITH OGONEK (with acute accent)
     208  lt_LT.UTF-8\ti\u0117\ti\u0117\tIe\tIE\t
     209  lt_LT.UTF-8\tie\u0307\tie\u0307\tIe\tIE\t
     210  lt_LT.UTF-8\t\u00cc\ti\u0307\u0300\t\u00cc\t\u00cc\t # LATIN CAPITAL LETTER I WITH GRAVE
     211  lt_LT.UTF-8\t\u00CD\ti\u0307\u0301\t\u00CD\t\u00CD\t # LATIN CAPITAL LETTER I WITH ACUTE
     212  lt_LT.UTF-8\t\u0128\ti\u0307\u0303\t\u0128\t\u0128\t # LATIN CAPITAL LETTER I WITH TILDE
     213  lt_LT.UTF-8\tI\u0301\ti\u0307\u0301\tI\u0301\tI\u0301\t # LATIN CAPITAL LETTER I (with acute accent)
     214  lt_LT.UTF-8\tI\u0300\ti\u0307\u0300\tI\u0300\tI\u0300\t # LATIN CAPITAL LETTER I (with grave accent)
     215  lt_LT.UTF-8\tI\u0303\ti\u0307\u0303\tI\u0303\tI\u0303\t # LATIN CAPITAL LETTER I (with tilde above)
     216  lt_LT.UTF-8\tI\u0328\u0301\ti\u0307\u0328\u0301\tI\u0328\u0301\tI\u0328\u0301\t # LATIN CAPITAL LETTER I (with ogonek and acute accent)
     217  lt_LT.UTF-8\tJ\u0301\tj\u0307\u0301\tJ\u0301\tJ\u0301\t # LATIN CAPITAL LETTER J (with acute accent)
     218  lt_LT.UTF-8\t\u012e\u0301\t\u012f\u0307\u0301\t\u012e\u0301\t\u012e\u0301\t # LATIN CAPITAL LETTER I WITH OGONEK (with acute accent)
     219  # Special case not at initial position
     220  \ta\ufb04\ta\ufb04\tAffl\tAFFL\t# FB04
     221  #
     222  # Now the automatic tests
     223  #""".format(
     224              version
     225          )
     226      )
     227  
     228      for i in range(0x10FFFF):
     229          if i == 0x3A3:
     230              # Greek sigma needs special tests
     231              continue
     232  
     233          up = upper.get(i, "")
     234          lo = lower.get(i, "")
     235          ti = title.get(i, "")
     236  
     237          if any([up, lo, ti]):
     238              print("\t%s\t%s\t%s\t%s\t# %4X" % (chr(i), lo, ti, up, i))
     239  
     240  
     241  if __name__ == "__main__":
     242      sys.exit(main(sys.argv))