python (3.11.7)
       1  """ Tests for the unicodedata module.
       2  
       3      Written by Marc-Andre Lemburg (mal@lemburg.com).
       4  
       5      (c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
       6  
       7  """
       8  
       9  import hashlib
      10  from http.client import HTTPException
      11  import sys
      12  import unicodedata
      13  import unittest
      14  from test.support import (open_urlresource, requires_resource, script_helper,
      15                            cpython_only, check_disallow_instantiation,
      16                            ResourceDenied)
      17  
      18  
      19  class ESC[4;38;5;81mUnicodeMethodsTest(ESC[4;38;5;149munittestESC[4;38;5;149m.ESC[4;38;5;149mTestCase):
      20  
      21      # update this, if the database changes
      22      expectedchecksum = '4739770dd4d0e5f1b1677accfc3552ed3c8ef326'
      23  
      24      @requires_resource('cpu')
    def test_method_checksum(self):
        # Feed the results of the str predicates and case mappings of every
        # code point (0 through sys.maxunicode) into a single SHA-1 digest
        # and compare its hex form with the recorded expectedchecksum.
        h = hashlib.sha1()
        for i in range(sys.maxunicode + 1):
            char = chr(i)
            data = [
                # Predicates (single char)
                # Indexing "01" with a bool yields '0' (False) or '1' (True).
                "01"[char.isalnum()],
                "01"[char.isalpha()],
                "01"[char.isdecimal()],
                "01"[char.isdigit()],
                "01"[char.islower()],
                "01"[char.isnumeric()],
                "01"[char.isspace()],
                "01"[char.istitle()],
                "01"[char.isupper()],

                # Predicates (multiple chars)
                "01"[(char + 'abc').isalnum()],
                "01"[(char + 'abc').isalpha()],
                "01"[(char + '123').isdecimal()],
                "01"[(char + '123').isdigit()],
                "01"[(char + 'abc').islower()],
                "01"[(char + '123').isnumeric()],
                "01"[(char + ' \t').isspace()],
                "01"[(char + 'abc').istitle()],
                "01"[(char + 'ABC').isupper()],

                # Mappings (single char)
                char.lower(),
                char.upper(),
                char.title(),

                # Mappings (multiple chars)
                (char + 'abc').lower(),
                (char + 'ABC').upper(),
                (char + 'abc').title(),
                (char + 'ABC').title(),

                ]
            # The loop covers the surrogate code points as well, so the
            # 'surrogatepass' handler is needed for the UTF-8 encoding.
            h.update(''.join(data).encode('utf-8', 'surrogatepass'))
        result = h.hexdigest()
        self.assertEqual(result, self.expectedchecksum)
      67  
class UnicodeDatabaseTest(unittest.TestCase):
    # Database object under test; test methods reach it through self.db.
    db = unicodedata
      70  
      71  class ESC[4;38;5;81mUnicodeFunctionsTest(ESC[4;38;5;149mUnicodeDatabaseTest):
      72  
      73      # Update this if the database changes. Make sure to do a full rebuild
      74      # (e.g. 'make distclean && make') to get the correct checksum.
      75      expectedchecksum = '98d602e1f69d5c5bb8a5910c40bbbad4e18e8370'
      76  
      77      @requires_resource('cpu')
      78      def test_function_checksum(self):
      79          data = []
      80          h = hashlib.sha1()
      81  
      82          for i in range(sys.maxunicode + 1):
      83              char = chr(i)
      84              data = [
      85                  # Properties
      86                  format(self.db.digit(char, -1), '.12g'),
      87                  format(self.db.numeric(char, -1), '.12g'),
      88                  format(self.db.decimal(char, -1), '.12g'),
      89                  self.db.category(char),
      90                  self.db.bidirectional(char),
      91                  self.db.decomposition(char),
      92                  str(self.db.mirrored(char)),
      93                  str(self.db.combining(char)),
      94              ]
      95              h.update(''.join(data).encode("ascii"))
      96          result = h.hexdigest()
      97          self.assertEqual(result, self.expectedchecksum)
      98  
      99      @requires_resource('cpu')
     100      def test_name_inverse_lookup(self):
     101          for i in range(sys.maxunicode + 1):
     102              char = chr(i)
     103              if looked_name := self.db.name(char, None):
     104                  self.assertEqual(self.db.lookup(looked_name), char)
     105  
     106      def test_digit(self):
     107          self.assertEqual(self.db.digit('A', None), None)
     108          self.assertEqual(self.db.digit('9'), 9)
     109          self.assertEqual(self.db.digit('\u215b', None), None)
     110          self.assertEqual(self.db.digit('\u2468'), 9)
     111          self.assertEqual(self.db.digit('\U00020000', None), None)
     112          self.assertEqual(self.db.digit('\U0001D7FD'), 7)
     113  
     114          self.assertRaises(TypeError, self.db.digit)
     115          self.assertRaises(TypeError, self.db.digit, 'xx')
     116          self.assertRaises(ValueError, self.db.digit, 'x')
     117  
     118      def test_numeric(self):
     119          self.assertEqual(self.db.numeric('A',None), None)
     120          self.assertEqual(self.db.numeric('9'), 9)
     121          self.assertEqual(self.db.numeric('\u215b'), 0.125)
     122          self.assertEqual(self.db.numeric('\u2468'), 9.0)
     123          self.assertEqual(self.db.numeric('\ua627'), 7.0)
     124          self.assertEqual(self.db.numeric('\U00020000', None), None)
     125          self.assertEqual(self.db.numeric('\U0001012A'), 9000)
     126  
     127          self.assertRaises(TypeError, self.db.numeric)
     128          self.assertRaises(TypeError, self.db.numeric, 'xx')
     129          self.assertRaises(ValueError, self.db.numeric, 'x')
     130  
     131      def test_decimal(self):
     132          self.assertEqual(self.db.decimal('A',None), None)
     133          self.assertEqual(self.db.decimal('9'), 9)
     134          self.assertEqual(self.db.decimal('\u215b', None), None)
     135          self.assertEqual(self.db.decimal('\u2468', None), None)
     136          self.assertEqual(self.db.decimal('\U00020000', None), None)
     137          self.assertEqual(self.db.decimal('\U0001D7FD'), 7)
     138  
     139          self.assertRaises(TypeError, self.db.decimal)
     140          self.assertRaises(TypeError, self.db.decimal, 'xx')
     141          self.assertRaises(ValueError, self.db.decimal, 'x')
     142  
     143      def test_category(self):
     144          self.assertEqual(self.db.category('\uFFFE'), 'Cn')
     145          self.assertEqual(self.db.category('a'), 'Ll')
     146          self.assertEqual(self.db.category('A'), 'Lu')
     147          self.assertEqual(self.db.category('\U00020000'), 'Lo')
     148          self.assertEqual(self.db.category('\U0001012A'), 'No')
     149  
     150          self.assertRaises(TypeError, self.db.category)
     151          self.assertRaises(TypeError, self.db.category, 'xx')
     152  
     153      def test_bidirectional(self):
     154          self.assertEqual(self.db.bidirectional('\uFFFE'), '')
     155          self.assertEqual(self.db.bidirectional(' '), 'WS')
     156          self.assertEqual(self.db.bidirectional('A'), 'L')
     157          self.assertEqual(self.db.bidirectional('\U00020000'), 'L')
     158  
     159          self.assertRaises(TypeError, self.db.bidirectional)
     160          self.assertRaises(TypeError, self.db.bidirectional, 'xx')
     161  
     162      def test_decomposition(self):
     163          self.assertEqual(self.db.decomposition('\uFFFE'),'')
     164          self.assertEqual(self.db.decomposition('\u00bc'), '<fraction> 0031 2044 0034')
     165  
     166          self.assertRaises(TypeError, self.db.decomposition)
     167          self.assertRaises(TypeError, self.db.decomposition, 'xx')
     168  
     169      def test_mirrored(self):
     170          self.assertEqual(self.db.mirrored('\uFFFE'), 0)
     171          self.assertEqual(self.db.mirrored('a'), 0)
     172          self.assertEqual(self.db.mirrored('\u2201'), 1)
     173          self.assertEqual(self.db.mirrored('\U00020000'), 0)
     174  
     175          self.assertRaises(TypeError, self.db.mirrored)
     176          self.assertRaises(TypeError, self.db.mirrored, 'xx')
     177  
     178      def test_combining(self):
     179          self.assertEqual(self.db.combining('\uFFFE'), 0)
     180          self.assertEqual(self.db.combining('a'), 0)
     181          self.assertEqual(self.db.combining('\u20e1'), 230)
     182          self.assertEqual(self.db.combining('\U00020000'), 0)
     183  
     184          self.assertRaises(TypeError, self.db.combining)
     185          self.assertRaises(TypeError, self.db.combining, 'xx')
     186  
     187      def test_pr29(self):
     188          # https://www.unicode.org/review/pr-29.html
     189          # See issues #1054943 and #10254.
     190          composed = ("\u0b47\u0300\u0b3e", "\u1100\u0300\u1161",
     191                      'Li\u030dt-s\u1e73\u0301',
     192                      '\u092e\u093e\u0930\u094d\u0915 \u091c\u093c'
     193                      + '\u0941\u0915\u0947\u0930\u092c\u0930\u094d\u0917',
     194                      '\u0915\u093f\u0930\u094d\u0917\u093f\u091c\u093c'
     195                      + '\u0938\u094d\u0924\u093e\u0928')
     196          for text in composed:
     197              self.assertEqual(self.db.normalize('NFC', text), text)
     198  
     199      def test_issue10254(self):
     200          # Crash reported in #10254
     201          a = 'C\u0338' * 20  + 'C\u0327'
     202          b = 'C\u0338' * 20  + '\xC7'
     203          self.assertEqual(self.db.normalize('NFC', a), b)
     204  
     205      def test_issue29456(self):
     206          # Fix #29456
     207          u1176_str_a = '\u1100\u1176\u11a8'
     208          u1176_str_b = '\u1100\u1176\u11a8'
     209          u11a7_str_a = '\u1100\u1175\u11a7'
     210          u11a7_str_b = '\uae30\u11a7'
     211          u11c3_str_a = '\u1100\u1175\u11c3'
     212          u11c3_str_b = '\uae30\u11c3'
     213          self.assertEqual(self.db.normalize('NFC', u1176_str_a), u1176_str_b)
     214          self.assertEqual(self.db.normalize('NFC', u11a7_str_a), u11a7_str_b)
     215          self.assertEqual(self.db.normalize('NFC', u11c3_str_a), u11c3_str_b)
     216  
     217      def test_east_asian_width(self):
     218          eaw = self.db.east_asian_width
     219          self.assertRaises(TypeError, eaw, b'a')
     220          self.assertRaises(TypeError, eaw, bytearray())
     221          self.assertRaises(TypeError, eaw, '')
     222          self.assertRaises(TypeError, eaw, 'ra')
     223          self.assertEqual(eaw('\x1e'), 'N')
     224          self.assertEqual(eaw('\x20'), 'Na')
     225          self.assertEqual(eaw('\uC894'), 'W')
     226          self.assertEqual(eaw('\uFF66'), 'H')
     227          self.assertEqual(eaw('\uFF1F'), 'F')
     228          self.assertEqual(eaw('\u2010'), 'A')
     229          self.assertEqual(eaw('\U00020000'), 'W')
     230  
     231      def test_east_asian_width_9_0_changes(self):
     232          self.assertEqual(self.db.ucd_3_2_0.east_asian_width('\u231a'), 'N')
     233          self.assertEqual(self.db.east_asian_width('\u231a'), 'W')
     234  
     235  class ESC[4;38;5;81mUnicodeMiscTest(ESC[4;38;5;149mUnicodeDatabaseTest):
     236  
     237      @cpython_only
     238      def test_disallow_instantiation(self):
     239          # Ensure that the type disallows instantiation (bpo-43916)
     240          check_disallow_instantiation(self, unicodedata.UCD)
     241  
     242      def test_failed_import_during_compiling(self):
     243          # Issue 4367
     244          # Decoding \N escapes requires the unicodedata module. If it can't be
     245          # imported, we shouldn't segfault.
     246  
     247          # This program should raise a SyntaxError in the eval.
     248          code = "import sys;" \
     249              "sys.modules['unicodedata'] = None;" \
     250              """eval("'\\\\N{SOFT HYPHEN}'")"""
     251          # We use a separate process because the unicodedata module may already
     252          # have been loaded in this process.
     253          result = script_helper.assert_python_failure("-c", code)
     254          error = "SyntaxError: (unicode error) \\N escapes not supported " \
     255              "(can't load unicodedata module)"
     256          self.assertIn(error, result.err.decode("ascii"))
     257  
     258      def test_decimal_numeric_consistent(self):
     259          # Test that decimal and numeric are consistent,
     260          # i.e. if a character has a decimal value,
     261          # its numeric value should be the same.
     262          count = 0
     263          for i in range(0x10000):
     264              c = chr(i)
     265              dec = self.db.decimal(c, -1)
     266              if dec != -1:
     267                  self.assertEqual(dec, self.db.numeric(c))
     268                  count += 1
     269          self.assertTrue(count >= 10) # should have tested at least the ASCII digits
     270  
     271      def test_digit_numeric_consistent(self):
     272          # Test that digit and numeric are consistent,
     273          # i.e. if a character has a digit value,
     274          # its numeric value should be the same.
     275          count = 0
     276          for i in range(0x10000):
     277              c = chr(i)
     278              dec = self.db.digit(c, -1)
     279              if dec != -1:
     280                  self.assertEqual(dec, self.db.numeric(c))
     281                  count += 1
     282          self.assertTrue(count >= 10) # should have tested at least the ASCII digits
     283  
     284      def test_bug_1704793(self):
     285          self.assertEqual(self.db.lookup("GOTHIC LETTER FAIHU"), '\U00010346')
     286  
     287      def test_ucd_510(self):
     288          import unicodedata
     289          # In UCD 5.1.0, a mirrored property changed wrt. UCD 3.2.0
     290          self.assertTrue(unicodedata.mirrored("\u0f3a"))
     291          self.assertTrue(not unicodedata.ucd_3_2_0.mirrored("\u0f3a"))
     292          # Also, we now have two ways of representing
     293          # the upper-case mapping: as delta, or as absolute value
     294          self.assertTrue("a".upper()=='A')
     295          self.assertTrue("\u1d79".upper()=='\ua77d')
     296          self.assertTrue(".".upper()=='.')
     297  
     298      @requires_resource('cpu')
     299      def test_bug_5828(self):
     300          self.assertEqual("\u1d79".lower(), "\u1d79")
     301          # Only U+0000 should have U+0000 as its upper/lower/titlecase variant
     302          self.assertEqual(
     303              [
     304                  c for c in range(sys.maxunicode+1)
     305                  if "\x00" in chr(c).lower()+chr(c).upper()+chr(c).title()
     306              ],
     307              [0]
     308          )
     309  
     310      def test_bug_4971(self):
     311          # LETTER DZ WITH CARON: DZ, Dz, dz
     312          self.assertEqual("\u01c4".title(), "\u01c5")
     313          self.assertEqual("\u01c5".title(), "\u01c5")
     314          self.assertEqual("\u01c6".title(), "\u01c5")
     315  
     316      def test_linebreak_7643(self):
     317          for i in range(0x10000):
     318              lines = (chr(i) + 'A').splitlines()
     319              if i in (0x0a, 0x0b, 0x0c, 0x0d, 0x85,
     320                       0x1c, 0x1d, 0x1e, 0x2028, 0x2029):
     321                  self.assertEqual(len(lines), 2,
     322                                   r"\u%.4x should be a linebreak" % i)
     323              else:
     324                  self.assertEqual(len(lines), 1,
     325                                   r"\u%.4x should not be a linebreak" % i)
     326  
     327  class ESC[4;38;5;81mNormalizationTest(ESC[4;38;5;149munittestESC[4;38;5;149m.ESC[4;38;5;149mTestCase):
     328      @staticmethod
     329      def check_version(testfile):
     330          hdr = testfile.readline()
     331          return unicodedata.unidata_version in hdr
     332  
     333      @staticmethod
     334      def unistr(data):
     335          data = [int(x, 16) for x in data.split(" ")]
     336          return "".join([chr(x) for x in data])
     337  
     338      @requires_resource('network')
     339      @requires_resource('cpu')
     340      def test_normalization(self):
     341          TESTDATAFILE = "NormalizationTest.txt"
     342          TESTDATAURL = f"http://www.pythontest.net/unicode/{unicodedata.unidata_version}/{TESTDATAFILE}"
     343  
     344          # Hit the exception early
     345          try:
     346              testdata = open_urlresource(TESTDATAURL, encoding="utf-8",
     347                                          check=self.check_version)
     348          except PermissionError:
     349              self.skipTest(f"Permission error when downloading {TESTDATAURL} "
     350                            f"into the test data directory")
     351          except (OSError, HTTPException) as exc:
     352              self.skipTest(f"Failed to download {TESTDATAURL}: {exc}")
     353  
     354          with testdata:
     355              self.run_normalization_tests(testdata)
     356  
     357      def run_normalization_tests(self, testdata):
     358          part = None
     359          part1_data = {}
     360  
     361          def NFC(str):
     362              return unicodedata.normalize("NFC", str)
     363  
     364          def NFKC(str):
     365              return unicodedata.normalize("NFKC", str)
     366  
     367          def NFD(str):
     368              return unicodedata.normalize("NFD", str)
     369  
     370          def NFKD(str):
     371              return unicodedata.normalize("NFKD", str)
     372  
     373          for line in testdata:
     374              if '#' in line:
     375                  line = line.split('#')[0]
     376              line = line.strip()
     377              if not line:
     378                  continue
     379              if line.startswith("@Part"):
     380                  part = line.split()[0]
     381                  continue
     382              c1,c2,c3,c4,c5 = [self.unistr(x) for x in line.split(';')[:-1]]
     383  
     384              # Perform tests
     385              self.assertTrue(c2 ==  NFC(c1) ==  NFC(c2) ==  NFC(c3), line)
     386              self.assertTrue(c4 ==  NFC(c4) ==  NFC(c5), line)
     387              self.assertTrue(c3 ==  NFD(c1) ==  NFD(c2) ==  NFD(c3), line)
     388              self.assertTrue(c5 ==  NFD(c4) ==  NFD(c5), line)
     389              self.assertTrue(c4 == NFKC(c1) == NFKC(c2) == \
     390                              NFKC(c3) == NFKC(c4) == NFKC(c5),
     391                              line)
     392              self.assertTrue(c5 == NFKD(c1) == NFKD(c2) == \
     393                              NFKD(c3) == NFKD(c4) == NFKD(c5),
     394                              line)
     395  
     396              self.assertTrue(unicodedata.is_normalized("NFC", c2))
     397              self.assertTrue(unicodedata.is_normalized("NFC", c4))
     398  
     399              self.assertTrue(unicodedata.is_normalized("NFD", c3))
     400              self.assertTrue(unicodedata.is_normalized("NFD", c5))
     401  
     402              self.assertTrue(unicodedata.is_normalized("NFKC", c4))
     403              self.assertTrue(unicodedata.is_normalized("NFKD", c5))
     404  
     405              # Record part 1 data
     406              if part == "@Part1":
     407                  part1_data[c1] = 1
     408  
     409          # Perform tests for all other data
     410          for c in range(sys.maxunicode+1):
     411              X = chr(c)
     412              if X in part1_data:
     413                  continue
     414              self.assertTrue(X == NFC(X) == NFD(X) == NFKC(X) == NFKD(X), c)
     415  
     416      def test_edge_cases(self):
     417          self.assertRaises(TypeError, unicodedata.normalize)
     418          self.assertRaises(ValueError, unicodedata.normalize, 'unknown', 'xx')
     419          self.assertEqual(unicodedata.normalize('NFKC', ''), '')
     420  
     421      def test_bug_834676(self):
     422          # Check for bug 834676
     423          unicodedata.normalize('NFC', '\ud55c\uae00')
     424  
     425  
# Run this module's tests when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()