1  """ Tests for the unicodedata module.
       2  
       3      Written by Marc-Andre Lemburg (mal@lemburg.com).
       4  
       5      (c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
       6  
       7  """
       8  
       9  import hashlib
      10  from http.client import HTTPException
      11  import sys
      12  import unicodedata
      13  import unittest
      14  from test.support import (open_urlresource, requires_resource, script_helper,
      15                            cpython_only, check_disallow_instantiation)
      16  
      17  
      18  class ESC[4;38;5;81mUnicodeMethodsTest(ESC[4;38;5;149munittestESC[4;38;5;149m.ESC[4;38;5;149mTestCase):
      19  
      20      # update this, if the database changes
      21      expectedchecksum = 'e708c31c0d51f758adf475cb7201cf80917362be'
      22  
      23      @requires_resource('cpu')
      24      def test_method_checksum(self):
      25          h = hashlib.sha1()
      26          for i in range(sys.maxunicode + 1):
      27              char = chr(i)
      28              data = [
      29                  # Predicates (single char)
      30                  "01"[char.isalnum()],
      31                  "01"[char.isalpha()],
      32                  "01"[char.isdecimal()],
      33                  "01"[char.isdigit()],
      34                  "01"[char.islower()],
      35                  "01"[char.isnumeric()],
      36                  "01"[char.isspace()],
      37                  "01"[char.istitle()],
      38                  "01"[char.isupper()],
      39  
      40                  # Predicates (multiple chars)
      41                  "01"[(char + 'abc').isalnum()],
      42                  "01"[(char + 'abc').isalpha()],
      43                  "01"[(char + '123').isdecimal()],
      44                  "01"[(char + '123').isdigit()],
      45                  "01"[(char + 'abc').islower()],
      46                  "01"[(char + '123').isnumeric()],
      47                  "01"[(char + ' \t').isspace()],
      48                  "01"[(char + 'abc').istitle()],
      49                  "01"[(char + 'ABC').isupper()],
      50  
      51                  # Mappings (single char)
      52                  char.lower(),
      53                  char.upper(),
      54                  char.title(),
      55  
      56                  # Mappings (multiple chars)
      57                  (char + 'abc').lower(),
      58                  (char + 'ABC').upper(),
      59                  (char + 'abc').title(),
      60                  (char + 'ABC').title(),
      61  
      62                  ]
      63              h.update(''.join(data).encode('utf-8', 'surrogatepass'))
      64          result = h.hexdigest()
      65          self.assertEqual(result, self.expectedchecksum)
      66  
      67  class ESC[4;38;5;81mUnicodeDatabaseTest(ESC[4;38;5;149munittestESC[4;38;5;149m.ESC[4;38;5;149mTestCase):
      68      db = unicodedata
      69  
      70  class ESC[4;38;5;81mUnicodeFunctionsTest(ESC[4;38;5;149mUnicodeDatabaseTest):
      71  
      72      # Update this if the database changes. Make sure to do a full rebuild
      73      # (e.g. 'make distclean && make') to get the correct checksum.
      74      expectedchecksum = '26ff0d31c14194b4606a5b3a81ac36df3a14e331'
      75  
      76      @requires_resource('cpu')
      77      def test_function_checksum(self):
      78          data = []
      79          h = hashlib.sha1()
      80  
      81          for i in range(sys.maxunicode + 1):
      82              char = chr(i)
      83              data = [
      84                  # Properties
      85                  format(self.db.digit(char, -1), '.12g'),
      86                  format(self.db.numeric(char, -1), '.12g'),
      87                  format(self.db.decimal(char, -1), '.12g'),
      88                  self.db.category(char),
      89                  self.db.bidirectional(char),
      90                  self.db.decomposition(char),
      91                  str(self.db.mirrored(char)),
      92                  str(self.db.combining(char)),
      93                  unicodedata.east_asian_width(char),
      94                  self.db.name(char, ""),
      95              ]
      96              h.update(''.join(data).encode("ascii"))
      97          result = h.hexdigest()
      98          self.assertEqual(result, self.expectedchecksum)
      99  
     100      @requires_resource('cpu')
     101      def test_name_inverse_lookup(self):
     102          for i in range(sys.maxunicode + 1):
     103              char = chr(i)
     104              if looked_name := self.db.name(char, None):
     105                  self.assertEqual(self.db.lookup(looked_name), char)
     106  
     107      def test_digit(self):
     108          self.assertEqual(self.db.digit('A', None), None)
     109          self.assertEqual(self.db.digit('9'), 9)
     110          self.assertEqual(self.db.digit('\u215b', None), None)
     111          self.assertEqual(self.db.digit('\u2468'), 9)
     112          self.assertEqual(self.db.digit('\U00020000', None), None)
     113          self.assertEqual(self.db.digit('\U0001D7FD'), 7)
     114  
     115          self.assertRaises(TypeError, self.db.digit)
     116          self.assertRaises(TypeError, self.db.digit, 'xx')
     117          self.assertRaises(ValueError, self.db.digit, 'x')
     118  
     119      def test_numeric(self):
     120          self.assertEqual(self.db.numeric('A',None), None)
     121          self.assertEqual(self.db.numeric('9'), 9)
     122          self.assertEqual(self.db.numeric('\u215b'), 0.125)
     123          self.assertEqual(self.db.numeric('\u2468'), 9.0)
     124          self.assertEqual(self.db.numeric('\ua627'), 7.0)
     125          self.assertEqual(self.db.numeric('\U00020000', None), None)
     126          self.assertEqual(self.db.numeric('\U0001012A'), 9000)
     127  
     128          self.assertRaises(TypeError, self.db.numeric)
     129          self.assertRaises(TypeError, self.db.numeric, 'xx')
     130          self.assertRaises(ValueError, self.db.numeric, 'x')
     131  
     132      def test_decimal(self):
     133          self.assertEqual(self.db.decimal('A',None), None)
     134          self.assertEqual(self.db.decimal('9'), 9)
     135          self.assertEqual(self.db.decimal('\u215b', None), None)
     136          self.assertEqual(self.db.decimal('\u2468', None), None)
     137          self.assertEqual(self.db.decimal('\U00020000', None), None)
     138          self.assertEqual(self.db.decimal('\U0001D7FD'), 7)
     139  
     140          self.assertRaises(TypeError, self.db.decimal)
     141          self.assertRaises(TypeError, self.db.decimal, 'xx')
     142          self.assertRaises(ValueError, self.db.decimal, 'x')
     143  
     144      def test_category(self):
     145          self.assertEqual(self.db.category('\uFFFE'), 'Cn')
     146          self.assertEqual(self.db.category('a'), 'Ll')
     147          self.assertEqual(self.db.category('A'), 'Lu')
     148          self.assertEqual(self.db.category('\U00020000'), 'Lo')
     149          self.assertEqual(self.db.category('\U0001012A'), 'No')
     150  
     151          self.assertRaises(TypeError, self.db.category)
     152          self.assertRaises(TypeError, self.db.category, 'xx')
     153  
     154      def test_bidirectional(self):
     155          self.assertEqual(self.db.bidirectional('\uFFFE'), '')
     156          self.assertEqual(self.db.bidirectional(' '), 'WS')
     157          self.assertEqual(self.db.bidirectional('A'), 'L')
     158          self.assertEqual(self.db.bidirectional('\U00020000'), 'L')
     159  
     160          self.assertRaises(TypeError, self.db.bidirectional)
     161          self.assertRaises(TypeError, self.db.bidirectional, 'xx')
     162  
     163      def test_decomposition(self):
     164          self.assertEqual(self.db.decomposition('\uFFFE'),'')
     165          self.assertEqual(self.db.decomposition('\u00bc'), '<fraction> 0031 2044 0034')
     166  
     167          self.assertRaises(TypeError, self.db.decomposition)
     168          self.assertRaises(TypeError, self.db.decomposition, 'xx')
     169  
     170      def test_mirrored(self):
     171          self.assertEqual(self.db.mirrored('\uFFFE'), 0)
     172          self.assertEqual(self.db.mirrored('a'), 0)
     173          self.assertEqual(self.db.mirrored('\u2201'), 1)
     174          self.assertEqual(self.db.mirrored('\U00020000'), 0)
     175  
     176          self.assertRaises(TypeError, self.db.mirrored)
     177          self.assertRaises(TypeError, self.db.mirrored, 'xx')
     178  
     179      def test_combining(self):
     180          self.assertEqual(self.db.combining('\uFFFE'), 0)
     181          self.assertEqual(self.db.combining('a'), 0)
     182          self.assertEqual(self.db.combining('\u20e1'), 230)
     183          self.assertEqual(self.db.combining('\U00020000'), 0)
     184  
     185          self.assertRaises(TypeError, self.db.combining)
     186          self.assertRaises(TypeError, self.db.combining, 'xx')
     187  
     188      def test_pr29(self):
     189          # https://www.unicode.org/review/pr-29.html
     190          # See issues #1054943 and #10254.
     191          composed = ("\u0b47\u0300\u0b3e", "\u1100\u0300\u1161",
     192                      'Li\u030dt-s\u1e73\u0301',
     193                      '\u092e\u093e\u0930\u094d\u0915 \u091c\u093c'
     194                      + '\u0941\u0915\u0947\u0930\u092c\u0930\u094d\u0917',
     195                      '\u0915\u093f\u0930\u094d\u0917\u093f\u091c\u093c'
     196                      + '\u0938\u094d\u0924\u093e\u0928')
     197          for text in composed:
     198              self.assertEqual(self.db.normalize('NFC', text), text)
     199  
     200      def test_issue10254(self):
     201          # Crash reported in #10254
     202          a = 'C\u0338' * 20  + 'C\u0327'
     203          b = 'C\u0338' * 20  + '\xC7'
     204          self.assertEqual(self.db.normalize('NFC', a), b)
     205  
     206      def test_issue29456(self):
     207          # Fix #29456
     208          u1176_str_a = '\u1100\u1176\u11a8'
     209          u1176_str_b = '\u1100\u1176\u11a8'
     210          u11a7_str_a = '\u1100\u1175\u11a7'
     211          u11a7_str_b = '\uae30\u11a7'
     212          u11c3_str_a = '\u1100\u1175\u11c3'
     213          u11c3_str_b = '\uae30\u11c3'
     214          self.assertEqual(self.db.normalize('NFC', u1176_str_a), u1176_str_b)
     215          self.assertEqual(self.db.normalize('NFC', u11a7_str_a), u11a7_str_b)
     216          self.assertEqual(self.db.normalize('NFC', u11c3_str_a), u11c3_str_b)
     217  
     218      def test_east_asian_width(self):
     219          eaw = self.db.east_asian_width
     220          self.assertRaises(TypeError, eaw, b'a')
     221          self.assertRaises(TypeError, eaw, bytearray())
     222          self.assertRaises(TypeError, eaw, '')
     223          self.assertRaises(TypeError, eaw, 'ra')
     224          self.assertEqual(eaw('\x1e'), 'N')
     225          self.assertEqual(eaw('\x20'), 'Na')
     226          self.assertEqual(eaw('\uC894'), 'W')
     227          self.assertEqual(eaw('\uFF66'), 'H')
     228          self.assertEqual(eaw('\uFF1F'), 'F')
     229          self.assertEqual(eaw('\u2010'), 'A')
     230          self.assertEqual(eaw('\U00020000'), 'W')
     231  
     232      def test_east_asian_width_unassigned(self):
     233          eaw = self.db.east_asian_width
     234          # unassigned
     235          for char in '\u0530\u0ecf\u10c6\u20fc\uaaca\U000107bd\U000115f2':
     236              self.assertEqual(eaw(char), 'N')
     237              self.assertIs(self.db.name(char, None), None)
     238  
     239          # unassigned but reserved for CJK
     240          for char in '\uFA6E\uFADA\U0002A6E0\U0002FA20\U0003134B\U0003FFFD':
     241              self.assertEqual(eaw(char), 'W')
     242              self.assertIs(self.db.name(char, None), None)
     243  
     244          # private use areas
     245          for char in '\uE000\uF800\U000F0000\U000FFFEE\U00100000\U0010FFF0':
     246              self.assertEqual(eaw(char), 'A')
     247              self.assertIs(self.db.name(char, None), None)
     248  
     249      def test_east_asian_width_9_0_changes(self):
     250          self.assertEqual(self.db.ucd_3_2_0.east_asian_width('\u231a'), 'N')
     251          self.assertEqual(self.db.east_asian_width('\u231a'), 'W')
     252  
     253  class ESC[4;38;5;81mUnicodeMiscTest(ESC[4;38;5;149mUnicodeDatabaseTest):
     254  
     255      @cpython_only
     256      def test_disallow_instantiation(self):
     257          # Ensure that the type disallows instantiation (bpo-43916)
     258          check_disallow_instantiation(self, unicodedata.UCD)
     259  
     260      def test_failed_import_during_compiling(self):
     261          # Issue 4367
     262          # Decoding \N escapes requires the unicodedata module. If it can't be
     263          # imported, we shouldn't segfault.
     264  
     265          # This program should raise a SyntaxError in the eval.
     266          code = "import sys;" \
     267              "sys.modules['unicodedata'] = None;" \
     268              """eval("'\\\\N{SOFT HYPHEN}'")"""
     269          # We use a separate process because the unicodedata module may already
     270          # have been loaded in this process.
     271          result = script_helper.assert_python_failure("-c", code)
     272          error = "SyntaxError: (unicode error) \\N escapes not supported " \
     273              "(can't load unicodedata module)"
     274          self.assertIn(error, result.err.decode("ascii"))
     275  
     276      def test_decimal_numeric_consistent(self):
     277          # Test that decimal and numeric are consistent,
     278          # i.e. if a character has a decimal value,
     279          # its numeric value should be the same.
     280          count = 0
     281          for i in range(0x10000):
     282              c = chr(i)
     283              dec = self.db.decimal(c, -1)
     284              if dec != -1:
     285                  self.assertEqual(dec, self.db.numeric(c))
     286                  count += 1
     287          self.assertTrue(count >= 10) # should have tested at least the ASCII digits
     288  
     289      def test_digit_numeric_consistent(self):
     290          # Test that digit and numeric are consistent,
     291          # i.e. if a character has a digit value,
     292          # its numeric value should be the same.
     293          count = 0
     294          for i in range(0x10000):
     295              c = chr(i)
     296              dec = self.db.digit(c, -1)
     297              if dec != -1:
     298                  self.assertEqual(dec, self.db.numeric(c))
     299                  count += 1
     300          self.assertTrue(count >= 10) # should have tested at least the ASCII digits
     301  
     302      def test_bug_1704793(self):
     303          self.assertEqual(self.db.lookup("GOTHIC LETTER FAIHU"), '\U00010346')
     304  
     305      def test_ucd_510(self):
     306          import unicodedata
     307          # In UCD 5.1.0, a mirrored property changed wrt. UCD 3.2.0
     308          self.assertTrue(unicodedata.mirrored("\u0f3a"))
     309          self.assertTrue(not unicodedata.ucd_3_2_0.mirrored("\u0f3a"))
     310          # Also, we now have two ways of representing
     311          # the upper-case mapping: as delta, or as absolute value
     312          self.assertTrue("a".upper()=='A')
     313          self.assertTrue("\u1d79".upper()=='\ua77d')
     314          self.assertTrue(".".upper()=='.')
     315  
     316      @requires_resource('cpu')
     317      def test_bug_5828(self):
     318          self.assertEqual("\u1d79".lower(), "\u1d79")
     319          # Only U+0000 should have U+0000 as its upper/lower/titlecase variant
     320          self.assertEqual(
     321              [
     322                  c for c in range(sys.maxunicode+1)
     323                  if "\x00" in chr(c).lower()+chr(c).upper()+chr(c).title()
     324              ],
     325              [0]
     326          )
     327  
     328      def test_bug_4971(self):
     329          # LETTER DZ WITH CARON: DZ, Dz, dz
     330          self.assertEqual("\u01c4".title(), "\u01c5")
     331          self.assertEqual("\u01c5".title(), "\u01c5")
     332          self.assertEqual("\u01c6".title(), "\u01c5")
     333  
     334      def test_linebreak_7643(self):
     335          for i in range(0x10000):
     336              lines = (chr(i) + 'A').splitlines()
     337              if i in (0x0a, 0x0b, 0x0c, 0x0d, 0x85,
     338                       0x1c, 0x1d, 0x1e, 0x2028, 0x2029):
     339                  self.assertEqual(len(lines), 2,
     340                                   r"\u%.4x should be a linebreak" % i)
     341              else:
     342                  self.assertEqual(len(lines), 1,
     343                                   r"\u%.4x should not be a linebreak" % i)
     344  
     345  class ESC[4;38;5;81mNormalizationTest(ESC[4;38;5;149munittestESC[4;38;5;149m.ESC[4;38;5;149mTestCase):
     346      @staticmethod
     347      def check_version(testfile):
     348          hdr = testfile.readline()
     349          return unicodedata.unidata_version in hdr
     350  
     351      @staticmethod
     352      def unistr(data):
     353          data = [int(x, 16) for x in data.split(" ")]
     354          return "".join([chr(x) for x in data])
     355  
     356      @requires_resource('network')
     357      @requires_resource('cpu')
     358      def test_normalization(self):
     359          TESTDATAFILE = "NormalizationTest.txt"
     360          TESTDATAURL = f"http://www.pythontest.net/unicode/{unicodedata.unidata_version}/{TESTDATAFILE}"
     361  
     362          # Hit the exception early
     363          try:
     364              testdata = open_urlresource(TESTDATAURL, encoding="utf-8",
     365                                          check=self.check_version)
     366          except PermissionError:
     367              self.skipTest(f"Permission error when downloading {TESTDATAURL} "
     368                            f"into the test data directory")
     369          except (OSError, HTTPException) as exc:
     370              self.skipTest(f"Failed to download {TESTDATAURL}: {exc}")
     371  
     372          with testdata:
     373              self.run_normalization_tests(testdata)
     374  
     375      def run_normalization_tests(self, testdata):
     376          part = None
     377          part1_data = {}
     378  
     379          def NFC(str):
     380              return unicodedata.normalize("NFC", str)
     381  
     382          def NFKC(str):
     383              return unicodedata.normalize("NFKC", str)
     384  
     385          def NFD(str):
     386              return unicodedata.normalize("NFD", str)
     387  
     388          def NFKD(str):
     389              return unicodedata.normalize("NFKD", str)
     390  
     391          for line in testdata:
     392              if '#' in line:
     393                  line = line.split('#')[0]
     394              line = line.strip()
     395              if not line:
     396                  continue
     397              if line.startswith("@Part"):
     398                  part = line.split()[0]
     399                  continue
     400              c1,c2,c3,c4,c5 = [self.unistr(x) for x in line.split(';')[:-1]]
     401  
     402              # Perform tests
     403              self.assertTrue(c2 ==  NFC(c1) ==  NFC(c2) ==  NFC(c3), line)
     404              self.assertTrue(c4 ==  NFC(c4) ==  NFC(c5), line)
     405              self.assertTrue(c3 ==  NFD(c1) ==  NFD(c2) ==  NFD(c3), line)
     406              self.assertTrue(c5 ==  NFD(c4) ==  NFD(c5), line)
     407              self.assertTrue(c4 == NFKC(c1) == NFKC(c2) == \
     408                              NFKC(c3) == NFKC(c4) == NFKC(c5),
     409                              line)
     410              self.assertTrue(c5 == NFKD(c1) == NFKD(c2) == \
     411                              NFKD(c3) == NFKD(c4) == NFKD(c5),
     412                              line)
     413  
     414              self.assertTrue(unicodedata.is_normalized("NFC", c2))
     415              self.assertTrue(unicodedata.is_normalized("NFC", c4))
     416  
     417              self.assertTrue(unicodedata.is_normalized("NFD", c3))
     418              self.assertTrue(unicodedata.is_normalized("NFD", c5))
     419  
     420              self.assertTrue(unicodedata.is_normalized("NFKC", c4))
     421              self.assertTrue(unicodedata.is_normalized("NFKD", c5))
     422  
     423              # Record part 1 data
     424              if part == "@Part1":
     425                  part1_data[c1] = 1
     426  
     427          # Perform tests for all other data
     428          for c in range(sys.maxunicode+1):
     429              X = chr(c)
     430              if X in part1_data:
     431                  continue
     432              self.assertTrue(X == NFC(X) == NFD(X) == NFKC(X) == NFKD(X), c)
     433  
     434      def test_edge_cases(self):
     435          self.assertRaises(TypeError, unicodedata.normalize)
     436          self.assertRaises(ValueError, unicodedata.normalize, 'unknown', 'xx')
     437          self.assertEqual(unicodedata.normalize('NFKC', ''), '')
     438  
     439      def test_bug_834676(self):
     440          # Check for bug 834676
     441          unicodedata.normalize('NFC', '\ud55c\uae00')
     442  
     443  
# Run every test in this module when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()