python (3.11.7)

(root)/
lib/
python3.11/
test/
test_ucn.py
       1  """ Test script for the Unicode implementation.
       2  
       3  Written by Bill Tutt.
       4  Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
       5  
       6  (c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
       7  
       8  """#"
       9  
      10  import ast
      11  import unittest
      12  import unicodedata
      13  
      14  from test import support
      15  from http.client import HTTPException
      16  
      17  try:
      18      from _testcapi import INT_MAX, PY_SSIZE_T_MAX, UINT_MAX
      19  except ImportError:
      20      INT_MAX = PY_SSIZE_T_MAX = UINT_MAX = 2**64 - 1
      21  
      22  class ESC[4;38;5;81mUnicodeNamesTest(ESC[4;38;5;149munittestESC[4;38;5;149m.ESC[4;38;5;149mTestCase):
      23  
      24      def checkletter(self, name, code):
      25          # Helper that put all \N escapes inside eval'd raw strings,
      26          # to make sure this script runs even if the compiler
      27          # chokes on \N escapes
      28          res = ast.literal_eval(r'"\N{%s}"' % name)
      29          self.assertEqual(res, code)
      30          return res
      31  
      32      def test_general(self):
      33          # General and case insensitivity test:
      34          chars = [
      35              "LATIN CAPITAL LETTER T",
      36              "LATIN SMALL LETTER H",
      37              "LATIN SMALL LETTER E",
      38              "SPACE",
      39              "LATIN SMALL LETTER R",
      40              "LATIN CAPITAL LETTER E",
      41              "LATIN SMALL LETTER D",
      42              "SPACE",
      43              "LATIN SMALL LETTER f",
      44              "LATIN CAPITAL LeTtEr o",
      45              "LATIN SMaLl LETTER x",
      46              "SPACE",
      47              "LATIN SMALL LETTER A",
      48              "LATIN SMALL LETTER T",
      49              "LATIN SMALL LETTER E",
      50              "SPACE",
      51              "LATIN SMALL LETTER T",
      52              "LATIN SMALL LETTER H",
      53              "LATIN SMALL LETTER E",
      54              "SpAcE",
      55              "LATIN SMALL LETTER S",
      56              "LATIN SMALL LETTER H",
      57              "LATIN small LETTER e",
      58              "LATIN small LETTER e",
      59              "LATIN SMALL LETTER P",
      60              "FULL STOP"
      61          ]
      62          string = "The rEd fOx ate the sheep."
      63  
      64          self.assertEqual(
      65              "".join([self.checkletter(*args) for args in zip(chars, string)]),
      66              string
      67          )
      68  
      69      def test_ascii_letters(self):
      70          for char in "".join(map(chr, range(ord("a"), ord("z")))):
      71              name = "LATIN SMALL LETTER %s" % char.upper()
      72              code = unicodedata.lookup(name)
      73              self.assertEqual(unicodedata.name(code), name)
      74  
      75      def test_hangul_syllables(self):
      76          self.checkletter("HANGUL SYLLABLE GA", "\uac00")
      77          self.checkletter("HANGUL SYLLABLE GGWEOSS", "\uafe8")
      78          self.checkletter("HANGUL SYLLABLE DOLS", "\ub3d0")
      79          self.checkletter("HANGUL SYLLABLE RYAN", "\ub7b8")
      80          self.checkletter("HANGUL SYLLABLE MWIK", "\ubba0")
      81          self.checkletter("HANGUL SYLLABLE BBWAEM", "\ubf88")
      82          self.checkletter("HANGUL SYLLABLE SSEOL", "\uc370")
      83          self.checkletter("HANGUL SYLLABLE YI", "\uc758")
      84          self.checkletter("HANGUL SYLLABLE JJYOSS", "\ucb40")
      85          self.checkletter("HANGUL SYLLABLE KYEOLS", "\ucf28")
      86          self.checkletter("HANGUL SYLLABLE PAN", "\ud310")
      87          self.checkletter("HANGUL SYLLABLE HWEOK", "\ud6f8")
      88          self.checkletter("HANGUL SYLLABLE HIH", "\ud7a3")
      89  
      90          self.assertRaises(ValueError, unicodedata.name, "\ud7a4")
      91  
      92      def test_cjk_unified_ideographs(self):
      93          self.checkletter("CJK UNIFIED IDEOGRAPH-3400", "\u3400")
      94          self.checkletter("CJK UNIFIED IDEOGRAPH-4DB5", "\u4db5")
      95          self.checkletter("CJK UNIFIED IDEOGRAPH-4E00", "\u4e00")
      96          self.checkletter("CJK UNIFIED IDEOGRAPH-9FCB", "\u9fCB")
      97          self.checkletter("CJK UNIFIED IDEOGRAPH-20000", "\U00020000")
      98          self.checkletter("CJK UNIFIED IDEOGRAPH-2A6D6", "\U0002a6d6")
      99          self.checkletter("CJK UNIFIED IDEOGRAPH-2A700", "\U0002A700")
     100          self.checkletter("CJK UNIFIED IDEOGRAPH-2B734", "\U0002B734")
     101          self.checkletter("CJK UNIFIED IDEOGRAPH-2B740", "\U0002B740")
     102          self.checkletter("CJK UNIFIED IDEOGRAPH-2B81D", "\U0002B81D")
     103          self.checkletter("CJK UNIFIED IDEOGRAPH-3134A", "\U0003134A")
     104  
     105      def test_bmp_characters(self):
     106          for code in range(0x10000):
     107              char = chr(code)
     108              name = unicodedata.name(char, None)
     109              if name is not None:
     110                  self.assertEqual(unicodedata.lookup(name), char)
     111  
     112      def test_misc_symbols(self):
     113          self.checkletter("PILCROW SIGN", "\u00b6")
     114          self.checkletter("REPLACEMENT CHARACTER", "\uFFFD")
     115          self.checkletter("HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK", "\uFF9F")
     116          self.checkletter("FULLWIDTH LATIN SMALL LETTER A", "\uFF41")
     117  
     118      def test_aliases(self):
     119          # Check that the aliases defined in the NameAliases.txt file work.
     120          # This should be updated when new aliases are added or the file
     121          # should be downloaded and parsed instead.  See #12753.
     122          aliases = [
     123              ('LATIN CAPITAL LETTER GHA', 0x01A2),
     124              ('LATIN SMALL LETTER GHA', 0x01A3),
     125              ('KANNADA LETTER LLLA', 0x0CDE),
     126              ('LAO LETTER FO FON', 0x0E9D),
     127              ('LAO LETTER FO FAY', 0x0E9F),
     128              ('LAO LETTER RO', 0x0EA3),
     129              ('LAO LETTER LO', 0x0EA5),
     130              ('TIBETAN MARK BKA- SHOG GI MGO RGYAN', 0x0FD0),
     131              ('YI SYLLABLE ITERATION MARK', 0xA015),
     132              ('PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRACKET', 0xFE18),
     133              ('BYZANTINE MUSICAL SYMBOL FTHORA SKLIRON CHROMA VASIS', 0x1D0C5)
     134          ]
     135          for alias, codepoint in aliases:
     136              self.checkletter(alias, chr(codepoint))
     137              name = unicodedata.name(chr(codepoint))
     138              self.assertNotEqual(name, alias)
     139              self.assertEqual(unicodedata.lookup(alias),
     140                               unicodedata.lookup(name))
     141              with self.assertRaises(KeyError):
     142                  unicodedata.ucd_3_2_0.lookup(alias)
     143  
     144      def test_aliases_names_in_pua_range(self):
     145          # We are storing aliases in the PUA 15, but their names shouldn't leak
     146          for cp in range(0xf0000, 0xf0100):
     147              with self.assertRaises(ValueError) as cm:
     148                  unicodedata.name(chr(cp))
     149              self.assertEqual(str(cm.exception), 'no such name')
     150  
     151      def test_named_sequences_names_in_pua_range(self):
     152          # We are storing named seq in the PUA 15, but their names shouldn't leak
     153          for cp in range(0xf0100, 0xf0fff):
     154              with self.assertRaises(ValueError) as cm:
     155                  unicodedata.name(chr(cp))
     156              self.assertEqual(str(cm.exception), 'no such name')
     157  
     158      def test_named_sequences_sample(self):
     159          # Check a few named sequences.  See #12753.
     160          sequences = [
     161              ('LATIN SMALL LETTER R WITH TILDE', '\u0072\u0303'),
     162              ('TAMIL SYLLABLE SAI', '\u0BB8\u0BC8'),
     163              ('TAMIL SYLLABLE MOO', '\u0BAE\u0BCB'),
     164              ('TAMIL SYLLABLE NNOO', '\u0BA3\u0BCB'),
     165              ('TAMIL CONSONANT KSS', '\u0B95\u0BCD\u0BB7\u0BCD'),
     166          ]
     167          for seqname, codepoints in sequences:
     168              self.assertEqual(unicodedata.lookup(seqname), codepoints)
     169              with self.assertRaises(SyntaxError):
     170                  self.checkletter(seqname, None)
     171              with self.assertRaises(KeyError):
     172                  unicodedata.ucd_3_2_0.lookup(seqname)
     173  
     174      def test_named_sequences_full(self):
     175          # Check all the named sequences
     176          def check_version(testfile):
     177              hdr = testfile.readline()
     178              return unicodedata.unidata_version in hdr
     179          url = ("http://www.pythontest.net/unicode/%s/NamedSequences.txt" %
     180                 unicodedata.unidata_version)
     181          try:
     182              testdata = support.open_urlresource(url, encoding="utf-8",
     183                                                  check=check_version)
     184          except (OSError, HTTPException):
     185              self.skipTest("Could not retrieve " + url)
     186          self.addCleanup(testdata.close)
     187          for line in testdata:
     188              line = line.strip()
     189              if not line or line.startswith('#'):
     190                  continue
     191              seqname, codepoints = line.split(';')
     192              codepoints = ''.join(chr(int(cp, 16)) for cp in codepoints.split())
     193              self.assertEqual(unicodedata.lookup(seqname), codepoints)
     194              with self.assertRaises(SyntaxError):
     195                  self.checkletter(seqname, None)
     196              with self.assertRaises(KeyError):
     197                  unicodedata.ucd_3_2_0.lookup(seqname)
     198  
     199      def test_errors(self):
     200          self.assertRaises(TypeError, unicodedata.name)
     201          self.assertRaises(TypeError, unicodedata.name, 'xx')
     202          self.assertRaises(TypeError, unicodedata.lookup)
     203          self.assertRaises(KeyError, unicodedata.lookup, 'unknown')
     204  
     205      def test_strict_error_handling(self):
     206          # bogus character name
     207          self.assertRaises(
     208              UnicodeError,
     209              str, b"\\N{blah}", 'unicode-escape', 'strict'
     210          )
     211          # long bogus character name
     212          self.assertRaises(
     213              UnicodeError,
     214              str, bytes("\\N{%s}" % ("x" * 100000), "ascii"), 'unicode-escape', 'strict'
     215          )
     216          # missing closing brace
     217          self.assertRaises(
     218              UnicodeError,
     219              str, b"\\N{SPACE", 'unicode-escape', 'strict'
     220          )
     221          # missing opening brace
     222          self.assertRaises(
     223              UnicodeError,
     224              str, b"\\NSPACE", 'unicode-escape', 'strict'
     225          )
     226  
     227      @support.cpython_only
     228      @unittest.skipUnless(INT_MAX < PY_SSIZE_T_MAX, "needs UINT_MAX < SIZE_MAX")
     229      @support.bigmemtest(size=UINT_MAX + 1, memuse=2 + 1, dry_run=False)
     230      def test_issue16335(self, size):
     231          # very very long bogus character name
     232          x = b'\\N{SPACE' + b'x' * (UINT_MAX + 1) + b'}'
     233          self.assertEqual(len(x), len(b'\\N{SPACE}') + (UINT_MAX + 1))
     234          self.assertRaisesRegex(UnicodeError,
     235              'unknown Unicode character name',
     236              x.decode, 'unicode-escape'
     237          )
     238  
     239  
# Run every test defined in this module when the file is executed directly.
if __name__ == "__main__":
    unittest.main()