1  import codecs
       2  import html.entities
       3  import itertools
       4  import sys
       5  import unicodedata
       6  import unittest
       7  
       8  
       9  class ESC[4;38;5;81mPosReturn:
      10      # this can be used for configurable callbacks
      11  
      12      def __init__(self):
      13          self.pos = 0
      14  
      15      def handle(self, exc):
      16          oldpos = self.pos
      17          realpos = oldpos
      18          if realpos<0:
      19              realpos = len(exc.object) + realpos
      20          # if we don't advance this time, terminate on the next call
      21          # otherwise we'd get an endless loop
      22          if realpos <= exc.start:
      23              self.pos = len(exc.object)
      24          return ("<?>", oldpos)
      25  
      26  class ESC[4;38;5;81mRepeatedPosReturn:
      27      def __init__(self, repl="<?>"):
      28          self.repl = repl
      29          self.pos = 0
      30          self.count = 0
      31  
      32      def handle(self, exc):
      33          if self.count > 0:
      34              self.count -= 1
      35              return (self.repl, self.pos)
      36          return (self.repl, exc.end)
      37  
      38  # A UnicodeEncodeError object with a bad start attribute
      39  class ESC[4;38;5;81mBadStartUnicodeEncodeError(ESC[4;38;5;149mUnicodeEncodeError):
      40      def __init__(self):
      41          UnicodeEncodeError.__init__(self, "ascii", "", 0, 1, "bad")
      42          self.start = []
      43  
      44  # A UnicodeEncodeError object with a bad object attribute
      45  class ESC[4;38;5;81mBadObjectUnicodeEncodeError(ESC[4;38;5;149mUnicodeEncodeError):
      46      def __init__(self):
      47          UnicodeEncodeError.__init__(self, "ascii", "", 0, 1, "bad")
      48          self.object = []
      49  
      50  # A UnicodeDecodeError object without an end attribute
      51  class ESC[4;38;5;81mNoEndUnicodeDecodeError(ESC[4;38;5;149mUnicodeDecodeError):
      52      def __init__(self):
      53          UnicodeDecodeError.__init__(self, "ascii", bytearray(b""), 0, 1, "bad")
      54          del self.end
      55  
      56  # A UnicodeDecodeError object with a bad object attribute
      57  class ESC[4;38;5;81mBadObjectUnicodeDecodeError(ESC[4;38;5;149mUnicodeDecodeError):
      58      def __init__(self):
      59          UnicodeDecodeError.__init__(self, "ascii", bytearray(b""), 0, 1, "bad")
      60          self.object = []
      61  
      62  # A UnicodeTranslateError object without a start attribute
      63  class ESC[4;38;5;81mNoStartUnicodeTranslateError(ESC[4;38;5;149mUnicodeTranslateError):
      64      def __init__(self):
      65          UnicodeTranslateError.__init__(self, "", 0, 1, "bad")
      66          del self.start
      67  
      68  # A UnicodeTranslateError object without an end attribute
      69  class ESC[4;38;5;81mNoEndUnicodeTranslateError(ESC[4;38;5;149mUnicodeTranslateError):
      70      def __init__(self):
      71          UnicodeTranslateError.__init__(self,  "", 0, 1, "bad")
      72          del self.end
      73  
      74  # A UnicodeTranslateError object without an object attribute
      75  class ESC[4;38;5;81mNoObjectUnicodeTranslateError(ESC[4;38;5;149mUnicodeTranslateError):
      76      def __init__(self):
      77          UnicodeTranslateError.__init__(self, "", 0, 1, "bad")
      78          del self.object
      79  
      80  class ESC[4;38;5;81mCodecCallbackTest(ESC[4;38;5;149munittestESC[4;38;5;149m.ESC[4;38;5;149mTestCase):
      81  
      82      def test_xmlcharrefreplace(self):
      83          # replace unencodable characters which numeric character entities.
      84          # For ascii, latin-1 and charmaps this is completely implemented
      85          # in C and should be reasonably fast.
      86          s = "\u30b9\u30d1\u30e2 \xe4nd eggs"
      87          self.assertEqual(
      88              s.encode("ascii", "xmlcharrefreplace"),
      89              b"&#12473;&#12497;&#12514; &#228;nd eggs"
      90          )
      91          self.assertEqual(
      92              s.encode("latin-1", "xmlcharrefreplace"),
      93              b"&#12473;&#12497;&#12514; \xe4nd eggs"
      94          )
      95  
      96      def test_xmlcharnamereplace(self):
      97          # This time use a named character entity for unencodable
      98          # characters, if one is available.
      99  
     100          def xmlcharnamereplace(exc):
     101              if not isinstance(exc, UnicodeEncodeError):
     102                  raise TypeError("don't know how to handle %r" % exc)
     103              l = []
     104              for c in exc.object[exc.start:exc.end]:
     105                  try:
     106                      l.append("&%s;" % html.entities.codepoint2name[ord(c)])
     107                  except KeyError:
     108                      l.append("&#%d;" % ord(c))
     109              return ("".join(l), exc.end)
     110  
     111          codecs.register_error(
     112              "test.xmlcharnamereplace", xmlcharnamereplace)
     113  
     114          sin = "\xab\u211c\xbb = \u2329\u1234\u20ac\u232a"
     115          sout = b"&laquo;&real;&raquo; = &lang;&#4660;&euro;&rang;"
     116          self.assertEqual(sin.encode("ascii", "test.xmlcharnamereplace"), sout)
     117          sout = b"\xab&real;\xbb = &lang;&#4660;&euro;&rang;"
     118          self.assertEqual(sin.encode("latin-1", "test.xmlcharnamereplace"), sout)
     119          sout = b"\xab&real;\xbb = &lang;&#4660;\xa4&rang;"
     120          self.assertEqual(sin.encode("iso-8859-15", "test.xmlcharnamereplace"), sout)
     121  
     122      def test_uninamereplace(self):
     123          # We're using the names from the unicode database this time,
     124          # and we're doing "syntax highlighting" here, i.e. we include
     125          # the replaced text in ANSI escape sequences. For this it is
     126          # useful that the error handler is not called for every single
     127          # unencodable character, but for a complete sequence of
     128          # unencodable characters, otherwise we would output many
     129          # unnecessary escape sequences.
     130  
     131          def uninamereplace(exc):
     132              if not isinstance(exc, UnicodeEncodeError):
     133                  raise TypeError("don't know how to handle %r" % exc)
     134              l = []
     135              for c in exc.object[exc.start:exc.end]:
     136                  l.append(unicodedata.name(c, "0x%x" % ord(c)))
     137              return ("\033[1m%s\033[0m" % ", ".join(l), exc.end)
     138  
     139          codecs.register_error(
     140              "test.uninamereplace", uninamereplace)
     141  
     142          sin = "\xac\u1234\u20ac\u8000"
     143          sout = b"\033[1mNOT SIGN, ETHIOPIC SYLLABLE SEE, EURO SIGN, CJK UNIFIED IDEOGRAPH-8000\033[0m"
     144          self.assertEqual(sin.encode("ascii", "test.uninamereplace"), sout)
     145  
     146          sout = b"\xac\033[1mETHIOPIC SYLLABLE SEE, EURO SIGN, CJK UNIFIED IDEOGRAPH-8000\033[0m"
     147          self.assertEqual(sin.encode("latin-1", "test.uninamereplace"), sout)
     148  
     149          sout = b"\xac\033[1mETHIOPIC SYLLABLE SEE\033[0m\xa4\033[1mCJK UNIFIED IDEOGRAPH-8000\033[0m"
     150          self.assertEqual(sin.encode("iso-8859-15", "test.uninamereplace"), sout)
     151  
     152      def test_backslashescape(self):
     153          # Does the same as the "unicode-escape" encoding, but with different
     154          # base encodings.
     155          sin = "a\xac\u1234\u20ac\u8000\U0010ffff"
     156          sout = b"a\\xac\\u1234\\u20ac\\u8000\\U0010ffff"
     157          self.assertEqual(sin.encode("ascii", "backslashreplace"), sout)
     158  
     159          sout = b"a\xac\\u1234\\u20ac\\u8000\\U0010ffff"
     160          self.assertEqual(sin.encode("latin-1", "backslashreplace"), sout)
     161  
     162          sout = b"a\xac\\u1234\xa4\\u8000\\U0010ffff"
     163          self.assertEqual(sin.encode("iso-8859-15", "backslashreplace"), sout)
     164  
     165      def test_nameescape(self):
     166          # Does the same as backslashescape, but prefers ``\N{...}`` escape
     167          # sequences.
     168          sin = "a\xac\u1234\u20ac\u8000\U0010ffff"
     169          sout = (b'a\\N{NOT SIGN}\\N{ETHIOPIC SYLLABLE SEE}\\N{EURO SIGN}'
     170                  b'\\N{CJK UNIFIED IDEOGRAPH-8000}\\U0010ffff')
     171          self.assertEqual(sin.encode("ascii", "namereplace"), sout)
     172  
     173          sout = (b'a\xac\\N{ETHIOPIC SYLLABLE SEE}\\N{EURO SIGN}'
     174                  b'\\N{CJK UNIFIED IDEOGRAPH-8000}\\U0010ffff')
     175          self.assertEqual(sin.encode("latin-1", "namereplace"), sout)
     176  
     177          sout = (b'a\xac\\N{ETHIOPIC SYLLABLE SEE}\xa4'
     178                  b'\\N{CJK UNIFIED IDEOGRAPH-8000}\\U0010ffff')
     179          self.assertEqual(sin.encode("iso-8859-15", "namereplace"), sout)
     180  
     181      def test_decoding_callbacks(self):
     182          # This is a test for a decoding callback handler
     183          # that allows the decoding of the invalid sequence
     184          # "\xc0\x80" and returns "\x00" instead of raising an error.
     185          # All other illegal sequences will be handled strictly.
     186          def relaxedutf8(exc):
     187              if not isinstance(exc, UnicodeDecodeError):
     188                  raise TypeError("don't know how to handle %r" % exc)
     189              if exc.object[exc.start:exc.start+2] == b"\xc0\x80":
     190                  return ("\x00", exc.start+2) # retry after two bytes
     191              else:
     192                  raise exc
     193  
     194          codecs.register_error("test.relaxedutf8", relaxedutf8)
     195  
     196          # all the "\xc0\x80" will be decoded to "\x00"
     197          sin = b"a\x00b\xc0\x80c\xc3\xbc\xc0\x80\xc0\x80"
     198          sout = "a\x00b\x00c\xfc\x00\x00"
     199          self.assertEqual(sin.decode("utf-8", "test.relaxedutf8"), sout)
     200  
     201          # "\xc0\x81" is not valid and a UnicodeDecodeError will be raised
     202          sin = b"\xc0\x80\xc0\x81"
     203          self.assertRaises(UnicodeDecodeError, sin.decode,
     204                            "utf-8", "test.relaxedutf8")
     205  
     206      def test_charmapencode(self):
     207          # For charmap encodings the replacement string will be
     208          # mapped through the encoding again. This means, that
     209          # to be able to use e.g. the "replace" handler, the
     210          # charmap has to have a mapping for "?".
     211          charmap = dict((ord(c), bytes(2*c.upper(), 'ascii')) for c in "abcdefgh")
     212          sin = "abc"
     213          sout = b"AABBCC"
     214          self.assertEqual(codecs.charmap_encode(sin, "strict", charmap)[0], sout)
     215  
     216          sin = "abcA"
     217          self.assertRaises(UnicodeError, codecs.charmap_encode, sin, "strict", charmap)
     218  
     219          charmap[ord("?")] = b"XYZ"
     220          sin = "abcDEF"
     221          sout = b"AABBCCXYZXYZXYZ"
     222          self.assertEqual(codecs.charmap_encode(sin, "replace", charmap)[0], sout)
     223  
     224          charmap[ord("?")] = "XYZ" # wrong type in mapping
     225          self.assertRaises(TypeError, codecs.charmap_encode, sin, "replace", charmap)
     226  
     227      def test_callbacks(self):
     228          def handler1(exc):
     229              r = range(exc.start, exc.end)
     230              if isinstance(exc, UnicodeEncodeError):
     231                  l = ["<%d>" % ord(exc.object[pos]) for pos in r]
     232              elif isinstance(exc, UnicodeDecodeError):
     233                  l = ["<%d>" % exc.object[pos] for pos in r]
     234              else:
     235                  raise TypeError("don't know how to handle %r" % exc)
     236              return ("[%s]" % "".join(l), exc.end)
     237  
     238          codecs.register_error("test.handler1", handler1)
     239  
     240          def handler2(exc):
     241              if not isinstance(exc, UnicodeDecodeError):
     242                  raise TypeError("don't know how to handle %r" % exc)
     243              l = ["<%d>" % exc.object[pos] for pos in range(exc.start, exc.end)]
     244              return ("[%s]" % "".join(l), exc.end+1) # skip one character
     245  
     246          codecs.register_error("test.handler2", handler2)
     247  
     248          s = b"\x00\x81\x7f\x80\xff"
     249  
     250          self.assertEqual(
     251              s.decode("ascii", "test.handler1"),
     252              "\x00[<129>]\x7f[<128>][<255>]"
     253          )
     254          self.assertEqual(
     255              s.decode("ascii", "test.handler2"),
     256              "\x00[<129>][<128>]"
     257          )
     258  
     259          self.assertEqual(
     260              b"\\u3042\\u3xxx".decode("unicode-escape", "test.handler1"),
     261              "\u3042[<92><117><51>]xxx"
     262          )
     263  
     264          self.assertEqual(
     265              b"\\u3042\\u3xx".decode("unicode-escape", "test.handler1"),
     266              "\u3042[<92><117><51>]xx"
     267          )
     268  
     269          self.assertEqual(
     270              codecs.charmap_decode(b"abc", "test.handler1", {ord("a"): "z"})[0],
     271              "z[<98>][<99>]"
     272          )
     273  
     274          self.assertEqual(
     275              "g\xfc\xdfrk".encode("ascii", "test.handler1"),
     276              b"g[<252><223>]rk"
     277          )
     278  
     279          self.assertEqual(
     280              "g\xfc\xdf".encode("ascii", "test.handler1"),
     281              b"g[<252><223>]"
     282          )
     283  
     284      def test_longstrings(self):
     285          # test long strings to check for memory overflow problems
     286          errors = [ "strict", "ignore", "replace", "xmlcharrefreplace",
     287                     "backslashreplace", "namereplace"]
     288          # register the handlers under different names,
     289          # to prevent the codec from recognizing the name
     290          for err in errors:
     291              codecs.register_error("test." + err, codecs.lookup_error(err))
     292          l = 1000
     293          errors += [ "test." + err for err in errors ]
     294          for uni in [ s*l for s in ("x", "\u3042", "a\xe4") ]:
     295              for enc in ("ascii", "latin-1", "iso-8859-1", "iso-8859-15",
     296                          "utf-8", "utf-7", "utf-16", "utf-32"):
     297                  for err in errors:
     298                      try:
     299                          uni.encode(enc, err)
     300                      except UnicodeError:
     301                          pass
     302  
     303      def check_exceptionobjectargs(self, exctype, args, msg):
     304          # Test UnicodeError subclasses: construction, attribute assignment and __str__ conversion
     305          # check with one missing argument
     306          self.assertRaises(TypeError, exctype, *args[:-1])
     307          # check with one argument too much
     308          self.assertRaises(TypeError, exctype, *(args + ["too much"]))
     309          # check with one argument of the wrong type
     310          wrongargs = [ "spam", b"eggs", b"spam", 42, 1.0, None ]
     311          for i in range(len(args)):
     312              for wrongarg in wrongargs:
     313                  if type(wrongarg) is type(args[i]):
     314                      continue
     315                  # build argument array
     316                  callargs = []
     317                  for j in range(len(args)):
     318                      if i==j:
     319                          callargs.append(wrongarg)
     320                      else:
     321                          callargs.append(args[i])
     322                  self.assertRaises(TypeError, exctype, *callargs)
     323  
     324          # check with the correct number and type of arguments
     325          exc = exctype(*args)
     326          self.assertEqual(str(exc), msg)
     327  
     328      def test_unicodeencodeerror(self):
     329          self.check_exceptionobjectargs(
     330              UnicodeEncodeError,
     331              ["ascii", "g\xfcrk", 1, 2, "ouch"],
     332              "'ascii' codec can't encode character '\\xfc' in position 1: ouch"
     333          )
     334          self.check_exceptionobjectargs(
     335              UnicodeEncodeError,
     336              ["ascii", "g\xfcrk", 1, 4, "ouch"],
     337              "'ascii' codec can't encode characters in position 1-3: ouch"
     338          )
     339          self.check_exceptionobjectargs(
     340              UnicodeEncodeError,
     341              ["ascii", "\xfcx", 0, 1, "ouch"],
     342              "'ascii' codec can't encode character '\\xfc' in position 0: ouch"
     343          )
     344          self.check_exceptionobjectargs(
     345              UnicodeEncodeError,
     346              ["ascii", "\u0100x", 0, 1, "ouch"],
     347              "'ascii' codec can't encode character '\\u0100' in position 0: ouch"
     348          )
     349          self.check_exceptionobjectargs(
     350              UnicodeEncodeError,
     351              ["ascii", "\uffffx", 0, 1, "ouch"],
     352              "'ascii' codec can't encode character '\\uffff' in position 0: ouch"
     353          )
     354          self.check_exceptionobjectargs(
     355              UnicodeEncodeError,
     356              ["ascii", "\U00010000x", 0, 1, "ouch"],
     357              "'ascii' codec can't encode character '\\U00010000' in position 0: ouch"
     358          )
     359  
     360      def test_unicodedecodeerror(self):
     361          self.check_exceptionobjectargs(
     362              UnicodeDecodeError,
     363              ["ascii", bytearray(b"g\xfcrk"), 1, 2, "ouch"],
     364              "'ascii' codec can't decode byte 0xfc in position 1: ouch"
     365          )
     366          self.check_exceptionobjectargs(
     367              UnicodeDecodeError,
     368              ["ascii", bytearray(b"g\xfcrk"), 1, 3, "ouch"],
     369              "'ascii' codec can't decode bytes in position 1-2: ouch"
     370          )
     371  
     372      def test_unicodetranslateerror(self):
     373          self.check_exceptionobjectargs(
     374              UnicodeTranslateError,
     375              ["g\xfcrk", 1, 2, "ouch"],
     376              "can't translate character '\\xfc' in position 1: ouch"
     377          )
     378          self.check_exceptionobjectargs(
     379              UnicodeTranslateError,
     380              ["g\u0100rk", 1, 2, "ouch"],
     381              "can't translate character '\\u0100' in position 1: ouch"
     382          )
     383          self.check_exceptionobjectargs(
     384              UnicodeTranslateError,
     385              ["g\uffffrk", 1, 2, "ouch"],
     386              "can't translate character '\\uffff' in position 1: ouch"
     387          )
     388          self.check_exceptionobjectargs(
     389              UnicodeTranslateError,
     390              ["g\U00010000rk", 1, 2, "ouch"],
     391              "can't translate character '\\U00010000' in position 1: ouch"
     392          )
     393          self.check_exceptionobjectargs(
     394              UnicodeTranslateError,
     395              ["g\xfcrk", 1, 3, "ouch"],
     396              "can't translate characters in position 1-2: ouch"
     397          )
     398  
     399      def test_badandgoodstrictexceptions(self):
     400          # "strict" complains about a non-exception passed in
     401          self.assertRaises(
     402              TypeError,
     403              codecs.strict_errors,
     404              42
     405          )
     406          # "strict" complains about the wrong exception type
     407          self.assertRaises(
     408              Exception,
     409              codecs.strict_errors,
     410              Exception("ouch")
     411          )
     412  
     413          # If the correct exception is passed in, "strict" raises it
     414          self.assertRaises(
     415              UnicodeEncodeError,
     416              codecs.strict_errors,
     417              UnicodeEncodeError("ascii", "\u3042", 0, 1, "ouch")
     418          )
     419          self.assertRaises(
     420              UnicodeDecodeError,
     421              codecs.strict_errors,
     422              UnicodeDecodeError("ascii", bytearray(b"\xff"), 0, 1, "ouch")
     423          )
     424          self.assertRaises(
     425              UnicodeTranslateError,
     426              codecs.strict_errors,
     427              UnicodeTranslateError("\u3042", 0, 1, "ouch")
     428          )
     429  
     430      def test_badandgoodignoreexceptions(self):
     431          # "ignore" complains about a non-exception passed in
     432          self.assertRaises(
     433             TypeError,
     434             codecs.ignore_errors,
     435             42
     436          )
     437          # "ignore" complains about the wrong exception type
     438          self.assertRaises(
     439             TypeError,
     440             codecs.ignore_errors,
     441             UnicodeError("ouch")
     442          )
     443          # If the correct exception is passed in, "ignore" returns an empty replacement
     444          self.assertEqual(
     445              codecs.ignore_errors(
     446                  UnicodeEncodeError("ascii", "a\u3042b", 1, 2, "ouch")),
     447              ("", 2)
     448          )
     449          self.assertEqual(
     450              codecs.ignore_errors(
     451                  UnicodeDecodeError("ascii", bytearray(b"a\xffb"), 1, 2, "ouch")),
     452              ("", 2)
     453          )
     454          self.assertEqual(
     455              codecs.ignore_errors(
     456                  UnicodeTranslateError("a\u3042b", 1, 2, "ouch")),
     457              ("", 2)
     458          )
     459  
     460      def test_badandgoodreplaceexceptions(self):
     461          # "replace" complains about a non-exception passed in
     462          self.assertRaises(
     463             TypeError,
     464             codecs.replace_errors,
     465             42
     466          )
     467          # "replace" complains about the wrong exception type
     468          self.assertRaises(
     469             TypeError,
     470             codecs.replace_errors,
     471             UnicodeError("ouch")
     472          )
     473          self.assertRaises(
     474              TypeError,
     475              codecs.replace_errors,
     476              BadObjectUnicodeEncodeError()
     477          )
     478          self.assertRaises(
     479              TypeError,
     480              codecs.replace_errors,
     481              BadObjectUnicodeDecodeError()
     482          )
     483          # With the correct exception, "replace" returns an "?" or "\ufffd" replacement
     484          self.assertEqual(
     485              codecs.replace_errors(
     486                  UnicodeEncodeError("ascii", "a\u3042b", 1, 2, "ouch")),
     487              ("?", 2)
     488          )
     489          self.assertEqual(
     490              codecs.replace_errors(
     491                  UnicodeDecodeError("ascii", bytearray(b"a\xffb"), 1, 2, "ouch")),
     492              ("\ufffd", 2)
     493          )
     494          self.assertEqual(
     495              codecs.replace_errors(
     496                  UnicodeTranslateError("a\u3042b", 1, 2, "ouch")),
     497              ("\ufffd", 2)
     498          )
     499  
     500      def test_badandgoodxmlcharrefreplaceexceptions(self):
     501          # "xmlcharrefreplace" complains about a non-exception passed in
     502          self.assertRaises(
     503             TypeError,
     504             codecs.xmlcharrefreplace_errors,
     505             42
     506          )
     507          # "xmlcharrefreplace" complains about the wrong exception types
     508          self.assertRaises(
     509             TypeError,
     510             codecs.xmlcharrefreplace_errors,
     511             UnicodeError("ouch")
     512          )
     513          # "xmlcharrefreplace" can only be used for encoding
     514          self.assertRaises(
     515              TypeError,
     516              codecs.xmlcharrefreplace_errors,
     517              UnicodeDecodeError("ascii", bytearray(b"\xff"), 0, 1, "ouch")
     518          )
     519          self.assertRaises(
     520              TypeError,
     521              codecs.xmlcharrefreplace_errors,
     522              UnicodeTranslateError("\u3042", 0, 1, "ouch")
     523          )
     524          # Use the correct exception
     525          cs = (0, 1, 9, 10, 99, 100, 999, 1000, 9999, 10000, 99999, 100000,
     526                999999, 1000000)
     527          cs += (0xd800, 0xdfff)
     528          s = "".join(chr(c) for c in cs)
     529          self.assertEqual(
     530              codecs.xmlcharrefreplace_errors(
     531                  UnicodeEncodeError("ascii", "a" + s + "b",
     532                                     1, 1 + len(s), "ouch")
     533              ),
     534              ("".join("&#%d;" % c for c in cs), 1 + len(s))
     535          )
     536  
     537      def test_badandgoodbackslashreplaceexceptions(self):
     538          # "backslashreplace" complains about a non-exception passed in
     539          self.assertRaises(
     540             TypeError,
     541             codecs.backslashreplace_errors,
     542             42
     543          )
     544          # "backslashreplace" complains about the wrong exception types
     545          self.assertRaises(
     546             TypeError,
     547             codecs.backslashreplace_errors,
     548             UnicodeError("ouch")
     549          )
     550          # Use the correct exception
     551          tests = [
     552              ("\u3042", "\\u3042"),
     553              ("\n", "\\x0a"),
     554              ("a", "\\x61"),
     555              ("\x00", "\\x00"),
     556              ("\xff", "\\xff"),
     557              ("\u0100", "\\u0100"),
     558              ("\uffff", "\\uffff"),
     559              ("\U00010000", "\\U00010000"),
     560              ("\U0010ffff", "\\U0010ffff"),
     561              # Lone surrogates
     562              ("\ud800", "\\ud800"),
     563              ("\udfff", "\\udfff"),
     564              ("\ud800\udfff", "\\ud800\\udfff"),
     565          ]
     566          for s, r in tests:
     567              with self.subTest(str=s):
     568                  self.assertEqual(
     569                      codecs.backslashreplace_errors(
     570                          UnicodeEncodeError("ascii", "a" + s + "b",
     571                                             1, 1 + len(s), "ouch")),
     572                      (r, 1 + len(s))
     573                  )
     574                  self.assertEqual(
     575                      codecs.backslashreplace_errors(
     576                          UnicodeTranslateError("a" + s + "b",
     577                                                1, 1 + len(s), "ouch")),
     578                      (r, 1 + len(s))
     579                  )
     580          tests = [
     581              (b"a", "\\x61"),
     582              (b"\n", "\\x0a"),
     583              (b"\x00", "\\x00"),
     584              (b"\xff", "\\xff"),
     585          ]
     586          for b, r in tests:
     587              with self.subTest(bytes=b):
     588                  self.assertEqual(
     589                      codecs.backslashreplace_errors(
     590                          UnicodeDecodeError("ascii", bytearray(b"a" + b + b"b"),
     591                                             1, 2, "ouch")),
     592                      (r, 2)
     593                  )
     594  
     595      def test_badandgoodnamereplaceexceptions(self):
     596          # "namereplace" complains about a non-exception passed in
     597          self.assertRaises(
     598             TypeError,
     599             codecs.namereplace_errors,
     600             42
     601          )
     602          # "namereplace" complains about the wrong exception types
     603          self.assertRaises(
     604             TypeError,
     605             codecs.namereplace_errors,
     606             UnicodeError("ouch")
     607          )
     608          # "namereplace" can only be used for encoding
     609          self.assertRaises(
     610              TypeError,
     611              codecs.namereplace_errors,
     612              UnicodeDecodeError("ascii", bytearray(b"\xff"), 0, 1, "ouch")
     613          )
     614          self.assertRaises(
     615              TypeError,
     616              codecs.namereplace_errors,
     617              UnicodeTranslateError("\u3042", 0, 1, "ouch")
     618          )
     619          # Use the correct exception
     620          tests = [
     621              ("\u3042", "\\N{HIRAGANA LETTER A}"),
     622              ("\x00", "\\x00"),
     623              ("\ufbf9", "\\N{ARABIC LIGATURE UIGHUR KIRGHIZ YEH WITH "
     624                         "HAMZA ABOVE WITH ALEF MAKSURA ISOLATED FORM}"),
     625              ("\U000e007f", "\\N{CANCEL TAG}"),
     626              ("\U0010ffff", "\\U0010ffff"),
     627              # Lone surrogates
     628              ("\ud800", "\\ud800"),
     629              ("\udfff", "\\udfff"),
     630              ("\ud800\udfff", "\\ud800\\udfff"),
     631          ]
     632          for s, r in tests:
     633              with self.subTest(str=s):
     634                  self.assertEqual(
     635                      codecs.namereplace_errors(
     636                          UnicodeEncodeError("ascii", "a" + s + "b",
     637                                             1, 1 + len(s), "ouch")),
     638                      (r, 1 + len(s))
     639                  )
     640  
     641      def test_badandgoodsurrogateescapeexceptions(self):
     642          surrogateescape_errors = codecs.lookup_error('surrogateescape')
     643          # "surrogateescape" complains about a non-exception passed in
     644          self.assertRaises(
     645             TypeError,
     646             surrogateescape_errors,
     647             42
     648          )
     649          # "surrogateescape" complains about the wrong exception types
     650          self.assertRaises(
     651             TypeError,
     652             surrogateescape_errors,
     653             UnicodeError("ouch")
     654          )
     655          # "surrogateescape" can not be used for translating
     656          self.assertRaises(
     657              TypeError,
     658              surrogateescape_errors,
     659              UnicodeTranslateError("\udc80", 0, 1, "ouch")
     660          )
     661          # Use the correct exception
     662          for s in ("a", "\udc7f", "\udd00"):
     663              with self.subTest(str=s):
     664                  self.assertRaises(
     665                      UnicodeEncodeError,
     666                      surrogateescape_errors,
     667                      UnicodeEncodeError("ascii", s, 0, 1, "ouch")
     668                  )
     669          self.assertEqual(
     670              surrogateescape_errors(
     671                  UnicodeEncodeError("ascii", "a\udc80b", 1, 2, "ouch")),
     672              (b"\x80", 2)
     673          )
     674          self.assertRaises(
     675              UnicodeDecodeError,
     676              surrogateescape_errors,
     677              UnicodeDecodeError("ascii", bytearray(b"a"), 0, 1, "ouch")
     678          )
     679          self.assertEqual(
     680              surrogateescape_errors(
     681                  UnicodeDecodeError("ascii", bytearray(b"a\x80b"), 1, 2, "ouch")),
     682              ("\udc80", 2)
     683          )
     684  
     685      def test_badandgoodsurrogatepassexceptions(self):
     686          surrogatepass_errors = codecs.lookup_error('surrogatepass')
     687          # "surrogatepass" complains about a non-exception passed in
     688          self.assertRaises(
     689             TypeError,
     690             surrogatepass_errors,
     691             42
     692          )
     693          # "surrogatepass" complains about the wrong exception types
     694          self.assertRaises(
     695             TypeError,
     696             surrogatepass_errors,
     697             UnicodeError("ouch")
     698          )
     699          # "surrogatepass" can not be used for translating
     700          self.assertRaises(
     701              TypeError,
     702              surrogatepass_errors,
     703              UnicodeTranslateError("\ud800", 0, 1, "ouch")
     704          )
     705          # Use the correct exception
     706          for enc in ("utf-8", "utf-16le", "utf-16be", "utf-32le", "utf-32be"):
     707              with self.subTest(encoding=enc):
     708                  self.assertRaises(
     709                      UnicodeEncodeError,
     710                      surrogatepass_errors,
     711                      UnicodeEncodeError(enc, "a", 0, 1, "ouch")
     712                  )
     713                  self.assertRaises(
     714                      UnicodeDecodeError,
     715                      surrogatepass_errors,
     716                      UnicodeDecodeError(enc, "a".encode(enc), 0, 1, "ouch")
     717                  )
     718          for s in ("\ud800", "\udfff", "\ud800\udfff"):
     719              with self.subTest(str=s):
     720                  self.assertRaises(
     721                      UnicodeEncodeError,
     722                      surrogatepass_errors,
     723                      UnicodeEncodeError("ascii", s, 0, len(s), "ouch")
     724                  )
     725          tests = [
     726              ("utf-8", "\ud800", b'\xed\xa0\x80', 3),
     727              ("utf-16le", "\ud800", b'\x00\xd8', 2),
     728              ("utf-16be", "\ud800", b'\xd8\x00', 2),
     729              ("utf-32le", "\ud800", b'\x00\xd8\x00\x00', 4),
     730              ("utf-32be", "\ud800", b'\x00\x00\xd8\x00', 4),
     731              ("utf-8", "\udfff", b'\xed\xbf\xbf', 3),
     732              ("utf-16le", "\udfff", b'\xff\xdf', 2),
     733              ("utf-16be", "\udfff", b'\xdf\xff', 2),
     734              ("utf-32le", "\udfff", b'\xff\xdf\x00\x00', 4),
     735              ("utf-32be", "\udfff", b'\x00\x00\xdf\xff', 4),
     736              ("utf-8", "\ud800\udfff", b'\xed\xa0\x80\xed\xbf\xbf', 3),
     737              ("utf-16le", "\ud800\udfff", b'\x00\xd8\xff\xdf', 2),
     738              ("utf-16be", "\ud800\udfff", b'\xd8\x00\xdf\xff', 2),
     739              ("utf-32le", "\ud800\udfff", b'\x00\xd8\x00\x00\xff\xdf\x00\x00', 4),
     740              ("utf-32be", "\ud800\udfff", b'\x00\x00\xd8\x00\x00\x00\xdf\xff', 4),
     741          ]
     742          for enc, s, b, n in tests:
     743              with self.subTest(encoding=enc, str=s, bytes=b):
     744                  self.assertEqual(
     745                      surrogatepass_errors(
     746                          UnicodeEncodeError(enc, "a" + s + "b",
     747                                             1, 1 + len(s), "ouch")),
     748                      (b, 1 + len(s))
     749                  )
     750                  self.assertEqual(
     751                      surrogatepass_errors(
     752                          UnicodeDecodeError(enc, bytearray(b"a" + b[:n] + b"b"),
     753                                             1, 1 + n, "ouch")),
     754                      (s[:1], 1 + n)
     755                  )
     756  
     757      def test_badhandlerresults(self):
     758          results = ( 42, "foo", (1,2,3), ("foo", 1, 3), ("foo", None), ("foo",), ("foo", 1, 3), ("foo", None), ("foo",) )
     759          encs = ("ascii", "latin-1", "iso-8859-1", "iso-8859-15")
     760  
     761          for res in results:
     762              codecs.register_error("test.badhandler", lambda x: res)
     763              for enc in encs:
     764                  self.assertRaises(
     765                      TypeError,
     766                      "\u3042".encode,
     767                      enc,
     768                      "test.badhandler"
     769                  )
     770              for (enc, bytes) in (
     771                  ("ascii", b"\xff"),
     772                  ("utf-8", b"\xff"),
     773                  ("utf-7", b"+x-"),
     774              ):
     775                  self.assertRaises(
     776                      TypeError,
     777                      bytes.decode,
     778                      enc,
     779                      "test.badhandler"
     780                  )
     781  
     782      def test_lookup(self):
     783          self.assertEqual(codecs.strict_errors, codecs.lookup_error("strict"))
     784          self.assertEqual(codecs.ignore_errors, codecs.lookup_error("ignore"))
     785          self.assertEqual(codecs.strict_errors, codecs.lookup_error("strict"))
     786          self.assertEqual(
     787              codecs.xmlcharrefreplace_errors,
     788              codecs.lookup_error("xmlcharrefreplace")
     789          )
     790          self.assertEqual(
     791              codecs.backslashreplace_errors,
     792              codecs.lookup_error("backslashreplace")
     793          )
     794          self.assertEqual(
     795              codecs.namereplace_errors,
     796              codecs.lookup_error("namereplace")
     797          )
     798  
     799      def test_encode_nonascii_replacement(self):
     800          def handle(exc):
     801              if isinstance(exc, UnicodeEncodeError):
     802                  return (repl, exc.end)
     803              raise TypeError("don't know how to handle %r" % exc)
     804          codecs.register_error("test.replacing", handle)
     805  
     806          for enc, input, repl in (
     807                  ("ascii", "[¤]", "abc"),
     808                  ("iso-8859-1", "[€]", "½¾"),
     809                  ("iso-8859-15", "[¤]", "œŸ"),
     810          ):
     811              res = input.encode(enc, "test.replacing")
     812              self.assertEqual(res, ("[" + repl + "]").encode(enc))
     813  
     814          for enc, input, repl in (
     815                  ("utf-8", "[\udc80]", "\U0001f40d"),
     816                  ("utf-16", "[\udc80]", "\U0001f40d"),
     817                  ("utf-32", "[\udc80]", "\U0001f40d"),
     818          ):
     819              with self.subTest(encoding=enc):
     820                  with self.assertRaises(UnicodeEncodeError) as cm:
     821                      input.encode(enc, "test.replacing")
     822                  exc = cm.exception
     823                  self.assertEqual(exc.start, 1)
     824                  self.assertEqual(exc.end, 2)
     825                  self.assertEqual(exc.object, input)
     826  
     827      def test_encode_unencodable_replacement(self):
     828          def unencrepl(exc):
     829              if isinstance(exc, UnicodeEncodeError):
     830                  return (repl, exc.end)
     831              else:
     832                  raise TypeError("don't know how to handle %r" % exc)
     833          codecs.register_error("test.unencreplhandler", unencrepl)
     834  
     835          for enc, input, repl in (
     836                  ("ascii", "[¤]", "½"),
     837                  ("iso-8859-1", "[€]", "œ"),
     838                  ("iso-8859-15", "[¤]", "½"),
     839                  ("utf-8", "[\udc80]", "\udcff"),
     840                  ("utf-16", "[\udc80]", "\udcff"),
     841                  ("utf-32", "[\udc80]", "\udcff"),
     842          ):
     843              with self.subTest(encoding=enc):
     844                  with self.assertRaises(UnicodeEncodeError) as cm:
     845                      input.encode(enc, "test.unencreplhandler")
     846                  exc = cm.exception
     847                  self.assertEqual(exc.start, 1)
     848                  self.assertEqual(exc.end, 2)
     849                  self.assertEqual(exc.object, input)
     850  
     851      def test_encode_bytes_replacement(self):
     852          def handle(exc):
     853              if isinstance(exc, UnicodeEncodeError):
     854                  return (repl, exc.end)
     855              raise TypeError("don't know how to handle %r" % exc)
     856          codecs.register_error("test.replacing", handle)
     857  
     858          # It works even if the bytes sequence is not decodable.
     859          for enc, input, repl in (
     860                  ("ascii", "[¤]", b"\xbd\xbe"),
     861                  ("iso-8859-1", "[€]", b"\xbd\xbe"),
     862                  ("iso-8859-15", "[¤]", b"\xbd\xbe"),
     863                  ("utf-8", "[\udc80]", b"\xbd\xbe"),
     864                  ("utf-16le", "[\udc80]", b"\xbd\xbe"),
     865                  ("utf-16be", "[\udc80]", b"\xbd\xbe"),
     866                  ("utf-32le", "[\udc80]", b"\xbc\xbd\xbe\xbf"),
     867                  ("utf-32be", "[\udc80]", b"\xbc\xbd\xbe\xbf"),
     868          ):
     869              with self.subTest(encoding=enc):
     870                  res = input.encode(enc, "test.replacing")
     871                  self.assertEqual(res, "[".encode(enc) + repl + "]".encode(enc))
     872  
     873      def test_encode_odd_bytes_replacement(self):
     874          def handle(exc):
     875              if isinstance(exc, UnicodeEncodeError):
     876                  return (repl, exc.end)
     877              raise TypeError("don't know how to handle %r" % exc)
     878          codecs.register_error("test.replacing", handle)
     879  
     880          input = "[\udc80]"
     881          # Tests in which the replacement bytestring contains not whole number
     882          # of code units.
     883          for enc, repl in (
     884              *itertools.product(("utf-16le", "utf-16be"),
     885                                 [b"a", b"abc"]),
     886              *itertools.product(("utf-32le", "utf-32be"),
     887                                 [b"a", b"ab", b"abc", b"abcde"]),
     888          ):
     889              with self.subTest(encoding=enc, repl=repl):
     890                  with self.assertRaises(UnicodeEncodeError) as cm:
     891                      input.encode(enc, "test.replacing")
     892                  exc = cm.exception
     893                  self.assertEqual(exc.start, 1)
     894                  self.assertEqual(exc.end, 2)
     895                  self.assertEqual(exc.object, input)
     896                  self.assertEqual(exc.reason, "surrogates not allowed")
     897  
     898      def test_badregistercall(self):
     899          # enhance coverage of:
     900          # Modules/_codecsmodule.c::register_error()
     901          # Python/codecs.c::PyCodec_RegisterError()
     902          self.assertRaises(TypeError, codecs.register_error, 42)
     903          self.assertRaises(TypeError, codecs.register_error, "test.dummy", 42)
     904  
     905      def test_badlookupcall(self):
     906          # enhance coverage of:
     907          # Modules/_codecsmodule.c::lookup_error()
     908          self.assertRaises(TypeError, codecs.lookup_error)
     909  
     910      def test_unknownhandler(self):
     911          # enhance coverage of:
     912          # Modules/_codecsmodule.c::lookup_error()
     913          self.assertRaises(LookupError, codecs.lookup_error, "test.unknown")
     914  
     915      def test_xmlcharrefvalues(self):
     916          # enhance coverage of:
     917          # Python/codecs.c::PyCodec_XMLCharRefReplaceErrors()
     918          # and inline implementations
     919          v = (1, 5, 10, 50, 100, 500, 1000, 5000, 10000, 50000, 100000,
     920               500000, 1000000)
     921          s = "".join([chr(x) for x in v])
     922          codecs.register_error("test.xmlcharrefreplace", codecs.xmlcharrefreplace_errors)
     923          for enc in ("ascii", "iso-8859-15"):
     924              for err in ("xmlcharrefreplace", "test.xmlcharrefreplace"):
     925                  s.encode(enc, err)
     926  
     927      def test_decodehelper(self):
     928          # enhance coverage of:
     929          # Objects/unicodeobject.c::unicode_decode_call_errorhandler()
     930          # and callers
     931          self.assertRaises(LookupError, b"\xff".decode, "ascii", "test.unknown")
     932  
     933          def baddecodereturn1(exc):
     934              return 42
     935          codecs.register_error("test.baddecodereturn1", baddecodereturn1)
     936          self.assertRaises(TypeError, b"\xff".decode, "ascii", "test.baddecodereturn1")
     937          self.assertRaises(TypeError, b"\\".decode, "unicode-escape", "test.baddecodereturn1")
     938          self.assertRaises(TypeError, b"\\x0".decode, "unicode-escape", "test.baddecodereturn1")
     939          self.assertRaises(TypeError, b"\\x0y".decode, "unicode-escape", "test.baddecodereturn1")
     940          self.assertRaises(TypeError, b"\\Uffffeeee".decode, "unicode-escape", "test.baddecodereturn1")
     941          self.assertRaises(TypeError, b"\\uyyyy".decode, "raw-unicode-escape", "test.baddecodereturn1")
     942  
     943          def baddecodereturn2(exc):
     944              return ("?", None)
     945          codecs.register_error("test.baddecodereturn2", baddecodereturn2)
     946          self.assertRaises(TypeError, b"\xff".decode, "ascii", "test.baddecodereturn2")
     947  
     948          handler = PosReturn()
     949          codecs.register_error("test.posreturn", handler.handle)
     950  
     951          # Valid negative position
     952          handler.pos = -1
     953          self.assertEqual(b"\xff0".decode("ascii", "test.posreturn"), "<?>0")
     954  
     955          # Valid negative position
     956          handler.pos = -2
     957          self.assertEqual(b"\xff0".decode("ascii", "test.posreturn"), "<?><?>")
     958  
     959          # Negative position out of bounds
     960          handler.pos = -3
     961          self.assertRaises(IndexError, b"\xff0".decode, "ascii", "test.posreturn")
     962  
     963          # Valid positive position
     964          handler.pos = 1
     965          self.assertEqual(b"\xff0".decode("ascii", "test.posreturn"), "<?>0")
     966  
     967          # Largest valid positive position (one beyond end of input)
     968          handler.pos = 2
     969          self.assertEqual(b"\xff0".decode("ascii", "test.posreturn"), "<?>")
     970  
     971          # Invalid positive position
     972          handler.pos = 3
     973          self.assertRaises(IndexError, b"\xff0".decode, "ascii", "test.posreturn")
     974  
     975          # Restart at the "0"
     976          handler.pos = 6
     977          self.assertEqual(b"\\uyyyy0".decode("raw-unicode-escape", "test.posreturn"), "<?>0")
     978  
     979          class ESC[4;38;5;81mD(ESC[4;38;5;149mdict):
     980              def __getitem__(self, key):
     981                  raise ValueError
     982          self.assertRaises(UnicodeError, codecs.charmap_decode, b"\xff", "strict", {0xff: None})
     983          self.assertRaises(ValueError, codecs.charmap_decode, b"\xff", "strict", D())
     984          self.assertRaises(TypeError, codecs.charmap_decode, b"\xff", "strict", {0xff: sys.maxunicode+1})
     985  
     986      def test_encodehelper(self):
     987          # enhance coverage of:
     988          # Objects/unicodeobject.c::unicode_encode_call_errorhandler()
     989          # and callers
     990          self.assertRaises(LookupError, "\xff".encode, "ascii", "test.unknown")
     991  
     992          def badencodereturn1(exc):
     993              return 42
     994          codecs.register_error("test.badencodereturn1", badencodereturn1)
     995          self.assertRaises(TypeError, "\xff".encode, "ascii", "test.badencodereturn1")
     996  
     997          def badencodereturn2(exc):
     998              return ("?", None)
     999          codecs.register_error("test.badencodereturn2", badencodereturn2)
    1000          self.assertRaises(TypeError, "\xff".encode, "ascii", "test.badencodereturn2")
    1001  
    1002          handler = PosReturn()
    1003          codecs.register_error("test.posreturn", handler.handle)
    1004  
    1005          # Valid negative position
    1006          handler.pos = -1
    1007          self.assertEqual("\xff0".encode("ascii", "test.posreturn"), b"<?>0")
    1008  
    1009          # Valid negative position
    1010          handler.pos = -2
    1011          self.assertEqual("\xff0".encode("ascii", "test.posreturn"), b"<?><?>")
    1012  
    1013          # Negative position out of bounds
    1014          handler.pos = -3
    1015          self.assertRaises(IndexError, "\xff0".encode, "ascii", "test.posreturn")
    1016  
    1017          # Valid positive position
    1018          handler.pos = 1
    1019          self.assertEqual("\xff0".encode("ascii", "test.posreturn"), b"<?>0")
    1020  
    1021          # Largest valid positive position (one beyond end of input
    1022          handler.pos = 2
    1023          self.assertEqual("\xff0".encode("ascii", "test.posreturn"), b"<?>")
    1024  
    1025          # Invalid positive position
    1026          handler.pos = 3
    1027          self.assertRaises(IndexError, "\xff0".encode, "ascii", "test.posreturn")
    1028  
    1029          handler.pos = 0
    1030  
    1031          class ESC[4;38;5;81mD(ESC[4;38;5;149mdict):
    1032              def __getitem__(self, key):
    1033                  raise ValueError
    1034          for err in ("strict", "replace", "xmlcharrefreplace",
    1035                      "backslashreplace", "namereplace", "test.posreturn"):
    1036              self.assertRaises(UnicodeError, codecs.charmap_encode, "\xff", err, {0xff: None})
    1037              self.assertRaises(ValueError, codecs.charmap_encode, "\xff", err, D())
    1038              self.assertRaises(TypeError, codecs.charmap_encode, "\xff", err, {0xff: 300})
    1039  
    def test_decodehelper_bug36819(self):
        # RepeatedPosReturn sends the decoder back to position 0 while its
        # counter is positive, so the output keeps growing past the size of
        # the input.
        handler = RepeatedPosReturn("x")
        codecs.register_error("test.bug36819", handler.handle)

        # One undecodable trailer per encoding, appended to "abcd".
        bad_trailers = {
            "ascii": b"\xff",
            "utf-8": b"\xff",
            "utf-16be": b'\xdc\x80',
            "utf-32be": b'\x00\x00\xdc\x80',
            "iso-8859-6": b"\xff",
        }
        for enc, trailer in bad_trailers.items():
            data = "abcd".encode(enc) + trailer
            with self.subTest(encoding=enc):
                # 50 restarts plus the final pass give 51 copies.
                handler.count = 50
                self.assertEqual(
                    data.decode(enc, "test.bug36819"), 'abcdx' * 51)
    1057  
    def test_encodehelper_bug36819(self):
        # RepeatedPosReturn sends the encoder back to position 0 while its
        # counter is positive, so the output keeps growing past the size of
        # the input.
        handler = RepeatedPosReturn()
        codecs.register_error("test.bug36819", handler.handle)

        text = "abcd\udc80"
        on_windows = sys.platform == 'win32'
        encodings = [
            "ascii", "latin1", "utf-8", "utf-16", "utf-32",  # built-in
            "iso-8859-15",  # charmap codec
        ]
        if on_windows:
            # NOTE(review): this replaces the list instead of extending it,
            # so only the code page codecs run on Windows - confirm intended.
            encodings = ["mbcs", "oem"]  # code page codecs

        def check_original_error(caught):
            # The error must describe the lone surrogate at index 4.
            failure = caught.exception
            self.assertEqual(failure.start, 4)
            self.assertEqual(failure.end, 5)
            self.assertEqual(failure.object, text)

        # First pass: the replacement is itself a lone surrogate, so
        # encoding is expected to fail with the original error.
        handler.repl = "\udcff"
        for enc in encodings:
            with self.subTest(encoding=enc):
                handler.count = 50
                with self.assertRaises(UnicodeEncodeError) as caught:
                    text.encode(enc, "test.bug36819")
                check_original_error(caught)
        if on_windows:
            handler.count = 50
            with self.assertRaises(UnicodeEncodeError) as caught:
                codecs.code_page_encode(437, text, "test.bug36819")
            check_original_error(caught)

        # Second pass: an encodable replacement.
        handler.repl = "x"
        for enc in encodings:
            with self.subTest(encoding=enc):
                # The interpreter should segfault after a handful of attempts.
                # 50 was chosen to try to ensure a segfault without a fix,
                # but not OOM a machine with one.
                handler.count = 50
                output = text.encode(enc, "test.bug36819")
                self.assertEqual(output.decode(enc), "abcdx" * 51)
        if on_windows:
            handler.count = 50
            output, consumed = codecs.code_page_encode(437, text, "test.bug36819")[:2]
            self.assertEqual(output.decode(), "abcdx" * 51)
            self.assertEqual(consumed, len(text))
    1101  
    1102      def test_translatehelper(self):
    1103          # enhance coverage of:
    1104          # Objects/unicodeobject.c::unicode_encode_call_errorhandler()
    1105          # and callers
    1106          # (Unfortunately the errors argument is not directly accessible
    1107          # from Python, so we can't test that much)
    1108          class ESC[4;38;5;81mD(ESC[4;38;5;149mdict):
    1109              def __getitem__(self, key):
    1110                  raise ValueError
    1111          #self.assertRaises(ValueError, "\xff".translate, D())
    1112          self.assertRaises(ValueError, "\xff".translate, {0xff: sys.maxunicode+1})
    1113          self.assertRaises(TypeError, "\xff".translate, {0xff: ()})
    1114  
    1115      def test_bug828737(self):
    1116          charmap = {
    1117              ord("&"): "&amp;",
    1118              ord("<"): "&lt;",
    1119              ord(">"): "&gt;",
    1120              ord('"'): "&quot;",
    1121          }
    1122  
    1123          for n in (1, 10, 100, 1000):
    1124              text = 'abc<def>ghi'*n
    1125              text.translate(charmap)
    1126  
    1127      def test_mutatingdecodehandler(self):
    1128          baddata = [
    1129              ("ascii", b"\xff"),
    1130              ("utf-7", b"++"),
    1131              ("utf-8",  b"\xff"),
    1132              ("utf-16", b"\xff"),
    1133              ("utf-32", b"\xff"),
    1134              ("unicode-escape", b"\\u123g"),
    1135              ("raw-unicode-escape", b"\\u123g"),
    1136          ]
    1137  
    1138          def replacing(exc):
    1139              if isinstance(exc, UnicodeDecodeError):
    1140                  exc.object = 42
    1141                  return ("\u4242", 0)
    1142              else:
    1143                  raise TypeError("don't know how to handle %r" % exc)
    1144          codecs.register_error("test.replacing", replacing)
    1145  
    1146          for (encoding, data) in baddata:
    1147              with self.assertRaises(TypeError):
    1148                  data.decode(encoding, "test.replacing")
    1149  
    1150          def mutating(exc):
    1151              if isinstance(exc, UnicodeDecodeError):
    1152                  exc.object = b""
    1153                  return ("\u4242", 0)
    1154              else:
    1155                  raise TypeError("don't know how to handle %r" % exc)
    1156          codecs.register_error("test.mutating", mutating)
    1157          # If the decoder doesn't pick up the modified input the following
    1158          # will lead to an endless loop
    1159          for (encoding, data) in baddata:
    1160              self.assertEqual(data.decode(encoding, "test.mutating"), "\u4242")
    1161  
    1162      # issue32583
    1163      def test_crashing_decode_handler(self):
    1164          # better generating one more character to fill the extra space slot
    1165          # so in debug build it can steadily fail
    1166          def forward_shorter_than_end(exc):
    1167              if isinstance(exc, UnicodeDecodeError):
    1168                  # size one character, 0 < forward < exc.end
    1169                  return ('\ufffd', exc.start+1)
    1170              else:
    1171                  raise TypeError("don't know how to handle %r" % exc)
    1172          codecs.register_error(
    1173              "test.forward_shorter_than_end", forward_shorter_than_end)
    1174  
    1175          self.assertEqual(
    1176              b'\xd8\xd8\xd8\xd8\xd8\x00\x00\x00'.decode(
    1177                  'utf-16-le', 'test.forward_shorter_than_end'),
    1178              '\ufffd\ufffd\ufffd\ufffd\xd8\x00'
    1179          )
    1180          self.assertEqual(
    1181              b'\xd8\xd8\xd8\xd8\x00\xd8\x00\x00'.decode(
    1182                  'utf-16-be', 'test.forward_shorter_than_end'),
    1183              '\ufffd\ufffd\ufffd\ufffd\xd8\x00'
    1184          )
    1185          self.assertEqual(
    1186              b'\x11\x11\x11\x11\x11\x00\x00\x00\x00\x00\x00'.decode(
    1187                  'utf-32-le', 'test.forward_shorter_than_end'),
    1188              '\ufffd\ufffd\ufffd\u1111\x00'
    1189          )
    1190          self.assertEqual(
    1191              b'\x11\x11\x11\x00\x00\x11\x11\x00\x00\x00\x00'.decode(
    1192                  'utf-32-be', 'test.forward_shorter_than_end'),
    1193              '\ufffd\ufffd\ufffd\u1111\x00'
    1194          )
    1195  
    1196          def replace_with_long(exc):
    1197              if isinstance(exc, UnicodeDecodeError):
    1198                  exc.object = b"\x00" * 8
    1199                  return ('\ufffd', exc.start)
    1200              else:
    1201                  raise TypeError("don't know how to handle %r" % exc)
    1202          codecs.register_error("test.replace_with_long", replace_with_long)
    1203  
    1204          self.assertEqual(
    1205              b'\x00'.decode('utf-16', 'test.replace_with_long'),
    1206              '\ufffd\x00\x00\x00\x00'
    1207          )
    1208          self.assertEqual(
    1209              b'\x00'.decode('utf-32', 'test.replace_with_long'),
    1210              '\ufffd\x00\x00'
    1211          )
    1212  
    1213  
    1214      def test_fake_error_class(self):
    1215          handlers = [
    1216              codecs.strict_errors,
    1217              codecs.ignore_errors,
    1218              codecs.replace_errors,
    1219              codecs.backslashreplace_errors,
    1220              codecs.namereplace_errors,
    1221              codecs.xmlcharrefreplace_errors,
    1222              codecs.lookup_error('surrogateescape'),
    1223              codecs.lookup_error('surrogatepass'),
    1224          ]
    1225          for cls in UnicodeEncodeError, UnicodeDecodeError, UnicodeTranslateError:
    1226              class ESC[4;38;5;81mFakeUnicodeError(ESC[4;38;5;149mstr):
    1227                  __class__ = cls
    1228              for handler in handlers:
    1229                  with self.subTest(handler=handler, error_class=cls):
    1230                      self.assertRaises(TypeError, handler, FakeUnicodeError())
    1231              class ESC[4;38;5;81mFakeUnicodeError(ESC[4;38;5;149mException):
    1232                  __class__ = cls
    1233              for handler in handlers:
    1234                  with self.subTest(handler=handler, error_class=cls):
    1235                      with self.assertRaises((TypeError, FakeUnicodeError)):
    1236                          handler(FakeUnicodeError())
    1237  
    1238  
# Run the tests with unittest's command-line runner when this file is
# executed directly as a script.
if __name__ == "__main__":
    unittest.main()