python (3.11.7)
       1  # coding: utf-8
       2  """
       3  
       4      webencodings.tests
       5      ~~~~~~~~~~~~~~~~~~
       6  
       7      A basic test suite for Encoding.
       8  
       9      :copyright: Copyright 2012 by Simon Sapin
      10      :license: BSD, see LICENSE for details.
      11  
      12  """
      13  
      14  from __future__ import unicode_literals
      15  
      16  from . import (lookup, LABELS, decode, encode, iter_decode, iter_encode,
      17                 IncrementalDecoder, IncrementalEncoder, UTF8)
      18  
      19  
      20  def assert_raises(exception, function, *args, **kwargs):
      21      try:
      22          function(*args, **kwargs)
      23      except exception:
      24          return
      25      else:  # pragma: no cover
      26          raise AssertionError('Did not raise %s.' % exception)
      27  
      28  
      29  def test_labels():
      30      assert lookup('utf-8').name == 'utf-8'
      31      assert lookup('Utf-8').name == 'utf-8'
      32      assert lookup('UTF-8').name == 'utf-8'
      33      assert lookup('utf8').name == 'utf-8'
      34      assert lookup('utf8').name == 'utf-8'
      35      assert lookup('utf8 ').name == 'utf-8'
      36      assert lookup(' \r\nutf8\t').name == 'utf-8'
      37      assert lookup('u8') is None  # Python label.
      38      assert lookup('utf-8 ') is None  # Non-ASCII white space.
      39  
      40      assert lookup('US-ASCII').name == 'windows-1252'
      41      assert lookup('iso-8859-1').name == 'windows-1252'
      42      assert lookup('latin1').name == 'windows-1252'
      43      assert lookup('LATIN1').name == 'windows-1252'
      44      assert lookup('latin-1') is None
      45      assert lookup('LATİN1') is None  # ASCII-only case insensitivity.
      46  
      47  
      48  def test_all_labels():
      49      for label in LABELS:
      50          assert decode(b'', label) == ('', lookup(label))
      51          assert encode('', label) == b''
      52          for repeat in [0, 1, 12]:
      53              output, _ = iter_decode([b''] * repeat, label)
      54              assert list(output) == []
      55              assert list(iter_encode([''] * repeat, label)) == []
      56          decoder = IncrementalDecoder(label)
      57          assert decoder.decode(b'') == ''
      58          assert decoder.decode(b'', final=True) == ''
      59          encoder = IncrementalEncoder(label)
      60          assert encoder.encode('') == b''
      61          assert encoder.encode('', final=True) == b''
      62      # All encoding names are valid labels too:
      63      for name in set(LABELS.values()):
      64          assert lookup(name).name == name
      65  
      66  
      67  def test_invalid_label():
      68      assert_raises(LookupError, decode, b'\xEF\xBB\xBF\xc3\xa9', 'invalid')
      69      assert_raises(LookupError, encode, 'é', 'invalid')
      70      assert_raises(LookupError, iter_decode, [], 'invalid')
      71      assert_raises(LookupError, iter_encode, [], 'invalid')
      72      assert_raises(LookupError, IncrementalDecoder, 'invalid')
      73      assert_raises(LookupError, IncrementalEncoder, 'invalid')
      74  
      75  
      76  def test_decode():
      77      assert decode(b'\x80', 'latin1') == ('€', lookup('latin1'))
      78      assert decode(b'\x80', lookup('latin1')) == ('€', lookup('latin1'))
      79      assert decode(b'\xc3\xa9', 'utf8') == ('é', lookup('utf8'))
      80      assert decode(b'\xc3\xa9', UTF8) == ('é', lookup('utf8'))
      81      assert decode(b'\xc3\xa9', 'ascii') == ('é', lookup('ascii'))
      82      assert decode(b'\xEF\xBB\xBF\xc3\xa9', 'ascii') == ('é', lookup('utf8'))  # UTF-8 with BOM
      83  
      84      assert decode(b'\xFE\xFF\x00\xe9', 'ascii') == ('é', lookup('utf-16be'))  # UTF-16-BE with BOM
      85      assert decode(b'\xFF\xFE\xe9\x00', 'ascii') == ('é', lookup('utf-16le'))  # UTF-16-LE with BOM
      86      assert decode(b'\xFE\xFF\xe9\x00', 'ascii') == ('\ue900', lookup('utf-16be'))
      87      assert decode(b'\xFF\xFE\x00\xe9', 'ascii') == ('\ue900', lookup('utf-16le'))
      88  
      89      assert decode(b'\x00\xe9', 'UTF-16BE') == ('é', lookup('utf-16be'))
      90      assert decode(b'\xe9\x00', 'UTF-16LE') == ('é', lookup('utf-16le'))
      91      assert decode(b'\xe9\x00', 'UTF-16') == ('é', lookup('utf-16le'))
      92  
      93      assert decode(b'\xe9\x00', 'UTF-16BE') == ('\ue900', lookup('utf-16be'))
      94      assert decode(b'\x00\xe9', 'UTF-16LE') == ('\ue900', lookup('utf-16le'))
      95      assert decode(b'\x00\xe9', 'UTF-16') == ('\ue900', lookup('utf-16le'))
      96  
      97  
      98  def test_encode():
      99      assert encode('é', 'latin1') == b'\xe9'
     100      assert encode('é', 'utf8') == b'\xc3\xa9'
     101      assert encode('é', 'utf8') == b'\xc3\xa9'
     102      assert encode('é', 'utf-16') == b'\xe9\x00'
     103      assert encode('é', 'utf-16le') == b'\xe9\x00'
     104      assert encode('é', 'utf-16be') == b'\x00\xe9'
     105  
     106  
     107  def test_iter_decode():
     108      def iter_decode_to_string(input, fallback_encoding):
     109          output, _encoding = iter_decode(input, fallback_encoding)
     110          return ''.join(output)
     111      assert iter_decode_to_string([], 'latin1') == ''
     112      assert iter_decode_to_string([b''], 'latin1') == ''
     113      assert iter_decode_to_string([b'\xe9'], 'latin1') == 'é'
     114      assert iter_decode_to_string([b'hello'], 'latin1') == 'hello'
     115      assert iter_decode_to_string([b'he', b'llo'], 'latin1') == 'hello'
     116      assert iter_decode_to_string([b'hell', b'o'], 'latin1') == 'hello'
     117      assert iter_decode_to_string([b'\xc3\xa9'], 'latin1') == 'é'
     118      assert iter_decode_to_string([b'\xEF\xBB\xBF\xc3\xa9'], 'latin1') == 'é'
     119      assert iter_decode_to_string([
     120          b'\xEF\xBB\xBF', b'\xc3', b'\xa9'], 'latin1') == 'é'
     121      assert iter_decode_to_string([
     122          b'\xEF\xBB\xBF', b'a', b'\xc3'], 'latin1') == 'a\uFFFD'
     123      assert iter_decode_to_string([
     124          b'', b'\xEF', b'', b'', b'\xBB\xBF\xc3', b'\xa9'], 'latin1') == 'é'
     125      assert iter_decode_to_string([b'\xEF\xBB\xBF'], 'latin1') == ''
     126      assert iter_decode_to_string([b'\xEF\xBB'], 'latin1') == 'ï»'
     127      assert iter_decode_to_string([b'\xFE\xFF\x00\xe9'], 'latin1') == 'é'
     128      assert iter_decode_to_string([b'\xFF\xFE\xe9\x00'], 'latin1') == 'é'
     129      assert iter_decode_to_string([
     130          b'', b'\xFF', b'', b'', b'\xFE\xe9', b'\x00'], 'latin1') == 'é'
     131      assert iter_decode_to_string([
     132          b'', b'h\xe9', b'llo'], 'x-user-defined') == 'h\uF7E9llo'
     133  
     134  
     135  def test_iter_encode():
     136      assert b''.join(iter_encode([], 'latin1')) == b''
     137      assert b''.join(iter_encode([''], 'latin1')) == b''
     138      assert b''.join(iter_encode(['é'], 'latin1')) == b'\xe9'
     139      assert b''.join(iter_encode(['', 'é', '', ''], 'latin1')) == b'\xe9'
     140      assert b''.join(iter_encode(['', 'é', '', ''], 'utf-16')) == b'\xe9\x00'
     141      assert b''.join(iter_encode(['', 'é', '', ''], 'utf-16le')) == b'\xe9\x00'
     142      assert b''.join(iter_encode(['', 'é', '', ''], 'utf-16be')) == b'\x00\xe9'
     143      assert b''.join(iter_encode([
     144          '', 'h\uF7E9', '', 'llo'], 'x-user-defined')) == b'h\xe9llo'
     145  
     146  
     147  def test_x_user_defined():
     148      encoded = b'2,\x0c\x0b\x1aO\xd9#\xcb\x0f\xc9\xbbt\xcf\xa8\xca'
     149      decoded = '2,\x0c\x0b\x1aO\uf7d9#\uf7cb\x0f\uf7c9\uf7bbt\uf7cf\uf7a8\uf7ca'
     150      encoded = b'aa'
     151      decoded = 'aa'
     152      assert decode(encoded, 'x-user-defined') == (decoded, lookup('x-user-defined'))
     153      assert encode(decoded, 'x-user-defined') == encoded