(root)/
Python-3.12.0/
Lib/
codecs.py
       1  """ codecs -- Python Codec Registry, API and helpers.
       2  
       3  
       4  Written by Marc-Andre Lemburg (mal@lemburg.com).
       5  
       6  (c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
       7  
       8  """
       9  
      10  import builtins
      11  import sys
      12  
      13  ### Registry and builtin stateless codec functions
      14  
      15  try:
      16      from _codecs import *
      17  except ImportError as why:
      18      raise SystemError('Failed to load the builtin codecs: %s' % why)
      19  
      20  __all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE",
      21             "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
      22             "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
      23             "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
      24             "CodecInfo", "Codec", "IncrementalEncoder", "IncrementalDecoder",
      25             "StreamReader", "StreamWriter",
      26             "StreamReaderWriter", "StreamRecoder",
      27             "getencoder", "getdecoder", "getincrementalencoder",
      28             "getincrementaldecoder", "getreader", "getwriter",
      29             "encode", "decode", "iterencode", "iterdecode",
      30             "strict_errors", "ignore_errors", "replace_errors",
      31             "xmlcharrefreplace_errors",
      32             "backslashreplace_errors", "namereplace_errors",
      33             "register_error", "lookup_error"]
      34  
      35  ### Constants
      36  
      37  #
      38  # Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF)
      39  # and its possible byte string values
      40  # for UTF8/UTF16/UTF32 output and little/big endian machines
      41  #
      42  
      43  # UTF-8
      44  BOM_UTF8 = b'\xef\xbb\xbf'
      45  
      46  # UTF-16, little endian
      47  BOM_LE = BOM_UTF16_LE = b'\xff\xfe'
      48  
      49  # UTF-16, big endian
      50  BOM_BE = BOM_UTF16_BE = b'\xfe\xff'
      51  
      52  # UTF-32, little endian
      53  BOM_UTF32_LE = b'\xff\xfe\x00\x00'
      54  
      55  # UTF-32, big endian
      56  BOM_UTF32_BE = b'\x00\x00\xfe\xff'
      57  
      58  if sys.byteorder == 'little':
      59  
      60      # UTF-16, native endianness
      61      BOM = BOM_UTF16 = BOM_UTF16_LE
      62  
      63      # UTF-32, native endianness
      64      BOM_UTF32 = BOM_UTF32_LE
      65  
      66  else:
      67  
      68      # UTF-16, native endianness
      69      BOM = BOM_UTF16 = BOM_UTF16_BE
      70  
      71      # UTF-32, native endianness
      72      BOM_UTF32 = BOM_UTF32_BE
      73  
      74  # Old broken names (don't use in new code)
      75  BOM32_LE = BOM_UTF16_LE
      76  BOM32_BE = BOM_UTF16_BE
      77  BOM64_LE = BOM_UTF32_LE
      78  BOM64_BE = BOM_UTF32_BE
      79  
      80  
      81  ### Codec base classes (defining the API)
      82  
      83  class ESC[4;38;5;81mCodecInfo(ESC[4;38;5;149mtuple):
      84      """Codec details when looking up the codec registry"""
      85  
      86      # Private API to allow Python 3.4 to denylist the known non-Unicode
      87      # codecs in the standard library. A more general mechanism to
      88      # reliably distinguish test encodings from other codecs will hopefully
      89      # be defined for Python 3.5
      90      #
      91      # See http://bugs.python.org/issue19619
      92      _is_text_encoding = True # Assume codecs are text encodings by default
      93  
      94      def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
      95          incrementalencoder=None, incrementaldecoder=None, name=None,
      96          *, _is_text_encoding=None):
      97          self = tuple.__new__(cls, (encode, decode, streamreader, streamwriter))
      98          self.name = name
      99          self.encode = encode
     100          self.decode = decode
     101          self.incrementalencoder = incrementalencoder
     102          self.incrementaldecoder = incrementaldecoder
     103          self.streamwriter = streamwriter
     104          self.streamreader = streamreader
     105          if _is_text_encoding is not None:
     106              self._is_text_encoding = _is_text_encoding
     107          return self
     108  
     109      def __repr__(self):
     110          return "<%s.%s object for encoding %s at %#x>" % \
     111                  (self.__class__.__module__, self.__class__.__qualname__,
     112                   self.name, id(self))
     113  
     114  class ESC[4;38;5;81mCodec:
     115  
     116      """ Defines the interface for stateless encoders/decoders.
     117  
     118          The .encode()/.decode() methods may use different error
     119          handling schemes by providing the errors argument. These
     120          string values are predefined:
     121  
     122           'strict' - raise a ValueError error (or a subclass)
     123           'ignore' - ignore the character and continue with the next
     124           'replace' - replace with a suitable replacement character;
     125                      Python will use the official U+FFFD REPLACEMENT
     126                      CHARACTER for the builtin Unicode codecs on
     127                      decoding and '?' on encoding.
     128           'surrogateescape' - replace with private code points U+DCnn.
     129           'xmlcharrefreplace' - Replace with the appropriate XML
     130                                 character reference (only for encoding).
     131           'backslashreplace'  - Replace with backslashed escape sequences.
     132           'namereplace'       - Replace with \\N{...} escape sequences
     133                                 (only for encoding).
     134  
     135          The set of allowed values can be extended via register_error.
     136  
     137      """
     138      def encode(self, input, errors='strict'):
     139  
     140          """ Encodes the object input and returns a tuple (output
     141              object, length consumed).
     142  
     143              errors defines the error handling to apply. It defaults to
     144              'strict' handling.
     145  
     146              The method may not store state in the Codec instance. Use
     147              StreamWriter for codecs which have to keep state in order to
     148              make encoding efficient.
     149  
     150              The encoder must be able to handle zero length input and
     151              return an empty object of the output object type in this
     152              situation.
     153  
     154          """
     155          raise NotImplementedError
     156  
     157      def decode(self, input, errors='strict'):
     158  
     159          """ Decodes the object input and returns a tuple (output
     160              object, length consumed).
     161  
     162              input must be an object which provides the bf_getreadbuf
     163              buffer slot. Python strings, buffer objects and memory
     164              mapped files are examples of objects providing this slot.
     165  
     166              errors defines the error handling to apply. It defaults to
     167              'strict' handling.
     168  
     169              The method may not store state in the Codec instance. Use
     170              StreamReader for codecs which have to keep state in order to
     171              make decoding efficient.
     172  
     173              The decoder must be able to handle zero length input and
     174              return an empty object of the output object type in this
     175              situation.
     176  
     177          """
     178          raise NotImplementedError
     179  
     180  class ESC[4;38;5;81mIncrementalEncoder(ESC[4;38;5;149mobject):
     181      """
     182      An IncrementalEncoder encodes an input in multiple steps. The input can
     183      be passed piece by piece to the encode() method. The IncrementalEncoder
     184      remembers the state of the encoding process between calls to encode().
     185      """
     186      def __init__(self, errors='strict'):
     187          """
     188          Creates an IncrementalEncoder instance.
     189  
     190          The IncrementalEncoder may use different error handling schemes by
     191          providing the errors keyword argument. See the module docstring
     192          for a list of possible values.
     193          """
     194          self.errors = errors
     195          self.buffer = ""
     196  
     197      def encode(self, input, final=False):
     198          """
     199          Encodes input and returns the resulting object.
     200          """
     201          raise NotImplementedError
     202  
     203      def reset(self):
     204          """
     205          Resets the encoder to the initial state.
     206          """
     207  
     208      def getstate(self):
     209          """
     210          Return the current state of the encoder.
     211          """
     212          return 0
     213  
     214      def setstate(self, state):
     215          """
     216          Set the current state of the encoder. state must have been
     217          returned by getstate().
     218          """
     219  
     220  class ESC[4;38;5;81mBufferedIncrementalEncoder(ESC[4;38;5;149mIncrementalEncoder):
     221      """
     222      This subclass of IncrementalEncoder can be used as the baseclass for an
     223      incremental encoder if the encoder must keep some of the output in a
     224      buffer between calls to encode().
     225      """
     226      def __init__(self, errors='strict'):
     227          IncrementalEncoder.__init__(self, errors)
     228          # unencoded input that is kept between calls to encode()
     229          self.buffer = ""
     230  
     231      def _buffer_encode(self, input, errors, final):
     232          # Overwrite this method in subclasses: It must encode input
     233          # and return an (output, length consumed) tuple
     234          raise NotImplementedError
     235  
     236      def encode(self, input, final=False):
     237          # encode input (taking the buffer into account)
     238          data = self.buffer + input
     239          (result, consumed) = self._buffer_encode(data, self.errors, final)
     240          # keep unencoded input until the next call
     241          self.buffer = data[consumed:]
     242          return result
     243  
     244      def reset(self):
     245          IncrementalEncoder.reset(self)
     246          self.buffer = ""
     247  
     248      def getstate(self):
     249          return self.buffer or 0
     250  
     251      def setstate(self, state):
     252          self.buffer = state or ""
     253  
     254  class ESC[4;38;5;81mIncrementalDecoder(ESC[4;38;5;149mobject):
     255      """
     256      An IncrementalDecoder decodes an input in multiple steps. The input can
     257      be passed piece by piece to the decode() method. The IncrementalDecoder
     258      remembers the state of the decoding process between calls to decode().
     259      """
     260      def __init__(self, errors='strict'):
     261          """
     262          Create an IncrementalDecoder instance.
     263  
     264          The IncrementalDecoder may use different error handling schemes by
     265          providing the errors keyword argument. See the module docstring
     266          for a list of possible values.
     267          """
     268          self.errors = errors
     269  
     270      def decode(self, input, final=False):
     271          """
     272          Decode input and returns the resulting object.
     273          """
     274          raise NotImplementedError
     275  
     276      def reset(self):
     277          """
     278          Reset the decoder to the initial state.
     279          """
     280  
     281      def getstate(self):
     282          """
     283          Return the current state of the decoder.
     284  
     285          This must be a (buffered_input, additional_state_info) tuple.
     286          buffered_input must be a bytes object containing bytes that
     287          were passed to decode() that have not yet been converted.
     288          additional_state_info must be a non-negative integer
     289          representing the state of the decoder WITHOUT yet having
     290          processed the contents of buffered_input.  In the initial state
     291          and after reset(), getstate() must return (b"", 0).
     292          """
     293          return (b"", 0)
     294  
     295      def setstate(self, state):
     296          """
     297          Set the current state of the decoder.
     298  
     299          state must have been returned by getstate().  The effect of
     300          setstate((b"", 0)) must be equivalent to reset().
     301          """
     302  
     303  class ESC[4;38;5;81mBufferedIncrementalDecoder(ESC[4;38;5;149mIncrementalDecoder):
     304      """
     305      This subclass of IncrementalDecoder can be used as the baseclass for an
     306      incremental decoder if the decoder must be able to handle incomplete
     307      byte sequences.
     308      """
     309      def __init__(self, errors='strict'):
     310          IncrementalDecoder.__init__(self, errors)
     311          # undecoded input that is kept between calls to decode()
     312          self.buffer = b""
     313  
     314      def _buffer_decode(self, input, errors, final):
     315          # Overwrite this method in subclasses: It must decode input
     316          # and return an (output, length consumed) tuple
     317          raise NotImplementedError
     318  
     319      def decode(self, input, final=False):
     320          # decode input (taking the buffer into account)
     321          data = self.buffer + input
     322          (result, consumed) = self._buffer_decode(data, self.errors, final)
     323          # keep undecoded input until the next call
     324          self.buffer = data[consumed:]
     325          return result
     326  
     327      def reset(self):
     328          IncrementalDecoder.reset(self)
     329          self.buffer = b""
     330  
     331      def getstate(self):
     332          # additional state info is always 0
     333          return (self.buffer, 0)
     334  
     335      def setstate(self, state):
     336          # ignore additional state info
     337          self.buffer = state[0]
     338  
     339  #
     340  # The StreamWriter and StreamReader classes provide generic working
     341  # interfaces which can be used to implement new encoding submodules
     342  # very easily. See encodings/utf_8.py for an example of how this is
     343  # done.
     344  #
     345  
     346  class ESC[4;38;5;81mStreamWriter(ESC[4;38;5;149mCodec):
     347  
     348      def __init__(self, stream, errors='strict'):
     349  
     350          """ Creates a StreamWriter instance.
     351  
     352              stream must be a file-like object open for writing.
     353  
     354              The StreamWriter may use different error handling
     355              schemes by providing the errors keyword argument. These
     356              parameters are predefined:
     357  
     358               'strict' - raise a ValueError (or a subclass)
     359               'ignore' - ignore the character and continue with the next
     360               'replace'- replace with a suitable replacement character
     361               'xmlcharrefreplace' - Replace with the appropriate XML
     362                                     character reference.
     363               'backslashreplace'  - Replace with backslashed escape
     364                                     sequences.
     365               'namereplace'       - Replace with \\N{...} escape sequences.
     366  
     367              The set of allowed parameter values can be extended via
     368              register_error.
     369          """
     370          self.stream = stream
     371          self.errors = errors
     372  
     373      def write(self, object):
     374  
     375          """ Writes the object's contents encoded to self.stream.
     376          """
     377          data, consumed = self.encode(object, self.errors)
     378          self.stream.write(data)
     379  
     380      def writelines(self, list):
     381  
     382          """ Writes the concatenated list of strings to the stream
     383              using .write().
     384          """
     385          self.write(''.join(list))
     386  
     387      def reset(self):
     388  
     389          """ Resets the codec buffers used for keeping internal state.
     390  
     391              Calling this method should ensure that the data on the
     392              output is put into a clean state, that allows appending
     393              of new fresh data without having to rescan the whole
     394              stream to recover state.
     395  
     396          """
     397          pass
     398  
     399      def seek(self, offset, whence=0):
     400          self.stream.seek(offset, whence)
     401          if whence == 0 and offset == 0:
     402              self.reset()
     403  
     404      def __getattr__(self, name,
     405                      getattr=getattr):
     406  
     407          """ Inherit all other methods from the underlying stream.
     408          """
     409          return getattr(self.stream, name)
     410  
     411      def __enter__(self):
     412          return self
     413  
     414      def __exit__(self, type, value, tb):
     415          self.stream.close()
     416  
     417  ###
     418  
     419  class ESC[4;38;5;81mStreamReader(ESC[4;38;5;149mCodec):
     420  
     421      charbuffertype = str
     422  
     423      def __init__(self, stream, errors='strict'):
     424  
     425          """ Creates a StreamReader instance.
     426  
     427              stream must be a file-like object open for reading.
     428  
     429              The StreamReader may use different error handling
     430              schemes by providing the errors keyword argument. These
     431              parameters are predefined:
     432  
     433               'strict' - raise a ValueError (or a subclass)
     434               'ignore' - ignore the character and continue with the next
     435               'replace'- replace with a suitable replacement character
     436               'backslashreplace' - Replace with backslashed escape sequences;
     437  
     438              The set of allowed parameter values can be extended via
     439              register_error.
     440          """
     441          self.stream = stream
     442          self.errors = errors
     443          self.bytebuffer = b""
     444          self._empty_charbuffer = self.charbuffertype()
     445          self.charbuffer = self._empty_charbuffer
     446          self.linebuffer = None
     447  
     448      def decode(self, input, errors='strict'):
     449          raise NotImplementedError
     450  
     451      def read(self, size=-1, chars=-1, firstline=False):
     452  
     453          """ Decodes data from the stream self.stream and returns the
     454              resulting object.
     455  
     456              chars indicates the number of decoded code points or bytes to
     457              return. read() will never return more data than requested,
     458              but it might return less, if there is not enough available.
     459  
     460              size indicates the approximate maximum number of decoded
     461              bytes or code points to read for decoding. The decoder
     462              can modify this setting as appropriate. The default value
     463              -1 indicates to read and decode as much as possible.  size
     464              is intended to prevent having to decode huge files in one
     465              step.
     466  
     467              If firstline is true, and a UnicodeDecodeError happens
     468              after the first line terminator in the input only the first line
     469              will be returned, the rest of the input will be kept until the
     470              next call to read().
     471  
     472              The method should use a greedy read strategy, meaning that
     473              it should read as much data as is allowed within the
     474              definition of the encoding and the given size, e.g.  if
     475              optional encoding endings or state markers are available
     476              on the stream, these should be read too.
     477          """
     478          # If we have lines cached, first merge them back into characters
     479          if self.linebuffer:
     480              self.charbuffer = self._empty_charbuffer.join(self.linebuffer)
     481              self.linebuffer = None
     482  
     483          if chars < 0:
     484              # For compatibility with other read() methods that take a
     485              # single argument
     486              chars = size
     487  
     488          # read until we get the required number of characters (if available)
     489          while True:
     490              # can the request be satisfied from the character buffer?
     491              if chars >= 0:
     492                  if len(self.charbuffer) >= chars:
     493                      break
     494              # we need more data
     495              if size < 0:
     496                  newdata = self.stream.read()
     497              else:
     498                  newdata = self.stream.read(size)
     499              # decode bytes (those remaining from the last call included)
     500              data = self.bytebuffer + newdata
     501              if not data:
     502                  break
     503              try:
     504                  newchars, decodedbytes = self.decode(data, self.errors)
     505              except UnicodeDecodeError as exc:
     506                  if firstline:
     507                      newchars, decodedbytes = \
     508                          self.decode(data[:exc.start], self.errors)
     509                      lines = newchars.splitlines(keepends=True)
     510                      if len(lines)<=1:
     511                          raise
     512                  else:
     513                      raise
     514              # keep undecoded bytes until the next call
     515              self.bytebuffer = data[decodedbytes:]
     516              # put new characters in the character buffer
     517              self.charbuffer += newchars
     518              # there was no data available
     519              if not newdata:
     520                  break
     521          if chars < 0:
     522              # Return everything we've got
     523              result = self.charbuffer
     524              self.charbuffer = self._empty_charbuffer
     525          else:
     526              # Return the first chars characters
     527              result = self.charbuffer[:chars]
     528              self.charbuffer = self.charbuffer[chars:]
     529          return result
     530  
     531      def readline(self, size=None, keepends=True):
     532  
     533          """ Read one line from the input stream and return the
     534              decoded data.
     535  
     536              size, if given, is passed as size argument to the
     537              read() method.
     538  
     539          """
     540          # If we have lines cached from an earlier read, return
     541          # them unconditionally
     542          if self.linebuffer:
     543              line = self.linebuffer[0]
     544              del self.linebuffer[0]
     545              if len(self.linebuffer) == 1:
     546                  # revert to charbuffer mode; we might need more data
     547                  # next time
     548                  self.charbuffer = self.linebuffer[0]
     549                  self.linebuffer = None
     550              if not keepends:
     551                  line = line.splitlines(keepends=False)[0]
     552              return line
     553  
     554          readsize = size or 72
     555          line = self._empty_charbuffer
     556          # If size is given, we call read() only once
     557          while True:
     558              data = self.read(readsize, firstline=True)
     559              if data:
     560                  # If we're at a "\r" read one extra character (which might
     561                  # be a "\n") to get a proper line ending. If the stream is
     562                  # temporarily exhausted we return the wrong line ending.
     563                  if (isinstance(data, str) and data.endswith("\r")) or \
     564                     (isinstance(data, bytes) and data.endswith(b"\r")):
     565                      data += self.read(size=1, chars=1)
     566  
     567              line += data
     568              lines = line.splitlines(keepends=True)
     569              if lines:
     570                  if len(lines) > 1:
     571                      # More than one line result; the first line is a full line
     572                      # to return
     573                      line = lines[0]
     574                      del lines[0]
     575                      if len(lines) > 1:
     576                          # cache the remaining lines
     577                          lines[-1] += self.charbuffer
     578                          self.linebuffer = lines
     579                          self.charbuffer = None
     580                      else:
     581                          # only one remaining line, put it back into charbuffer
     582                          self.charbuffer = lines[0] + self.charbuffer
     583                      if not keepends:
     584                          line = line.splitlines(keepends=False)[0]
     585                      break
     586                  line0withend = lines[0]
     587                  line0withoutend = lines[0].splitlines(keepends=False)[0]
     588                  if line0withend != line0withoutend: # We really have a line end
     589                      # Put the rest back together and keep it until the next call
     590                      self.charbuffer = self._empty_charbuffer.join(lines[1:]) + \
     591                                        self.charbuffer
     592                      if keepends:
     593                          line = line0withend
     594                      else:
     595                          line = line0withoutend
     596                      break
     597              # we didn't get anything or this was our only try
     598              if not data or size is not None:
     599                  if line and not keepends:
     600                      line = line.splitlines(keepends=False)[0]
     601                  break
     602              if readsize < 8000:
     603                  readsize *= 2
     604          return line
     605  
     606      def readlines(self, sizehint=None, keepends=True):
     607  
     608          """ Read all lines available on the input stream
     609              and return them as a list.
     610  
     611              Line breaks are implemented using the codec's decoder
     612              method and are included in the list entries.
     613  
     614              sizehint, if given, is ignored since there is no efficient
     615              way to finding the true end-of-line.
     616  
     617          """
     618          data = self.read()
     619          return data.splitlines(keepends)
     620  
     621      def reset(self):
     622  
     623          """ Resets the codec buffers used for keeping internal state.
     624  
     625              Note that no stream repositioning should take place.
     626              This method is primarily intended to be able to recover
     627              from decoding errors.
     628  
     629          """
     630          self.bytebuffer = b""
     631          self.charbuffer = self._empty_charbuffer
     632          self.linebuffer = None
     633  
     634      def seek(self, offset, whence=0):
     635          """ Set the input stream's current position.
     636  
     637              Resets the codec buffers used for keeping state.
     638          """
     639          self.stream.seek(offset, whence)
     640          self.reset()
     641  
     642      def __next__(self):
     643  
     644          """ Return the next decoded line from the input stream."""
     645          line = self.readline()
     646          if line:
     647              return line
     648          raise StopIteration
     649  
     650      def __iter__(self):
     651          return self
     652  
     653      def __getattr__(self, name,
     654                      getattr=getattr):
     655  
     656          """ Inherit all other methods from the underlying stream.
     657          """
     658          return getattr(self.stream, name)
     659  
     660      def __enter__(self):
     661          return self
     662  
     663      def __exit__(self, type, value, tb):
     664          self.stream.close()
     665  
     666  ###
     667  
     668  class ESC[4;38;5;81mStreamReaderWriter:
     669  
     670      """ StreamReaderWriter instances allow wrapping streams which
     671          work in both read and write modes.
     672  
     673          The design is such that one can use the factory functions
     674          returned by the codec.lookup() function to construct the
     675          instance.
     676  
     677      """
     678      # Optional attributes set by the file wrappers below
     679      encoding = 'unknown'
     680  
     681      def __init__(self, stream, Reader, Writer, errors='strict'):
     682  
     683          """ Creates a StreamReaderWriter instance.
     684  
     685              stream must be a Stream-like object.
     686  
     687              Reader, Writer must be factory functions or classes
     688              providing the StreamReader, StreamWriter interface resp.
     689  
     690              Error handling is done in the same way as defined for the
     691              StreamWriter/Readers.
     692  
     693          """
     694          self.stream = stream
     695          self.reader = Reader(stream, errors)
     696          self.writer = Writer(stream, errors)
     697          self.errors = errors
     698  
     699      def read(self, size=-1):
     700  
     701          return self.reader.read(size)
     702  
     703      def readline(self, size=None):
     704  
     705          return self.reader.readline(size)
     706  
     707      def readlines(self, sizehint=None):
     708  
     709          return self.reader.readlines(sizehint)
     710  
     711      def __next__(self):
     712  
     713          """ Return the next decoded line from the input stream."""
     714          return next(self.reader)
     715  
     716      def __iter__(self):
     717          return self
     718  
     719      def write(self, data):
     720  
     721          return self.writer.write(data)
     722  
     723      def writelines(self, list):
     724  
     725          return self.writer.writelines(list)
     726  
     727      def reset(self):
     728  
     729          self.reader.reset()
     730          self.writer.reset()
     731  
     732      def seek(self, offset, whence=0):
     733          self.stream.seek(offset, whence)
     734          self.reader.reset()
     735          if whence == 0 and offset == 0:
     736              self.writer.reset()
     737  
     738      def __getattr__(self, name,
     739                      getattr=getattr):
     740  
     741          """ Inherit all other methods from the underlying stream.
     742          """
     743          return getattr(self.stream, name)
     744  
     745      # these are needed to make "with StreamReaderWriter(...)" work properly
     746  
     747      def __enter__(self):
     748          return self
     749  
     750      def __exit__(self, type, value, tb):
     751          self.stream.close()
     752  
     753  ###
     754  
     755  class ESC[4;38;5;81mStreamRecoder:
     756  
     757      """ StreamRecoder instances translate data from one encoding to another.
     758  
     759          They use the complete set of APIs returned by the
     760          codecs.lookup() function to implement their task.
     761  
     762          Data written to the StreamRecoder is first decoded into an
     763          intermediate format (depending on the "decode" codec) and then
     764          written to the underlying stream using an instance of the provided
     765          Writer class.
     766  
     767          In the other direction, data is read from the underlying stream using
     768          a Reader instance and then encoded and returned to the caller.
     769  
     770      """
     771      # Optional attributes set by the file wrappers below
     772      data_encoding = 'unknown'
     773      file_encoding = 'unknown'
     774  
     775      def __init__(self, stream, encode, decode, Reader, Writer,
     776                   errors='strict'):
     777  
     778          """ Creates a StreamRecoder instance which implements a two-way
     779              conversion: encode and decode work on the frontend (the
     780              data visible to .read() and .write()) while Reader and Writer
     781              work on the backend (the data in stream).
     782  
     783              You can use these objects to do transparent
     784              transcodings from e.g. latin-1 to utf-8 and back.
     785  
     786              stream must be a file-like object.
     787  
     788              encode and decode must adhere to the Codec interface; Reader and
     789              Writer must be factory functions or classes providing the
     790              StreamReader and StreamWriter interfaces resp.
     791  
     792              Error handling is done in the same way as defined for the
     793              StreamWriter/Readers.
     794  
     795          """
     796          self.stream = stream
     797          self.encode = encode
     798          self.decode = decode
     799          self.reader = Reader(stream, errors)
     800          self.writer = Writer(stream, errors)
     801          self.errors = errors
     802  
     803      def read(self, size=-1):
     804  
     805          data = self.reader.read(size)
     806          data, bytesencoded = self.encode(data, self.errors)
     807          return data
     808  
     809      def readline(self, size=None):
     810  
     811          if size is None:
     812              data = self.reader.readline()
     813          else:
     814              data = self.reader.readline(size)
     815          data, bytesencoded = self.encode(data, self.errors)
     816          return data
     817  
     818      def readlines(self, sizehint=None):
     819  
     820          data = self.reader.read()
     821          data, bytesencoded = self.encode(data, self.errors)
     822          return data.splitlines(keepends=True)
     823  
     824      def __next__(self):
     825  
     826          """ Return the next decoded line from the input stream."""
     827          data = next(self.reader)
     828          data, bytesencoded = self.encode(data, self.errors)
     829          return data
     830  
     831      def __iter__(self):
     832          return self
     833  
     834      def write(self, data):
     835  
     836          data, bytesdecoded = self.decode(data, self.errors)
     837          return self.writer.write(data)
     838  
     839      def writelines(self, list):
     840  
     841          data = b''.join(list)
     842          data, bytesdecoded = self.decode(data, self.errors)
     843          return self.writer.write(data)
     844  
     845      def reset(self):
     846  
     847          self.reader.reset()
     848          self.writer.reset()
     849  
     850      def seek(self, offset, whence=0):
     851          # Seeks must be propagated to both the readers and writers
     852          # as they might need to reset their internal buffers.
     853          self.reader.seek(offset, whence)
     854          self.writer.seek(offset, whence)
     855  
     856      def __getattr__(self, name,
     857                      getattr=getattr):
     858  
     859          """ Inherit all other methods from the underlying stream.
     860          """
     861          return getattr(self.stream, name)
     862  
     863      def __enter__(self):
     864          return self
     865  
     866      def __exit__(self, type, value, tb):
     867          self.stream.close()
     868  
     869  ### Shortcuts
     870  
     871  def open(filename, mode='r', encoding=None, errors='strict', buffering=-1):
     872  
     873      """ Open an encoded file using the given mode and return
     874          a wrapped version providing transparent encoding/decoding.
     875  
     876          Note: The wrapped version will only accept the object format
     877          defined by the codecs, i.e. Unicode objects for most builtin
     878          codecs. Output is also codec dependent and will usually be
     879          Unicode as well.
     880  
     881          If encoding is not None, then the
     882          underlying encoded files are always opened in binary mode.
     883          The default file mode is 'r', meaning to open the file in read mode.
     884  
     885          encoding specifies the encoding which is to be used for the
     886          file.
     887  
     888          errors may be given to define the error handling. It defaults
     889          to 'strict' which causes ValueErrors to be raised in case an
     890          encoding error occurs.
     891  
     892          buffering has the same meaning as for the builtin open() API.
     893          It defaults to -1 which means that the default buffer size will
     894          be used.
     895  
     896          The returned wrapped file object provides an extra attribute
     897          .encoding which allows querying the used encoding. This
     898          attribute is only available if an encoding was specified as
     899          parameter.
     900  
     901      """
     902      if encoding is not None and \
     903         'b' not in mode:
     904          # Force opening of the file in binary mode
     905          mode = mode + 'b'
     906      file = builtins.open(filename, mode, buffering)
     907      if encoding is None:
     908          return file
     909  
     910      try:
     911          info = lookup(encoding)
     912          srw = StreamReaderWriter(file, info.streamreader, info.streamwriter, errors)
     913          # Add attributes to simplify introspection
     914          srw.encoding = encoding
     915          return srw
     916      except:
     917          file.close()
     918          raise
     919  
     920  def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):
     921  
     922      """ Return a wrapped version of file which provides transparent
     923          encoding translation.
     924  
     925          Data written to the wrapped file is decoded according
     926          to the given data_encoding and then encoded to the underlying
     927          file using file_encoding. The intermediate data type
     928          will usually be Unicode but depends on the specified codecs.
     929  
     930          Bytes read from the file are decoded using file_encoding and then
     931          passed back to the caller encoded using data_encoding.
     932  
     933          If file_encoding is not given, it defaults to data_encoding.
     934  
     935          errors may be given to define the error handling. It defaults
     936          to 'strict' which causes ValueErrors to be raised in case an
     937          encoding error occurs.
     938  
     939          The returned wrapped file object provides two extra attributes
     940          .data_encoding and .file_encoding which reflect the given
     941          parameters of the same name. The attributes can be used for
     942          introspection by Python programs.
     943  
     944      """
     945      if file_encoding is None:
     946          file_encoding = data_encoding
     947      data_info = lookup(data_encoding)
     948      file_info = lookup(file_encoding)
     949      sr = StreamRecoder(file, data_info.encode, data_info.decode,
     950                         file_info.streamreader, file_info.streamwriter, errors)
     951      # Add attributes to simplify introspection
     952      sr.data_encoding = data_encoding
     953      sr.file_encoding = file_encoding
     954      return sr
     955  
     956  ### Helpers for codec lookup
     957  
     958  def getencoder(encoding):
     959  
     960      """ Lookup up the codec for the given encoding and return
     961          its encoder function.
     962  
     963          Raises a LookupError in case the encoding cannot be found.
     964  
     965      """
     966      return lookup(encoding).encode
     967  
     968  def getdecoder(encoding):
     969  
     970      """ Lookup up the codec for the given encoding and return
     971          its decoder function.
     972  
     973          Raises a LookupError in case the encoding cannot be found.
     974  
     975      """
     976      return lookup(encoding).decode
     977  
     978  def getincrementalencoder(encoding):
     979  
     980      """ Lookup up the codec for the given encoding and return
     981          its IncrementalEncoder class or factory function.
     982  
     983          Raises a LookupError in case the encoding cannot be found
     984          or the codecs doesn't provide an incremental encoder.
     985  
     986      """
     987      encoder = lookup(encoding).incrementalencoder
     988      if encoder is None:
     989          raise LookupError(encoding)
     990      return encoder
     991  
     992  def getincrementaldecoder(encoding):
     993  
     994      """ Lookup up the codec for the given encoding and return
     995          its IncrementalDecoder class or factory function.
     996  
     997          Raises a LookupError in case the encoding cannot be found
     998          or the codecs doesn't provide an incremental decoder.
     999  
    1000      """
    1001      decoder = lookup(encoding).incrementaldecoder
    1002      if decoder is None:
    1003          raise LookupError(encoding)
    1004      return decoder
    1005  
    1006  def getreader(encoding):
    1007  
    1008      """ Lookup up the codec for the given encoding and return
    1009          its StreamReader class or factory function.
    1010  
    1011          Raises a LookupError in case the encoding cannot be found.
    1012  
    1013      """
    1014      return lookup(encoding).streamreader
    1015  
    1016  def getwriter(encoding):
    1017  
    1018      """ Lookup up the codec for the given encoding and return
    1019          its StreamWriter class or factory function.
    1020  
    1021          Raises a LookupError in case the encoding cannot be found.
    1022  
    1023      """
    1024      return lookup(encoding).streamwriter
    1025  
    1026  def iterencode(iterator, encoding, errors='strict', **kwargs):
    1027      """
    1028      Encoding iterator.
    1029  
    1030      Encodes the input strings from the iterator using an IncrementalEncoder.
    1031  
    1032      errors and kwargs are passed through to the IncrementalEncoder
    1033      constructor.
    1034      """
    1035      encoder = getincrementalencoder(encoding)(errors, **kwargs)
    1036      for input in iterator:
    1037          output = encoder.encode(input)
    1038          if output:
    1039              yield output
    1040      output = encoder.encode("", True)
    1041      if output:
    1042          yield output
    1043  
    1044  def iterdecode(iterator, encoding, errors='strict', **kwargs):
    1045      """
    1046      Decoding iterator.
    1047  
    1048      Decodes the input strings from the iterator using an IncrementalDecoder.
    1049  
    1050      errors and kwargs are passed through to the IncrementalDecoder
    1051      constructor.
    1052      """
    1053      decoder = getincrementaldecoder(encoding)(errors, **kwargs)
    1054      for input in iterator:
    1055          output = decoder.decode(input)
    1056          if output:
    1057              yield output
    1058      output = decoder.decode(b"", True)
    1059      if output:
    1060          yield output
    1061  
    1062  ### Helpers for charmap-based codecs
    1063  
    1064  def make_identity_dict(rng):
    1065  
    1066      """ make_identity_dict(rng) -> dict
    1067  
    1068          Return a dictionary where elements of the rng sequence are
    1069          mapped to themselves.
    1070  
    1071      """
    1072      return {i:i for i in rng}
    1073  
    1074  def make_encoding_map(decoding_map):
    1075  
    1076      """ Creates an encoding map from a decoding map.
    1077  
    1078          If a target mapping in the decoding map occurs multiple
    1079          times, then that target is mapped to None (undefined mapping),
    1080          causing an exception when encountered by the charmap codec
    1081          during translation.
    1082  
    1083          One example where this happens is cp875.py which decodes
    1084          multiple character to \\u001a.
    1085  
    1086      """
    1087      m = {}
    1088      for k,v in decoding_map.items():
    1089          if not v in m:
    1090              m[v] = k
    1091          else:
    1092              m[v] = None
    1093      return m
    1094  
    1095  ### error handlers
    1096  
try:
    # Bind each error handler to a module-level name by looking it up
    # under its registered name.
    strict_errors = lookup_error("strict")
    ignore_errors = lookup_error("ignore")
    replace_errors = lookup_error("replace")
    xmlcharrefreplace_errors = lookup_error("xmlcharrefreplace")
    backslashreplace_errors = lookup_error("backslashreplace")
    namereplace_errors = lookup_error("namereplace")
except LookupError:
    # In --disable-unicode builds, these error handlers are missing.
    # If any one lookup fails, all six names are bound to None.
    strict_errors = None
    ignore_errors = None
    replace_errors = None
    xmlcharrefreplace_errors = None
    backslashreplace_errors = None
    namereplace_errors = None
    1112  
# Tell modulefinder that using codecs probably needs the encodings
# package.  _false is always 0, so the import below is never executed;
# it is here only so that the import statement appears in this module.
_false = 0
if _false:
    import encodings