1 """ codecs -- Python Codec Registry, API and helpers.
2
3
4 Written by Marc-Andre Lemburg (mal@lemburg.com).
5
6 (c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
7
8 """
9
10 import builtins
11 import sys
12
13 ### Registry and builtin stateless codec functions
14
15 try:
16 from _codecs import *
17 except ImportError as why:
18 raise SystemError('Failed to load the builtin codecs: %s' % why)
19
20 __all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE",
21 "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
22 "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
23 "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
24 "CodecInfo", "Codec", "IncrementalEncoder", "IncrementalDecoder",
25 "StreamReader", "StreamWriter",
26 "StreamReaderWriter", "StreamRecoder",
27 "getencoder", "getdecoder", "getincrementalencoder",
28 "getincrementaldecoder", "getreader", "getwriter",
29 "encode", "decode", "iterencode", "iterdecode",
30 "strict_errors", "ignore_errors", "replace_errors",
31 "xmlcharrefreplace_errors",
32 "backslashreplace_errors", "namereplace_errors",
33 "register_error", "lookup_error"]
34
### Constants

#
# Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF)
# and its possible byte string values
# for UTF8/UTF16/UTF32 output and little/big endian machines
#

# UTF-8
BOM_UTF8 = b'\xef\xbb\xbf'

# UTF-16, little endian
BOM_LE = BOM_UTF16_LE = b'\xff\xfe'

# UTF-16, big endian
BOM_BE = BOM_UTF16_BE = b'\xfe\xff'

# UTF-32, little endian
BOM_UTF32_LE = b'\xff\xfe\x00\x00'

# UTF-32, big endian
BOM_UTF32_BE = b'\x00\x00\xfe\xff'

# Pick the native-endian aliases based on the host byte order so that
# BOM/BOM_UTF16/BOM_UTF32 always match what this machine would emit.
if sys.byteorder == 'little':

    # UTF-16, native endianness
    BOM = BOM_UTF16 = BOM_UTF16_LE

    # UTF-32, native endianness
    BOM_UTF32 = BOM_UTF32_LE

else:

    # UTF-16, native endianness
    BOM = BOM_UTF16 = BOM_UTF16_BE

    # UTF-32, native endianness
    BOM_UTF32 = BOM_UTF32_BE

# Old broken names (don't use in new code) -- kept only as
# backward-compatible aliases for the modern BOM_UTF16_*/BOM_UTF32_* names.
BOM32_LE = BOM_UTF16_LE
BOM32_BE = BOM_UTF16_BE
BOM64_LE = BOM_UTF32_LE
BOM64_BE = BOM_UTF32_BE
79
80
81 ### Codec base classes (defining the API)
82
class CodecInfo(tuple):
    """Codec details when looking up the codec registry.

    Behaves as the 4-tuple (encode, decode, streamreader, streamwriter)
    for backward compatibility, while also exposing all codec components
    as named attributes.
    """

    # Private API to allow Python 3.4 to denylist the known non-Unicode
    # codecs in the standard library. A more general mechanism to
    # reliably distinguish test encodings from other codecs will hopefully
    # be defined for Python 3.5
    #
    # See http://bugs.python.org/issue19619
    _is_text_encoding = True    # Assume codecs are text encodings by default

    def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
                incrementalencoder=None, incrementaldecoder=None, name=None,
                *, _is_text_encoding=None):
        # The tuple payload keeps the legacy 4-tuple unpacking working.
        obj = tuple.__new__(cls, (encode, decode, streamreader, streamwriter))
        obj.name = name
        obj.encode = encode
        obj.decode = decode
        obj.incrementalencoder = incrementalencoder
        obj.incrementaldecoder = incrementaldecoder
        obj.streamwriter = streamwriter
        obj.streamreader = streamreader
        # Only override the class-level default when explicitly requested.
        if _is_text_encoding is not None:
            obj._is_text_encoding = _is_text_encoding
        return obj

    def __repr__(self):
        return "<%s.%s object for encoding %s at %#x>" % (
            self.__class__.__module__, self.__class__.__qualname__,
            self.name, id(self))
113
class Codec:

    """ Defines the interface for stateless encoders/decoders.

        The .encode()/.decode() methods may use different error
        handling schemes by providing the errors argument. These
        string values are predefined:

         'strict' - raise a ValueError error (or a subclass)
         'ignore' - ignore the character and continue with the next
         'replace' - replace with a suitable replacement character;
                    Python will use the official U+FFFD REPLACEMENT
                    CHARACTER for the builtin Unicode codecs on
                    decoding and '?' on encoding.
         'surrogateescape' - replace with private code points U+DCnn.
         'xmlcharrefreplace' - Replace with the appropriate XML
                               character reference (only for encoding).
         'backslashreplace'  - Replace with backslashed escape sequences.
         'namereplace'       - Replace with \\N{...} escape sequences
                               (only for encoding).

        The set of allowed values can be extended via register_error.

    """
    def encode(self, input, errors='strict'):
        """ Encode input and return a (output object, length consumed) tuple.

            errors selects the error handling scheme; it defaults to
            'strict'.

            Implementations must not keep state on the Codec instance --
            use StreamWriter when state is required for efficient
            encoding -- and must accept zero-length input, returning an
            empty object of the output type in that case.
        """
        # Abstract: concrete codecs override this.
        raise NotImplementedError

    def decode(self, input, errors='strict'):
        """ Decode input and return a (output object, length consumed) tuple.

            input must be an object which provides the bf_getreadbuf
            buffer slot (e.g. bytes, buffer objects, memory mapped files).

            errors selects the error handling scheme; it defaults to
            'strict'.

            Implementations must not keep state on the Codec instance --
            use StreamReader when state is required for efficient
            decoding -- and must accept zero-length input, returning an
            empty object of the output type in that case.
        """
        # Abstract: concrete codecs override this.
        raise NotImplementedError
179
class IncrementalEncoder(object):
    """
    An IncrementalEncoder encodes an input in multiple steps. The input can
    be passed piece by piece to the encode() method. The IncrementalEncoder
    remembers the state of the encoding process between calls to encode().
    """

    def __init__(self, errors='strict'):
        """
        Create an IncrementalEncoder instance.

        The errors keyword argument selects the error handling scheme;
        see the module docstring for the predefined values.
        """
        self.errors = errors
        # Pending output/state holder; kept empty in the stateless base class.
        self.buffer = ""

    def encode(self, input, final=False):
        """
        Encode input and return the resulting object.
        """
        # Abstract: concrete incremental encoders override this.
        raise NotImplementedError

    def reset(self):
        """
        Reset the encoder to the initial state.
        """
        # The base class holds no state, so there is nothing to clear.

    def getstate(self):
        """
        Return the current state of the encoder.
        """
        # 0 denotes the initial (empty) state.
        return 0

    def setstate(self, state):
        """
        Set the current state of the encoder. state must have been
        returned by getstate().
        """
        # The base class holds no state, so there is nothing to restore.
219
class BufferedIncrementalEncoder(IncrementalEncoder):
    """
    Base class for incremental encoders which must keep some of the
    not-yet-encoded input in a buffer between calls to encode().
    """

    def __init__(self, errors='strict'):
        IncrementalEncoder.__init__(self, errors)
        # Unencoded input carried over to the next encode() call.
        self.buffer = ""

    def _buffer_encode(self, input, errors, final):
        # Subclasses override this: it must encode input and return an
        # (output, length consumed) tuple.
        raise NotImplementedError

    def encode(self, input, final=False):
        """Encode input, prepending any input buffered from earlier calls."""
        pending = self.buffer + input
        result, consumed = self._buffer_encode(pending, self.errors, final)
        # Whatever was not consumed waits for the next call.
        self.buffer = pending[consumed:]
        return result

    def reset(self):
        IncrementalEncoder.reset(self)
        self.buffer = ""

    def getstate(self):
        # An empty buffer maps to the canonical initial state 0.
        return self.buffer or 0

    def setstate(self, state):
        # State 0 (initial) maps back to an empty buffer.
        self.buffer = state or ""
253
class IncrementalDecoder(object):
    """
    An IncrementalDecoder decodes an input in multiple steps. The input can
    be passed piece by piece to the decode() method. The IncrementalDecoder
    remembers the state of the decoding process between calls to decode().
    """

    def __init__(self, errors='strict'):
        """
        Create an IncrementalDecoder instance.

        The errors keyword argument selects the error handling scheme;
        see the module docstring for the predefined values.
        """
        self.errors = errors

    def decode(self, input, final=False):
        """
        Decode input and return the resulting object.
        """
        # Abstract: concrete incremental decoders override this.
        raise NotImplementedError

    def reset(self):
        """
        Reset the decoder to the initial state.
        """
        # The base class holds no state, so there is nothing to clear.

    def getstate(self):
        """
        Return the current state of the decoder.

        This must be a (buffered_input, additional_state_info) tuple.
        buffered_input must be a bytes object containing bytes that
        were passed to decode() that have not yet been converted.
        additional_state_info must be a non-negative integer
        representing the state of the decoder WITHOUT yet having
        processed the contents of buffered_input. In the initial state
        and after reset(), getstate() must return (b"", 0).
        """
        return (b"", 0)

    def setstate(self, state):
        """
        Set the current state of the decoder.

        state must have been returned by getstate(). The effect of
        setstate((b"", 0)) must be equivalent to reset().
        """
        # The base class holds no state, so there is nothing to restore.
302
class BufferedIncrementalDecoder(IncrementalDecoder):
    """
    Base class for incremental decoders which must be able to handle
    incomplete byte sequences by buffering them between calls to decode().
    """

    def __init__(self, errors='strict'):
        IncrementalDecoder.__init__(self, errors)
        # Undecoded bytes carried over to the next decode() call.
        self.buffer = b""

    def _buffer_decode(self, input, errors, final):
        # Subclasses override this: it must decode input and return an
        # (output, length consumed) tuple.
        raise NotImplementedError

    def decode(self, input, final=False):
        """Decode input, prepending any bytes buffered from earlier calls."""
        pending = self.buffer + input
        result, consumed = self._buffer_decode(pending, self.errors, final)
        # Whatever was not consumed waits for the next call.
        self.buffer = pending[consumed:]
        return result

    def reset(self):
        IncrementalDecoder.reset(self)
        self.buffer = b""

    def getstate(self):
        # The additional state info is always 0 for buffered decoders.
        return (self.buffer, 0)

    def setstate(self, state):
        # Only the buffered bytes matter; the extra state info is ignored.
        self.buffer = state[0]
338
339 #
340 # The StreamWriter and StreamReader class provide generic working
341 # interfaces which can be used to implement new encoding submodules
342 # very easily. See encodings/utf_8.py for an example on how this is
343 # done.
344 #
345
class StreamWriter(Codec):

    def __init__(self, stream, errors='strict'):
        """ Create a StreamWriter instance.

            stream must be a file-like object open for writing.

            The errors keyword argument selects the error handling
            scheme. These parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character
             'xmlcharrefreplace' - Replace with the appropriate XML
                                   character reference.
             'backslashreplace'  - Replace with backslashed escape
                                   sequences.
             'namereplace'       - Replace with \\N{...} escape sequences.

            The set of allowed parameter values can be extended via
            register_error.
        """
        self.stream = stream
        self.errors = errors

    def write(self, object):
        """ Write the object's contents encoded to self.stream. """
        data, _consumed = self.encode(object, self.errors)
        self.stream.write(data)

    def writelines(self, list):
        """ Write the concatenated iterable of strings to the stream
            using .write().
        """
        self.write(''.join(list))

    def reset(self):
        """ Reset the codec buffers used for keeping internal state.

            Calling this method should ensure that the data on the
            output is put into a clean state, that allows appending
            of new fresh data without having to rescan the whole
            stream to recover state.
        """
        # Stateless by default; stateful codecs override this.
        pass

    def seek(self, offset, whence=0):
        """ Reposition the underlying stream; rewinding to the start
            also resets the codec state.
        """
        self.stream.seek(offset, whence)
        if whence == 0 and offset == 0:
            self.reset()

    def __getattr__(self, name,
                    getattr=getattr):
        """ Inherit all other methods from the underlying stream. """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        # Close the wrapped stream when leaving the with-block.
        self.stream.close()

    def __reduce_ex__(self, proto):
        # Stream wrappers hold live I/O objects and cannot be pickled.
        raise TypeError("can't serialize %s" % self.__class__.__name__)
419
420 ###
421
class StreamReader(Codec):

    # Type of the decoded-character buffer; subclasses may override it
    # (self._empty_charbuffer is derived from it in __init__).
    charbuffertype = str

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamReader instance.

            stream must be a file-like object open for reading.

            The StreamReader may use different error handling
            schemes by providing the errors keyword argument. These
            parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character
             'backslashreplace' - Replace with backslashed escape sequences;

            The set of allowed parameter values can be extended via
            register_error.
        """
        self.stream = stream
        self.errors = errors
        # Undecoded bytes kept between calls to read()
        self.bytebuffer = b""
        self._empty_charbuffer = self.charbuffertype()
        # Decoded characters not yet returned to the caller
        self.charbuffer = self._empty_charbuffer
        # Cache of already-split lines used by readline(); when set,
        # charbuffer is None and vice versa (the two are mutually exclusive)
        self.linebuffer = None

    def decode(self, input, errors='strict'):
        # Abstract: concrete codecs override this with the actual decoder.
        raise NotImplementedError

    def read(self, size=-1, chars=-1, firstline=False):

        """ Decodes data from the stream self.stream and returns the
            resulting object.

            chars indicates the number of decoded code points or bytes to
            return. read() will never return more data than requested,
            but it might return less, if there is not enough available.

            size indicates the approximate maximum number of decoded
            bytes or code points to read for decoding. The decoder
            can modify this setting as appropriate. The default value
            -1 indicates to read and decode as much as possible. size
            is intended to prevent having to decode huge files in one
            step.

            If firstline is true, and a UnicodeDecodeError happens
            after the first line terminator in the input only the first line
            will be returned, the rest of the input will be kept until the
            next call to read().

            The method should use a greedy read strategy, meaning that
            it should read as much data as is allowed within the
            definition of the encoding and the given size, e.g.  if
            optional encoding endings or state markers are available
            on the stream, these should be read too.
        """
        # If we have lines cached, first merge them back into characters
        if self.linebuffer:
            self.charbuffer = self._empty_charbuffer.join(self.linebuffer)
            self.linebuffer = None

        if chars < 0:
            # For compatibility with other read() methods that take a
            # single argument
            chars = size

        # read until we get the required number of characters (if available)
        while True:
            # can the request be satisfied from the character buffer?
            if chars >= 0:
                if len(self.charbuffer) >= chars:
                    break
            # we need more data
            if size < 0:
                newdata = self.stream.read()
            else:
                newdata = self.stream.read(size)
            # decode bytes (those remaining from the last call included)
            data = self.bytebuffer + newdata
            if not data:
                break
            try:
                newchars, decodedbytes = self.decode(data, self.errors)
            except UnicodeDecodeError as exc:
                if firstline:
                    # Retry decoding only the part before the error; if
                    # that yields more than one line, the first complete
                    # line can still be returned (the raise below is
                    # reached when it cannot).
                    newchars, decodedbytes = \
                        self.decode(data[:exc.start], self.errors)
                    lines = newchars.splitlines(keepends=True)
                    if len(lines)<=1:
                        raise
                else:
                    raise
            # keep undecoded bytes until the next call
            self.bytebuffer = data[decodedbytes:]
            # put new characters in the character buffer
            self.charbuffer += newchars
            # there was no data available
            if not newdata:
                break
        if chars < 0:
            # Return everything we've got
            result = self.charbuffer
            self.charbuffer = self._empty_charbuffer
        else:
            # Return the first chars characters
            result = self.charbuffer[:chars]
            self.charbuffer = self.charbuffer[chars:]
        return result

    def readline(self, size=None, keepends=True):

        """ Read one line from the input stream and return the
            decoded data.

            size, if given, is passed as size argument to the
            read() method.

        """
        # If we have lines cached from an earlier read, return
        # them unconditionally
        if self.linebuffer:
            line = self.linebuffer[0]
            del self.linebuffer[0]
            if len(self.linebuffer) == 1:
                # revert to charbuffer mode; we might need more data
                # next time
                self.charbuffer = self.linebuffer[0]
                self.linebuffer = None
            if not keepends:
                line = line.splitlines(keepends=False)[0]
            return line

        # 72 is an arbitrary small starting chunk size; it is doubled
        # below (up to 8000) until a full line has been read.
        readsize = size or 72
        line = self._empty_charbuffer
        # If size is given, we call read() only once
        while True:
            data = self.read(readsize, firstline=True)
            if data:
                # If we're at a "\r" read one extra character (which might
                # be a "\n") to get a proper line ending. If the stream is
                # temporarily exhausted we return the wrong line ending.
                if (isinstance(data, str) and data.endswith("\r")) or \
                   (isinstance(data, bytes) and data.endswith(b"\r")):
                    data += self.read(size=1, chars=1)

            line += data
            lines = line.splitlines(keepends=True)
            if lines:
                if len(lines) > 1:
                    # More than one line result; the first line is a full line
                    # to return
                    line = lines[0]
                    del lines[0]
                    if len(lines) > 1:
                        # cache the remaining lines
                        lines[-1] += self.charbuffer
                        self.linebuffer = lines
                        self.charbuffer = None
                    else:
                        # only one remaining line, put it back into charbuffer
                        self.charbuffer = lines[0] + self.charbuffer
                    if not keepends:
                        line = line.splitlines(keepends=False)[0]
                    break
                line0withend = lines[0]
                line0withoutend = lines[0].splitlines(keepends=False)[0]
                if line0withend != line0withoutend: # We really have a line end
                    # Put the rest back together and keep it until the next call
                    self.charbuffer = self._empty_charbuffer.join(lines[1:]) + \
                                      self.charbuffer
                    if keepends:
                        line = line0withend
                    else:
                        line = line0withoutend
                    break
            # we didn't get anything or this was our only try
            if not data or size is not None:
                if line and not keepends:
                    line = line.splitlines(keepends=False)[0]
                break
            if readsize < 8000:
                readsize *= 2
        return line

    def readlines(self, sizehint=None, keepends=True):

        """ Read all lines available on the input stream
            and return them as a list.

            Line breaks are implemented using the codec's decoder
            method and are included in the list entries.

            sizehint, if given, is ignored since there is no efficient
            way to finding the true end-of-line.

        """
        data = self.read()
        return data.splitlines(keepends)

    def reset(self):

        """ Resets the codec buffers used for keeping internal state.

            Note that no stream repositioning should take place.
            This method is primarily intended to be able to recover
            from decoding errors.

        """
        self.bytebuffer = b""
        self.charbuffer = self._empty_charbuffer
        self.linebuffer = None

    def seek(self, offset, whence=0):
        """ Set the input stream's current position.

            Resets the codec buffers used for keeping state.
        """
        self.stream.seek(offset, whence)
        self.reset()

    def __next__(self):

        """ Return the next decoded line from the input stream."""
        line = self.readline()
        if line:
            return line
        raise StopIteration

    def __iter__(self):
        return self

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()

    def __reduce_ex__(self, proto):
        raise TypeError("can't serialize %s" % self.__class__.__name__)
671
672 ###
673
class StreamReaderWriter:

    """ StreamReaderWriter instances allow wrapping streams which
        work in both read and write modes.

        The design is such that one can use the factory functions
        returned by the codec.lookup() function to construct the
        instance.

    """
    # Optional attributes set by the file wrappers below
    encoding = 'unknown'

    def __init__(self, stream, Reader, Writer, errors='strict'):
        """ Create a StreamReaderWriter instance.

            stream must be a Stream-like object.

            Reader, Writer must be factory functions or classes
            providing the StreamReader, StreamWriter interface resp.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.
        """
        self.stream = stream
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):
        # Delegate to the decoding reader.
        return self.reader.read(size)

    def readline(self, size=None):
        # Delegate to the decoding reader.
        return self.reader.readline(size)

    def readlines(self, sizehint=None):
        # Delegate to the decoding reader.
        return self.reader.readlines(sizehint)

    def __next__(self):
        """ Return the next decoded line from the input stream."""
        return next(self.reader)

    def __iter__(self):
        return self

    def write(self, data):
        # Delegate to the encoding writer.
        return self.writer.write(data)

    def writelines(self, list):
        # Delegate to the encoding writer.
        return self.writer.writelines(list)

    def reset(self):
        # Clear codec state on both directions.
        self.reader.reset()
        self.writer.reset()

    def seek(self, offset, whence=0):
        # Reposition the stream; the reader must always drop its
        # buffers, the writer only when rewinding to the start.
        self.stream.seek(offset, whence)
        self.reader.reset()
        if whence == 0 and offset == 0:
            self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):
        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    # these are needed to make "with StreamReaderWriter(...)" work properly

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()

    def __reduce_ex__(self, proto):
        # Wraps a live stream; pickling makes no sense.
        raise TypeError("can't serialize %s" % self.__class__.__name__)
761
762 ###
763
class StreamRecoder:

    """ StreamRecoder instances translate data from one encoding to another.

        They use the complete set of APIs returned by the
        codecs.lookup() function to implement their task.

        Data written to the StreamRecoder is first decoded into an
        intermediate format (depending on the "decode" codec) and then
        written to the underlying stream using an instance of the provided
        Writer class.

        In the other direction, data is read from the underlying stream using
        a Reader instance and then encoded and returned to the caller.

    """
    # Optional attributes set by the file wrappers below
    data_encoding = 'unknown'
    file_encoding = 'unknown'

    def __init__(self, stream, encode, decode, Reader, Writer,
                 errors='strict'):
        """ Create a StreamRecoder instance which implements a two-way
            conversion: encode and decode work on the frontend (the
            data visible to .read() and .write()) while Reader and Writer
            work on the backend (the data in stream).

            You can use these objects to do transparent
            transcodings from e.g. latin-1 to utf-8 and back.

            stream must be a file-like object.

            encode and decode must adhere to the Codec interface; Reader and
            Writer must be factory functions or classes providing the
            StreamReader and StreamWriter interfaces resp.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.
        """
        self.stream = stream
        self.encode = encode
        self.decode = decode
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):
        # Read via the backend reader, then re-encode for the caller.
        decoded = self.reader.read(size)
        encoded, _consumed = self.encode(decoded, self.errors)
        return encoded

    def readline(self, size=None):
        # Read one line via the backend reader, then re-encode it.
        if size is None:
            decoded = self.reader.readline()
        else:
            decoded = self.reader.readline(size)
        encoded, _consumed = self.encode(decoded, self.errors)
        return encoded

    def readlines(self, sizehint=None):
        # Re-encode the whole remaining input and split it into lines.
        decoded = self.reader.read()
        encoded, _consumed = self.encode(decoded, self.errors)
        return encoded.splitlines(keepends=True)

    def __next__(self):
        """ Return the next decoded line from the input stream."""
        decoded = next(self.reader)
        encoded, _consumed = self.encode(decoded, self.errors)
        return encoded

    def __iter__(self):
        return self

    def write(self, data):
        # Decode the frontend data, then let the backend writer encode
        # it onto the stream.
        decoded, _consumed = self.decode(data, self.errors)
        return self.writer.write(decoded)

    def writelines(self, list):
        # Join first so the frontend decode sees one contiguous chunk.
        joined = b''.join(list)
        decoded, _consumed = self.decode(joined, self.errors)
        return self.writer.write(decoded)

    def reset(self):
        # Clear codec state on both directions.
        self.reader.reset()
        self.writer.reset()

    def seek(self, offset, whence=0):
        # Seeks must be propagated to both the readers and writers
        # as they might need to reset their internal buffers.
        self.reader.seek(offset, whence)
        self.writer.seek(offset, whence)

    def __getattr__(self, name,
                    getattr=getattr):
        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()

    def __reduce_ex__(self, proto):
        # Wraps a live stream; pickling makes no sense.
        raise TypeError("can't serialize %s" % self.__class__.__name__)
880
881 ### Shortcuts
882
def open(filename, mode='r', encoding=None, errors='strict', buffering=-1):
    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        If encoding is not None, then the
        underlying encoded files are always opened in binary mode.
        The default file mode is 'r', meaning to open the file in read mode.

        encoding specifies the encoding which is to be used for the
        file.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering has the same meaning as for the builtin open() API.
        It defaults to -1 which means that the default buffer size will
        be used.

        The returned wrapped file object provides an extra attribute
        .encoding which allows querying the used encoding. This
        attribute is only available if an encoding was specified as
        parameter.
    """
    # The codec layer does the text decoding itself, so the underlying
    # file must be opened in binary mode.
    if encoding is not None and 'b' not in mode:
        mode = mode + 'b'
    file = builtins.open(filename, mode, buffering)
    if encoding is None:
        return file

    try:
        info = lookup(encoding)
        srw = StreamReaderWriter(file, info.streamreader, info.streamwriter,
                                 errors)
        # Add attributes to simplify introspection
        srw.encoding = encoding
        return srw
    except:
        # Don't leak the file handle on any failure (including
        # KeyboardInterrupt); re-raise afterwards.
        file.close()
        raise
931
def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):
    """ Return a wrapped version of file which provides transparent
        encoding translation.

        Data written to the wrapped file is decoded according
        to the given data_encoding and then encoded to the underlying
        file using file_encoding. The intermediate data type
        will usually be Unicode but depends on the specified codecs.

        Bytes read from the file are decoded using file_encoding and then
        passed back to the caller encoded using data_encoding.

        If file_encoding is not given, it defaults to data_encoding.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        The returned wrapped file object provides two extra attributes
        .data_encoding and .file_encoding which reflect the given
        parameters of the same name. The attributes can be used for
        introspection by Python programs.
    """
    if file_encoding is None:
        file_encoding = data_encoding
    # Frontend codec handles the caller-visible data; backend codec
    # handles what actually lives in the file.
    data_info = lookup(data_encoding)
    file_info = lookup(file_encoding)
    sr = StreamRecoder(file, data_info.encode, data_info.decode,
                       file_info.streamreader, file_info.streamwriter, errors)
    # Add attributes to simplify introspection
    sr.data_encoding = data_encoding
    sr.file_encoding = file_encoding
    return sr
967
968 ### Helpers for codec lookup
969
def getencoder(encoding):
    """ Look up the codec for the given encoding and return
        its encoder function.

        Raises a LookupError in case the encoding cannot be found.
    """
    codec = lookup(encoding)
    return codec.encode
979
def getdecoder(encoding):
    """ Look up the codec for the given encoding and return
        its decoder function.

        Raises a LookupError in case the encoding cannot be found.
    """
    codec = lookup(encoding)
    return codec.decode
989
def getincrementalencoder(encoding):
    """ Look up the codec for the given encoding and return
        its IncrementalEncoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codec doesn't provide an incremental encoder.
    """
    codec = lookup(encoding)
    # Not every codec supports incremental encoding; surface that as a
    # LookupError for the requested encoding.
    if codec.incrementalencoder is None:
        raise LookupError(encoding)
    return codec.incrementalencoder
1003
def getincrementaldecoder(encoding):
    """ Look up the codec for the given encoding and return
        its IncrementalDecoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codec doesn't provide an incremental decoder.
    """
    codec = lookup(encoding)
    # Not every codec supports incremental decoding; surface that as a
    # LookupError for the requested encoding.
    if codec.incrementaldecoder is None:
        raise LookupError(encoding)
    return codec.incrementaldecoder
1017
def getreader(encoding):
    """ Look up the codec for the given encoding and return
        its StreamReader class or factory function.

        Raises a LookupError in case the encoding cannot be found.
    """
    codec = lookup(encoding)
    return codec.streamreader
1027
def getwriter(encoding):
    """ Look up the codec for the given encoding and return
        its StreamWriter class or factory function.

        Raises a LookupError in case the encoding cannot be found.
    """
    codec = lookup(encoding)
    return codec.streamwriter
1037
def iterencode(iterator, encoding, errors='strict', **kwargs):
    """
    Encoding iterator.

    Encodes the input strings from the iterator using an IncrementalEncoder.

    errors and kwargs are passed through to the IncrementalEncoder
    constructor.
    """
    encoder = getincrementalencoder(encoding)(errors, **kwargs)
    for input in iterator:
        chunk = encoder.encode(input)
        if chunk:
            yield chunk
    # Flush: signal end-of-input so buffered state is emitted.
    chunk = encoder.encode("", True)
    if chunk:
        yield chunk
1055
def iterdecode(iterator, encoding, errors='strict', **kwargs):
    """
    Decoding iterator.

    Decodes the input strings from the iterator using an IncrementalDecoder.

    errors and kwargs are passed through to the IncrementalDecoder
    constructor.
    """
    decoder = getincrementaldecoder(encoding)(errors, **kwargs)
    for input in iterator:
        chunk = decoder.decode(input)
        if chunk:
            yield chunk
    # Flush: signal end-of-input so buffered bytes are decoded.
    chunk = decoder.decode(b"", True)
    if chunk:
        yield chunk
1073
1074 ### Helpers for charmap-based codecs
1075
def make_identity_dict(rng):
    """ make_identity_dict(rng) -> dict

        Return a dictionary where elements of the rng sequence are
        mapped to themselves.
    """
    # rng may be a one-shot iterator, so iterate it exactly once.
    return {element: element for element in rng}
1085
def make_encoding_map(decoding_map):
    """ Creates an encoding map from a decoding map.

        If a target mapping in the decoding map occurs multiple
        times, then that target is mapped to None (undefined mapping),
        causing an exception when encountered by the charmap codec
        during translation.

        One example where this happens is cp875.py which decodes
        multiple character to \\u001a.

    """
    m = {}
    for k, v in decoding_map.items():
        # Idiomatic membership test ("v not in m" rather than "not v in m").
        if v not in m:
            m[v] = k
        else:
            # Ambiguous target: undefine it so the charmap codec raises
            # instead of silently picking one of the candidates.
            m[v] = None
    return m
1106
1107 ### error handlers
1108
# Pre-fetch the standard error handler callables so they can be passed
# around directly (e.g. codecs.strict_errors) without a registry lookup.
try:
    strict_errors = lookup_error("strict")
    ignore_errors = lookup_error("ignore")
    replace_errors = lookup_error("replace")
    xmlcharrefreplace_errors = lookup_error("xmlcharrefreplace")
    backslashreplace_errors = lookup_error("backslashreplace")
    namereplace_errors = lookup_error("namereplace")
except LookupError:
    # In --disable-unicode builds, these error handlers are missing
    strict_errors = None
    ignore_errors = None
    replace_errors = None
    xmlcharrefreplace_errors = None
    backslashreplace_errors = None
    namereplace_errors = None
1124
# Tell modulefinder that using codecs probably needs the encodings
# package.  The import is guarded by a constant false flag so it is
# never executed at runtime, yet static import scanners still see it.
_false = 0
if _false:
    import encodings

### Tests

if __name__ == '__main__':

    # Make stdout translate Latin-1 output into UTF-8 output
    sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8')

    # Have stdin translate Latin-1 input into UTF-8 input
    sys.stdin = EncodedFile(sys.stdin, 'utf-8', 'latin-1')