1  """ Python 'utf-8-sig' Codec
This works similarly to UTF-8, with the following changes:
       3  
       4  * On encoding/writing a UTF-8 encoded BOM will be prepended/written as the
       5    first three bytes.
       6  
       7  * On decoding/reading if the first three bytes are a UTF-8 encoded BOM, these
       8    bytes will be skipped.
       9  """
      10  import codecs
      11  
      12  ### Codec APIs
      13  
      14  def encode(input, errors='strict'):
      15      return (codecs.BOM_UTF8 + codecs.utf_8_encode(input, errors)[0],
      16              len(input))
      17  
      18  def decode(input, errors='strict'):
      19      prefix = 0
      20      if input[:3] == codecs.BOM_UTF8:
      21          input = input[3:]
      22          prefix = 3
      23      (output, consumed) = codecs.utf_8_decode(input, errors, True)
      24      return (output, consumed+prefix)
      25  
      26  class ESC[4;38;5;81mIncrementalEncoder(ESC[4;38;5;149mcodecsESC[4;38;5;149m.ESC[4;38;5;149mIncrementalEncoder):
      27      def __init__(self, errors='strict'):
      28          codecs.IncrementalEncoder.__init__(self, errors)
      29          self.first = 1
      30  
      31      def encode(self, input, final=False):
      32          if self.first:
      33              self.first = 0
      34              return codecs.BOM_UTF8 + \
      35                     codecs.utf_8_encode(input, self.errors)[0]
      36          else:
      37              return codecs.utf_8_encode(input, self.errors)[0]
      38  
      39      def reset(self):
      40          codecs.IncrementalEncoder.reset(self)
      41          self.first = 1
      42  
      43      def getstate(self):
      44          return self.first
      45  
      46      def setstate(self, state):
      47          self.first = state
      48  
      49  class ESC[4;38;5;81mIncrementalDecoder(ESC[4;38;5;149mcodecsESC[4;38;5;149m.ESC[4;38;5;149mBufferedIncrementalDecoder):
      50      def __init__(self, errors='strict'):
      51          codecs.BufferedIncrementalDecoder.__init__(self, errors)
      52          self.first = 1
      53  
      54      def _buffer_decode(self, input, errors, final):
      55          if self.first:
      56              if len(input) < 3:
      57                  if codecs.BOM_UTF8.startswith(input):
      58                      # not enough data to decide if this really is a BOM
      59                      # => try again on the next call
      60                      return ("", 0)
      61                  else:
      62                      self.first = 0
      63              else:
      64                  self.first = 0
      65                  if input[:3] == codecs.BOM_UTF8:
      66                      (output, consumed) = \
      67                         codecs.utf_8_decode(input[3:], errors, final)
      68                      return (output, consumed+3)
      69          return codecs.utf_8_decode(input, errors, final)
      70  
      71      def reset(self):
      72          codecs.BufferedIncrementalDecoder.reset(self)
      73          self.first = 1
      74  
      75      def getstate(self):
      76          state = codecs.BufferedIncrementalDecoder.getstate(self)
      77          # state[1] must be 0 here, as it isn't passed along to the caller
      78          return (state[0], self.first)
      79  
      80      def setstate(self, state):
      81          # state[1] will be ignored by BufferedIncrementalDecoder.setstate()
      82          codecs.BufferedIncrementalDecoder.setstate(self, state)
      83          self.first = state[1]
      84  
      85  class ESC[4;38;5;81mStreamWriter(ESC[4;38;5;149mcodecsESC[4;38;5;149m.ESC[4;38;5;149mStreamWriter):
      86      def reset(self):
      87          codecs.StreamWriter.reset(self)
      88          try:
      89              del self.encode
      90          except AttributeError:
      91              pass
      92  
      93      def encode(self, input, errors='strict'):
      94          self.encode = codecs.utf_8_encode
      95          return encode(input, errors)
      96  
      97  class ESC[4;38;5;81mStreamReader(ESC[4;38;5;149mcodecsESC[4;38;5;149m.ESC[4;38;5;149mStreamReader):
      98      def reset(self):
      99          codecs.StreamReader.reset(self)
     100          try:
     101              del self.decode
     102          except AttributeError:
     103              pass
     104  
     105      def decode(self, input, errors='strict'):
     106          if len(input) < 3:
     107              if codecs.BOM_UTF8.startswith(input):
     108                  # not enough data to decide if this is a BOM
     109                  # => try again on the next call
     110                  return ("", 0)
     111          elif input[:3] == codecs.BOM_UTF8:
     112              self.decode = codecs.utf_8_decode
     113              (output, consumed) = codecs.utf_8_decode(input[3:],errors)
     114              return (output, consumed+3)
     115          # (else) no BOM present
     116          self.decode = codecs.utf_8_decode
     117          return codecs.utf_8_decode(input, errors)
     118  
     119  ### encodings module API
     120  
     121  def getregentry():
     122      return codecs.CodecInfo(
     123          name='utf-8-sig',
     124          encode=encode,
     125          decode=decode,
     126          incrementalencoder=IncrementalEncoder,
     127          incrementaldecoder=IncrementalDecoder,
     128          streamreader=StreamReader,
     129          streamwriter=StreamWriter,
     130      )