1 """ Python 'utf-8-sig' Codec
2 This works similarly to UTF-8, with the following changes:
3
4 * On encoding/writing a UTF-8 encoded BOM will be prepended/written as the
5 first three bytes.
6
7 * On decoding/reading if the first three bytes are a UTF-8 encoded BOM, these
8 bytes will be skipped.
9 """
10 import codecs
11
12 ### Codec APIs
13
def encode(input, errors='strict'):
    """Encode *input* as UTF-8 and prepend the UTF-8 encoded BOM.

    Returns a ``(data, length)`` tuple: *data* is ``codecs.BOM_UTF8``
    followed by the UTF-8 encoding of *input*, and *length* is
    ``len(input)``.  *errors* is passed on to ``codecs.utf_8_encode``.
    """
    payload, _ = codecs.utf_8_encode(input, errors)
    return (codecs.BOM_UTF8 + payload, len(input))
17
def decode(input, errors='strict'):
    """Decode UTF-8 bytes, skipping a UTF-8 encoded BOM at the start.

    Returns a ``(text, consumed)`` tuple.  When *input* starts with
    ``codecs.BOM_UTF8`` those bytes are dropped before decoding and are
    added to the consumed count.  The input is always decoded as final.
    """
    bom = codecs.BOM_UTF8
    skipped = 0
    if input[:3] == bom:
        skipped = len(bom)
        input = input[skipped:]
    text, used = codecs.utf_8_decode(input, errors, True)
    return (text, used + skipped)
25
class IncrementalEncoder(codecs.IncrementalEncoder):
    """Incremental 'utf-8-sig' encoder.

    ``self.first`` is truthy until the first ``encode()`` call, which is
    the only call whose output is prefixed with the UTF-8 encoded BOM.
    """

    def __init__(self, errors='strict'):
        codecs.IncrementalEncoder.__init__(self, errors)
        self.first = 1

    def encode(self, input, final=False):
        """Return *input* encoded as UTF-8, with a BOM on the first call."""
        prefix = b""
        if self.first:
            # Clear the flag before encoding, so it stays cleared even if
            # the encoding step raises.
            self.first = 0
            prefix = codecs.BOM_UTF8
        encoded = codecs.utf_8_encode(input, self.errors)[0]
        return prefix + encoded

    def reset(self):
        """Reset the encoder so the next ``encode()`` emits a BOM again."""
        codecs.IncrementalEncoder.reset(self)
        self.first = 1

    def getstate(self):
        """Return the encoder state, which is just the ``first`` flag."""
        return self.first

    def setstate(self, state):
        """Restore a state previously returned by ``getstate()``."""
        self.first = state
48
class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
    """Incremental 'utf-8-sig' decoder.

    ``self.first`` is truthy until enough input has been seen to decide
    whether the stream starts with a UTF-8 encoded BOM.  A leading BOM is
    skipped; everything else is decoded as plain UTF-8.
    """

    def __init__(self, errors='strict'):
        codecs.BufferedIncrementalDecoder.__init__(self, errors)
        self.first = 1

    def _buffer_decode(self, input, errors, final):
        """Decode *input*, skipping a BOM at the very start of the stream.

        Returns a ``(text, consumed)`` tuple.  When a BOM is skipped, its
        3 bytes are included in *consumed*.  While the start of the stream
        is still undecided and *input* is a proper prefix of the BOM,
        nothing is consumed -- unless *final* is true, in which case the
        truncated bytes are handed to the UTF-8 decoder so that the error
        handler deals with them instead of them being dropped silently.
        """
        if self.first:
            if len(input) < 3:
                if codecs.BOM_UTF8.startswith(input) and not (final and input):
                    # not enough data to decide if this really is a BOM
                    # => try again on the next call
                    return ("", 0)
                # Either these bytes cannot start a BOM, or this is the
                # final call and a partial BOM can never be completed:
                # treat the input as ordinary UTF-8 data.
                self.first = 0
            else:
                self.first = 0
                if input[:3] == codecs.BOM_UTF8:
                    (output, consumed) = \
                        codecs.utf_8_decode(input[3:], errors, final)
                    return (output, consumed+3)
        return codecs.utf_8_decode(input, errors, final)

    def reset(self):
        """Reset the decoder so a BOM at the start is skipped again."""
        codecs.BufferedIncrementalDecoder.reset(self)
        self.first = 1

    def getstate(self):
        """Return ``(buffered_bytes, first_flag)``."""
        state = codecs.BufferedIncrementalDecoder.getstate(self)
        # state[1] must be 0 here, as it isn't passed along to the caller
        return (state[0], self.first)

    def setstate(self, state):
        """Restore a state previously returned by ``getstate()``."""
        # state[1] will be ignored by BufferedIncrementalDecoder.setstate()
        codecs.BufferedIncrementalDecoder.setstate(self, state)
        self.first = state[1]
84
class StreamWriter(codecs.StreamWriter):
    """Stream writer that emits a UTF-8 encoded BOM before the first data.

    The first ``encode()`` call stores ``codecs.utf_8_encode`` on the
    instance under the name ``encode``, shadowing the method, so only that
    first call adds the BOM.  ``reset()`` removes the shadowing attribute.
    """

    def reset(self):
        """Reset the writer so the next ``encode()`` emits a BOM again."""
        codecs.StreamWriter.reset(self)
        # Drop the per-instance override, if one was installed.
        if "encode" in vars(self):
            del self.encode

    def encode(self, input, errors='strict'):
        """Encode *input* with a leading BOM; later calls skip the BOM."""
        # Install the override first, as the original ordering did.
        self.encode = codecs.utf_8_encode
        with_bom = encode(input, errors)
        return with_bom
96
class StreamReader(codecs.StreamReader):
    """Stream reader that skips a UTF-8 encoded BOM at the start.

    Once ``decode()`` has seen enough bytes to decide whether a BOM is
    present, it stores ``codecs.utf_8_decode`` on the instance under the
    name ``decode``, shadowing the method.  ``reset()`` removes that
    attribute so BOM detection runs again.
    """

    def reset(self):
        """Reset the reader and re-enable BOM detection."""
        codecs.StreamReader.reset(self)
        # Drop the per-instance override, if one was installed.
        if "decode" in vars(self):
            del self.decode

    def decode(self, input, errors='strict'):
        """Decode *input*, skipping a leading BOM if there is one.

        Returns a ``(text, consumed)`` tuple.  Returns ``("", 0)`` while
        *input* is shorter than 3 bytes and is a prefix of the BOM.
        """
        bom = codecs.BOM_UTF8
        if len(input) < 3 and bom.startswith(input):
            # not enough data to decide if this is a BOM
            # => try again on the next call
            return ("", 0)
        # The decision can be made now: later calls go straight to the
        # plain UTF-8 decoder.
        self.decode = codecs.utf_8_decode
        if input[:3] == bom:
            text, used = codecs.utf_8_decode(input[3:], errors)
            return (text, used + 3)
        # no BOM present
        return codecs.utf_8_decode(input, errors)
118
119 ### encodings module API
120
def getregentry():
    """Return the ``codecs.CodecInfo`` entry describing 'utf-8-sig'."""
    components = {
        'name': 'utf-8-sig',
        'encode': encode,
        'decode': decode,
        'incrementalencoder': IncrementalEncoder,
        'incrementaldecoder': IncrementalDecoder,
        'streamreader': StreamReader,
        'streamwriter': StreamWriter,
    }
    return codecs.CodecInfo(**components)