1 """ codecs -- Python Codec Registry, API and helpers.
2
3
4 Written by Marc-Andre Lemburg (mal@lemburg.com).
5
6 (c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
7
8 """
9
10 import builtins
11 import sys
12
13 ### Registry and builtin stateless codec functions
14
15 try:
16 from _codecs import *
17 except ImportError as why:
18 raise SystemError('Failed to load the builtin codecs: %s' % why)
19
20 __all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE",
21 "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
22 "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
23 "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
24 "CodecInfo", "Codec", "IncrementalEncoder", "IncrementalDecoder",
25 "StreamReader", "StreamWriter",
26 "StreamReaderWriter", "StreamRecoder",
27 "getencoder", "getdecoder", "getincrementalencoder",
28 "getincrementaldecoder", "getreader", "getwriter",
29 "encode", "decode", "iterencode", "iterdecode",
30 "strict_errors", "ignore_errors", "replace_errors",
31 "xmlcharrefreplace_errors",
32 "backslashreplace_errors", "namereplace_errors",
33 "register_error", "lookup_error"]
34
### Constants

#
# Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF)
# and its possible byte string values
# for UTF8/UTF16/UTF32 output and little/big endian machines
#

# UTF-8
BOM_UTF8 = b'\xef\xbb\xbf'

# UTF-16, little endian
BOM_LE = BOM_UTF16_LE = b'\xff\xfe'

# UTF-16, big endian
BOM_BE = BOM_UTF16_BE = b'\xfe\xff'

# UTF-32, little endian
BOM_UTF32_LE = b'\xff\xfe\x00\x00'

# UTF-32, big endian
BOM_UTF32_BE = b'\x00\x00\xfe\xff'

# Pick the native-endian aliases based on the host byte order so that
# BOM/BOM_UTF16/BOM_UTF32 always match what this machine would emit.
if sys.byteorder == 'little':

    # UTF-16, native endianness
    BOM = BOM_UTF16 = BOM_UTF16_LE

    # UTF-32, native endianness
    BOM_UTF32 = BOM_UTF32_LE

else:

    # UTF-16, native endianness
    BOM = BOM_UTF16 = BOM_UTF16_BE

    # UTF-32, native endianness
    BOM_UTF32 = BOM_UTF32_BE

# Old broken names (don't use in new code) -- kept only as
# backward-compatible aliases for the modern BOM_UTF16_*/BOM_UTF32_* names.
BOM32_LE = BOM_UTF16_LE
BOM32_BE = BOM_UTF16_BE
BOM64_LE = BOM_UTF32_LE
BOM64_BE = BOM_UTF32_BE
79
80
81 ### Codec base classes (defining the API)
82
class CodecInfo(tuple):
    """Codec details when looking up the codec registry.

    Behaves as the 4-tuple (encode, decode, streamreader, streamwriter)
    for backward compatibility, while also exposing all codec components
    as named attributes.
    """

    # Private API to allow Python 3.4 to denylist the known non-Unicode
    # codecs in the standard library. A more general mechanism to
    # reliably distinguish test encodings from other codecs will hopefully
    # be defined for Python 3.5
    #
    # See http://bugs.python.org/issue19619
    _is_text_encoding = True    # Assume codecs are text encodings by default

    def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
                incrementalencoder=None, incrementaldecoder=None, name=None,
                *, _is_text_encoding=None):
        # The tuple payload keeps the legacy 4-tuple unpacking working.
        obj = tuple.__new__(cls, (encode, decode, streamreader, streamwriter))
        obj.name = name
        obj.encode = encode
        obj.decode = decode
        obj.incrementalencoder = incrementalencoder
        obj.incrementaldecoder = incrementaldecoder
        obj.streamwriter = streamwriter
        obj.streamreader = streamreader
        # Only override the class-level default when explicitly requested.
        if _is_text_encoding is not None:
            obj._is_text_encoding = _is_text_encoding
        return obj

    def __repr__(self):
        return "<%s.%s object for encoding %s at %#x>" % (
            self.__class__.__module__, self.__class__.__qualname__,
            self.name, id(self))
113
class Codec:

    """ Defines the interface for stateless encoders/decoders.

        The .encode()/.decode() methods may use different error
        handling schemes by providing the errors argument. These
        string values are predefined:

         'strict' - raise a ValueError error (or a subclass)
         'ignore' - ignore the character and continue with the next
         'replace' - replace with a suitable replacement character;
                    Python will use the official U+FFFD REPLACEMENT
                    CHARACTER for the builtin Unicode codecs on
                    decoding and '?' on encoding.
         'surrogateescape' - replace with private code points U+DCnn.
         'xmlcharrefreplace' - Replace with the appropriate XML
                               character reference (only for encoding).
         'backslashreplace'  - Replace with backslashed escape sequences.
         'namereplace'       - Replace with \\N{...} escape sequences
                               (only for encoding).

        The set of allowed values can be extended via register_error.

    """
    def encode(self, input, errors='strict'):
        """ Encode input and return a (output object, length consumed) tuple.

            errors selects the error handling scheme; it defaults to
            'strict'.

            Implementations must not keep state on the Codec instance --
            use StreamWriter when state is required for efficient
            encoding -- and must accept zero-length input, returning an
            empty object of the output type in that case.
        """
        # Abstract: concrete codecs override this.
        raise NotImplementedError

    def decode(self, input, errors='strict'):
        """ Decode input and return a (output object, length consumed) tuple.

            input must be an object which provides the bf_getreadbuf
            buffer slot (e.g. bytes, buffer objects, memory mapped files).

            errors selects the error handling scheme; it defaults to
            'strict'.

            Implementations must not keep state on the Codec instance --
            use StreamReader when state is required for efficient
            decoding -- and must accept zero-length input, returning an
            empty object of the output type in that case.
        """
        # Abstract: concrete codecs override this.
        raise NotImplementedError
179
class IncrementalEncoder(object):
    """
    An IncrementalEncoder encodes an input in multiple steps. The input can
    be passed piece by piece to the encode() method. The IncrementalEncoder
    remembers the state of the encoding process between calls to encode().
    """

    def __init__(self, errors='strict'):
        """
        Create an IncrementalEncoder instance.

        The errors keyword argument selects the error handling scheme;
        see the module docstring for the predefined values.
        """
        self.errors = errors
        # Pending output/state holder; kept empty in the stateless base class.
        self.buffer = ""

    def encode(self, input, final=False):
        """
        Encode input and return the resulting object.
        """
        # Abstract: concrete incremental encoders override this.
        raise NotImplementedError

    def reset(self):
        """
        Reset the encoder to the initial state.
        """
        # The base class holds no state, so there is nothing to clear.

    def getstate(self):
        """
        Return the current state of the encoder.
        """
        # 0 denotes the initial (empty) state.
        return 0

    def setstate(self, state):
        """
        Set the current state of the encoder. state must have been
        returned by getstate().
        """
        # The base class holds no state, so there is nothing to restore.
219
class BufferedIncrementalEncoder(IncrementalEncoder):
    """
    Base class for incremental encoders which must keep some of the
    not-yet-encoded input in a buffer between calls to encode().
    """

    def __init__(self, errors='strict'):
        IncrementalEncoder.__init__(self, errors)
        # Unencoded input carried over to the next encode() call.
        self.buffer = ""

    def _buffer_encode(self, input, errors, final):
        # Subclasses override this: it must encode input and return an
        # (output, length consumed) tuple.
        raise NotImplementedError

    def encode(self, input, final=False):
        """Encode input, prepending any input buffered from earlier calls."""
        pending = self.buffer + input
        result, consumed = self._buffer_encode(pending, self.errors, final)
        # Whatever was not consumed waits for the next call.
        self.buffer = pending[consumed:]
        return result

    def reset(self):
        IncrementalEncoder.reset(self)
        self.buffer = ""

    def getstate(self):
        # An empty buffer maps to the canonical initial state 0.
        return self.buffer or 0

    def setstate(self, state):
        # State 0 (initial) maps back to an empty buffer.
        self.buffer = state or ""
253
class IncrementalDecoder(object):
    """
    An IncrementalDecoder decodes an input in multiple steps. The input can
    be passed piece by piece to the decode() method. The IncrementalDecoder
    remembers the state of the decoding process between calls to decode().
    """

    def __init__(self, errors='strict'):
        """
        Create an IncrementalDecoder instance.

        The errors keyword argument selects the error handling scheme;
        see the module docstring for the predefined values.
        """
        self.errors = errors

    def decode(self, input, final=False):
        """
        Decode input and return the resulting object.
        """
        # Abstract: concrete incremental decoders override this.
        raise NotImplementedError

    def reset(self):
        """
        Reset the decoder to the initial state.
        """
        # The base class holds no state, so there is nothing to clear.

    def getstate(self):
        """
        Return the current state of the decoder.

        This must be a (buffered_input, additional_state_info) tuple.
        buffered_input must be a bytes object containing bytes that
        were passed to decode() that have not yet been converted.
        additional_state_info must be a non-negative integer
        representing the state of the decoder WITHOUT yet having
        processed the contents of buffered_input. In the initial state
        and after reset(), getstate() must return (b"", 0).
        """
        return (b"", 0)

    def setstate(self, state):
        """
        Set the current state of the decoder.

        state must have been returned by getstate(). The effect of
        setstate((b"", 0)) must be equivalent to reset().
        """
        # The base class holds no state, so there is nothing to restore.
302
class BufferedIncrementalDecoder(IncrementalDecoder):
    """
    Base class for incremental decoders which must be able to handle
    incomplete byte sequences by buffering them between calls to decode().
    """

    def __init__(self, errors='strict'):
        IncrementalDecoder.__init__(self, errors)
        # Undecoded bytes carried over to the next decode() call.
        self.buffer = b""

    def _buffer_decode(self, input, errors, final):
        # Subclasses override this: it must decode input and return an
        # (output, length consumed) tuple.
        raise NotImplementedError

    def decode(self, input, final=False):
        """Decode input, prepending any bytes buffered from earlier calls."""
        pending = self.buffer + input
        result, consumed = self._buffer_decode(pending, self.errors, final)
        # Whatever was not consumed waits for the next call.
        self.buffer = pending[consumed:]
        return result

    def reset(self):
        IncrementalDecoder.reset(self)
        self.buffer = b""

    def getstate(self):
        # The additional state info is always 0 for buffered decoders.
        return (self.buffer, 0)

    def setstate(self, state):
        # Only the buffered bytes matter; the extra state info is ignored.
        self.buffer = state[0]
338
339 #
340 # The StreamWriter and StreamReader class provide generic working
341 # interfaces which can be used to implement new encoding submodules
342 # very easily. See encodings/utf_8.py for an example on how this is
343 # done.
344 #
345
class StreamWriter(Codec):

    def __init__(self, stream, errors='strict'):
        """ Create a StreamWriter instance.

            stream must be a file-like object open for writing.

            The errors keyword argument selects the error handling
            scheme. These parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character
             'xmlcharrefreplace' - Replace with the appropriate XML
                                   character reference.
             'backslashreplace'  - Replace with backslashed escape
                                   sequences.
             'namereplace'       - Replace with \\N{...} escape sequences.

            The set of allowed parameter values can be extended via
            register_error.
        """
        self.stream = stream
        self.errors = errors

    def write(self, object):
        """ Write the object's contents encoded to self.stream. """
        data, _consumed = self.encode(object, self.errors)
        self.stream.write(data)

    def writelines(self, list):
        """ Write the concatenated iterable of strings to the stream
            using .write().
        """
        self.write(''.join(list))

    def reset(self):
        """ Reset the codec buffers used for keeping internal state.

            Calling this method should ensure that the data on the
            output is put into a clean state, that allows appending
            of new fresh data without having to rescan the whole
            stream to recover state.
        """
        # Stateless by default; stateful codecs override this.
        pass

    def seek(self, offset, whence=0):
        """ Reposition the underlying stream; rewinding to the start
            also resets the codec state.
        """
        self.stream.seek(offset, whence)
        if whence == 0 and offset == 0:
            self.reset()

    def __getattr__(self, name,
                    getattr=getattr):
        """ Inherit all other methods from the underlying stream. """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        # Close the wrapped stream when leaving the with-block.
        self.stream.close()

    def __reduce_ex__(self, proto):
        # Stream wrappers hold live I/O objects and cannot be pickled.
        raise TypeError("can't serialize %s" % self.__class__.__name__)
419
420 ###
421
class StreamReader(Codec):

    # Type of the decoded-character buffer; subclasses may override it
    # (self._empty_charbuffer is derived from it in __init__).
    charbuffertype = str

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamReader instance.

            stream must be a file-like object open for reading.

            The StreamReader may use different error handling
            schemes by providing the errors keyword argument. These
            parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character
             'backslashreplace' - Replace with backslashed escape sequences;

            The set of allowed parameter values can be extended via
            register_error.
        """
        self.stream = stream
        self.errors = errors
        # Undecoded bytes kept between calls to read()
        self.bytebuffer = b""
        self._empty_charbuffer = self.charbuffertype()
        # Decoded characters not yet returned to the caller
        self.charbuffer = self._empty_charbuffer
        # Cache of already-split lines used by readline(); when set,
        # charbuffer is None and vice versa (the two are mutually exclusive)
        self.linebuffer = None

    def decode(self, input, errors='strict'):
        # Abstract: concrete codecs override this with the actual decoder.
        raise NotImplementedError

    def read(self, size=-1, chars=-1, firstline=False):

        """ Decodes data from the stream self.stream and returns the
            resulting object.

            chars indicates the number of decoded code points or bytes to
            return. read() will never return more data than requested,
            but it might return less, if there is not enough available.

            size indicates the approximate maximum number of decoded
            bytes or code points to read for decoding. The decoder
            can modify this setting as appropriate. The default value
            -1 indicates to read and decode as much as possible. size
            is intended to prevent having to decode huge files in one
            step.

            If firstline is true, and a UnicodeDecodeError happens
            after the first line terminator in the input only the first line
            will be returned, the rest of the input will be kept until the
            next call to read().

            The method should use a greedy read strategy, meaning that
            it should read as much data as is allowed within the
            definition of the encoding and the given size, e.g.  if
            optional encoding endings or state markers are available
            on the stream, these should be read too.
        """
        # If we have lines cached, first merge them back into characters
        if self.linebuffer:
            self.charbuffer = self._empty_charbuffer.join(self.linebuffer)
            self.linebuffer = None

        if chars < 0:
            # For compatibility with other read() methods that take a
            # single argument
            chars = size

        # read until we get the required number of characters (if available)
        while True:
            # can the request be satisfied from the character buffer?
            if chars >= 0:
                if len(self.charbuffer) >= chars:
                    break
            # we need more data
            if size < 0:
                newdata = self.stream.read()
            else:
                newdata = self.stream.read(size)
            # decode bytes (those remaining from the last call included)
            data = self.bytebuffer + newdata
            if not data:
                break
            try:
                newchars, decodedbytes = self.decode(data, self.errors)
            except UnicodeDecodeError as exc:
                if firstline:
                    # Retry decoding only the part before the error; if
                    # that yields more than one line, the first complete
                    # line can still be returned (the raise below is
                    # reached when it cannot).
                    newchars, decodedbytes = \
                        self.decode(data[:exc.start], self.errors)
                    lines = newchars.splitlines(keepends=True)
                    if len(lines)<=1:
                        raise
                else:
                    raise
            # keep undecoded bytes until the next call
            self.bytebuffer = data[decodedbytes:]
            # put new characters in the character buffer
            self.charbuffer += newchars
            # there was no data available
            if not newdata:
                break
        if chars < 0:
            # Return everything we've got
            result = self.charbuffer
            self.charbuffer = self._empty_charbuffer
        else:
            # Return the first chars characters
            result = self.charbuffer[:chars]
            self.charbuffer = self.charbuffer[chars:]
        return result

    def readline(self, size=None, keepends=True):

        """ Read one line from the input stream and return the
            decoded data.

            size, if given, is passed as size argument to the
            read() method.

        """
        # If we have lines cached from an earlier read, return
        # them unconditionally
        if self.linebuffer:
            line = self.linebuffer[0]
            del self.linebuffer[0]
            if len(self.linebuffer) == 1:
                # revert to charbuffer mode; we might need more data
                # next time
                self.charbuffer = self.linebuffer[0]
                self.linebuffer = None
            if not keepends:
                line = line.splitlines(keepends=False)[0]
            return line

        # 72 is an arbitrary small starting chunk size; it is doubled
        # below (up to 8000) until a full line has been read.
        readsize = size or 72
        line = self._empty_charbuffer
        # If size is given, we call read() only once
        while True:
            data = self.read(readsize, firstline=True)
            if data:
                # If we're at a "\r" read one extra character (which might
                # be a "\n") to get a proper line ending. If the stream is
                # temporarily exhausted we return the wrong line ending.
                if (isinstance(data, str) and data.endswith("\r")) or \
                   (isinstance(data, bytes) and data.endswith(b"\r")):
                    data += self.read(size=1, chars=1)

            line += data
            lines = line.splitlines(keepends=True)
            if lines:
                if len(lines) > 1:
                    # More than one line result; the first line is a full line
                    # to return
                    line = lines[0]
                    del lines[0]
                    if len(lines) > 1:
                        # cache the remaining lines
                        lines[-1] += self.charbuffer
                        self.linebuffer = lines
                        self.charbuffer = None
                    else:
                        # only one remaining line, put it back into charbuffer
                        self.charbuffer = lines[0] + self.charbuffer
                    if not keepends:
                        line = line.splitlines(keepends=False)[0]
                    break
                line0withend = lines[0]
                line0withoutend = lines[0].splitlines(keepends=False)[0]
                if line0withend != line0withoutend: # We really have a line end
                    # Put the rest back together and keep it until the next call
                    self.charbuffer = self._empty_charbuffer.join(lines[1:]) + \
                                      self.charbuffer
                    if keepends:
                        line = line0withend
                    else:
                        line = line0withoutend
                    break
            # we didn't get anything or this was our only try
            if not data or size is not None:
                if line and not keepends:
                    line = line.splitlines(keepends=False)[0]
                break
            if readsize < 8000:
                readsize *= 2
        return line

    def readlines(self, sizehint=None, keepends=True):

        """ Read all lines available on the input stream
            and return them as a list.

            Line breaks are implemented using the codec's decoder
            method and are included in the list entries.

            sizehint, if given, is ignored since there is no efficient
            way to finding the true end-of-line.

        """
        data = self.read()
        return data.splitlines(keepends)

    def reset(self):

        """ Resets the codec buffers used for keeping internal state.

            Note that no stream repositioning should take place.
            This method is primarily intended to be able to recover
            from decoding errors.

        """
        self.bytebuffer = b""
        self.charbuffer = self._empty_charbuffer
        self.linebuffer = None

    def seek(self, offset, whence=0):
        """ Set the input stream's current position.

            Resets the codec buffers used for keeping state.
        """
        self.stream.seek(offset, whence)
        self.reset()

    def __next__(self):

        """ Return the next decoded line from the input stream."""
        line = self.readline()
        if line:
            return line
        raise StopIteration

    def __iter__(self):
        return self

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()

    def __reduce_ex__(self, proto):
        raise TypeError("can't serialize %s" % self.__class__.__name__)
671
672 ###
673
class StreamReaderWriter:

    """ StreamReaderWriter instances allow wrapping streams which
        work in both read and write modes.

        The design is such that one can use the factory functions
        returned by the codec.lookup() function to construct the
        instance.

    """
    # Optional attributes set by the file wrappers below
    encoding = 'unknown'

    def __init__(self, stream, Reader, Writer, errors='strict'):
        """ Create a StreamReaderWriter instance.

            stream must be a Stream-like object.

            Reader, Writer must be factory functions or classes
            providing the StreamReader, StreamWriter interface resp.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.
        """
        self.stream = stream
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):
        # Delegate to the decoding reader.
        return self.reader.read(size)

    def readline(self, size=None):
        # Delegate to the decoding reader.
        return self.reader.readline(size)

    def readlines(self, sizehint=None):
        # Delegate to the decoding reader.
        return self.reader.readlines(sizehint)

    def __next__(self):
        """ Return the next decoded line from the input stream."""
        return next(self.reader)

    def __iter__(self):
        return self

    def write(self, data):
        # Delegate to the encoding writer.
        return self.writer.write(data)

    def writelines(self, list):
        # Delegate to the encoding writer.
        return self.writer.writelines(list)

    def reset(self):
        # Clear codec state on both directions.
        self.reader.reset()
        self.writer.reset()

    def seek(self, offset, whence=0):
        # Reposition the stream; the reader must always drop its
        # buffers, the writer only when rewinding to the start.
        self.stream.seek(offset, whence)
        self.reader.reset()
        if whence == 0 and offset == 0:
            self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):
        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    # these are needed to make "with StreamReaderWriter(...)" work properly

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()

    def __reduce_ex__(self, proto):
        # Wraps a live stream; pickling makes no sense.
        raise TypeError("can't serialize %s" % self.__class__.__name__)
761
762 ###
763
class StreamRecoder:

    """ StreamRecoder instances translate data from one encoding to another.

        They use the complete set of APIs returned by the
        codecs.lookup() function to implement their task.

        Data written to the StreamRecoder is first decoded into an
        intermediate format (depending on the "decode" codec) and then
        written to the underlying stream using an instance of the provided
        Writer class.

        In the other direction, data is read from the underlying stream using
        a Reader instance and then encoded and returned to the caller.

    """
    # Optional attributes set by the file wrappers below
    data_encoding = 'unknown'
    file_encoding = 'unknown'

    def __init__(self, stream, encode, decode, Reader, Writer,
                 errors='strict'):
        """ Create a StreamRecoder instance which implements a two-way
            conversion: encode and decode work on the frontend (the
            data visible to .read() and .write()) while Reader and Writer
            work on the backend (the data in stream).

            You can use these objects to do transparent
            transcodings from e.g. latin-1 to utf-8 and back.

            stream must be a file-like object.

            encode and decode must adhere to the Codec interface; Reader and
            Writer must be factory functions or classes providing the
            StreamReader and StreamWriter interfaces resp.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.
        """
        self.stream = stream
        self.encode = encode
        self.decode = decode
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):
        # Read via the backend reader, then re-encode for the caller.
        decoded = self.reader.read(size)
        encoded, _consumed = self.encode(decoded, self.errors)
        return encoded

    def readline(self, size=None):
        # Read one line via the backend reader, then re-encode it.
        if size is None:
            decoded = self.reader.readline()
        else:
            decoded = self.reader.readline(size)
        encoded, _consumed = self.encode(decoded, self.errors)
        return encoded

    def readlines(self, sizehint=None):
        # Re-encode the whole remaining input and split it into lines.
        decoded = self.reader.read()
        encoded, _consumed = self.encode(decoded, self.errors)
        return encoded.splitlines(keepends=True)

    def __next__(self):
        """ Return the next decoded line from the input stream."""
        decoded = next(self.reader)
        encoded, _consumed = self.encode(decoded, self.errors)
        return encoded

    def __iter__(self):
        return self

    def write(self, data):
        # Decode the frontend data, then let the backend writer encode
        # it onto the stream.
        decoded, _consumed = self.decode(data, self.errors)
        return self.writer.write(decoded)

    def writelines(self, list):
        # Join first so the frontend decode sees one contiguous chunk.
        joined = b''.join(list)
        decoded, _consumed = self.decode(joined, self.errors)
        return self.writer.write(decoded)

    def reset(self):
        # Clear codec state on both directions.
        self.reader.reset()
        self.writer.reset()

    def seek(self, offset, whence=0):
        # Seeks must be propagated to both the readers and writers
        # as they might need to reset their internal buffers.
        self.reader.seek(offset, whence)
        self.writer.seek(offset, whence)

    def __getattr__(self, name,
                    getattr=getattr):
        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()

    def __reduce_ex__(self, proto):
        # Wraps a live stream; pickling makes no sense.
        raise TypeError("can't serialize %s" % self.__class__.__name__)
880
881 ### Shortcuts
882
def open(filename, mode='r', encoding=None, errors='strict', buffering=-1):
    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        If encoding is not None, then the
        underlying encoded files are always opened in binary mode.
        The default file mode is 'r', meaning to open the file in read mode.

        encoding specifies the encoding which is to be used for the
        file.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering has the same meaning as for the builtin open() API.
        It defaults to -1 which means that the default buffer size will
        be used.

        The returned wrapped file object provides an extra attribute
        .encoding which allows querying the used encoding. This
        attribute is only available if an encoding was specified as
        parameter.
    """
    # The codec layer does the text decoding itself, so the underlying
    # file must be opened in binary mode.
    if encoding is not None and 'b' not in mode:
        mode = mode + 'b'
    file = builtins.open(filename, mode, buffering)
    if encoding is None:
        return file

    try:
        info = lookup(encoding)
        srw = StreamReaderWriter(file, info.streamreader, info.streamwriter,
                                 errors)
        # Add attributes to simplify introspection
        srw.encoding = encoding
        return srw
    except:
        # Don't leak the file handle on any failure (including
        # KeyboardInterrupt); re-raise afterwards.
        file.close()
        raise
931
def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):
    """ Return a wrapped version of file which provides transparent
        encoding translation.

        Data written to the wrapped file is decoded according
        to the given data_encoding and then encoded to the underlying
        file using file_encoding. The intermediate data type
        will usually be Unicode but depends on the specified codecs.

        Bytes read from the file are decoded using file_encoding and then
        passed back to the caller encoded using data_encoding.

        If file_encoding is not given, it defaults to data_encoding.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        The returned wrapped file object provides two extra attributes
        .data_encoding and .file_encoding which reflect the given
        parameters of the same name. The attributes can be used for
        introspection by Python programs.
    """
    if file_encoding is None:
        file_encoding = data_encoding
    # Frontend codec handles the caller-visible data; backend codec
    # handles what actually lives in the file.
    data_info = lookup(data_encoding)
    file_info = lookup(file_encoding)
    sr = StreamRecoder(file, data_info.encode, data_info.decode,
                       file_info.streamreader, file_info.streamwriter, errors)
    # Add attributes to simplify introspection
    sr.data_encoding = data_encoding
    sr.file_encoding = file_encoding
    return sr
967
968 ### Helpers for codec lookup
969
def getencoder(encoding):
    """ Look up the codec for the given encoding and return
        its encoder function.

        Raises a LookupError in case the encoding cannot be found.
    """
    codec = lookup(encoding)
    return codec.encode
979
def getdecoder(encoding):
    """ Look up the codec for the given encoding and return
        its decoder function.

        Raises a LookupError in case the encoding cannot be found.
    """
    codec = lookup(encoding)
    return codec.decode
989
def getincrementalencoder(encoding):
    """ Look up the codec for the given encoding and return
        its IncrementalEncoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codec doesn't provide an incremental encoder.
    """
    codec = lookup(encoding)
    # Not every codec supports incremental encoding; surface that as a
    # LookupError for the requested encoding.
    if codec.incrementalencoder is None:
        raise LookupError(encoding)
    return codec.incrementalencoder
1003
def getincrementaldecoder(encoding):
    """ Look up the codec for the given encoding and return
        its IncrementalDecoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codec doesn't provide an incremental decoder.
    """
    codec = lookup(encoding)
    # Not every codec supports incremental decoding; surface that as a
    # LookupError for the requested encoding.
    if codec.incrementaldecoder is None:
        raise LookupError(encoding)
    return codec.incrementaldecoder
1017
def getreader(encoding):
    """ Look up the codec for the given encoding and return
        its StreamReader class or factory function.

        Raises a LookupError in case the encoding cannot be found.
    """
    codec = lookup(encoding)
    return codec.streamreader
1027
def getwriter(encoding):
    """ Look up the codec for the given encoding and return
        its StreamWriter class or factory function.

        Raises a LookupError in case the encoding cannot be found.
    """
    codec = lookup(encoding)
    return codec.streamwriter
1037
def iterencode(iterator, encoding, errors='strict', **kwargs):
    """
    Encoding iterator.

    Encodes the input strings from the iterator using an IncrementalEncoder.

    errors and kwargs are passed through to the IncrementalEncoder
    constructor.
    """
    encoder = getincrementalencoder(encoding)(errors, **kwargs)
    for input in iterator:
        chunk = encoder.encode(input)
        if chunk:
            yield chunk
    # Flush: signal end-of-input so buffered state is emitted.
    chunk = encoder.encode("", True)
    if chunk:
        yield chunk
1055
def iterdecode(iterator, encoding, errors='strict', **kwargs):
    """
    Decoding iterator.

    Decodes the input strings from the iterator using an IncrementalDecoder.

    errors and kwargs are passed through to the IncrementalDecoder
    constructor.
    """
    decoder = getincrementaldecoder(encoding)(errors, **kwargs)
    for input in iterator:
        chunk = decoder.decode(input)
        if chunk:
            yield chunk
    # Flush: signal end-of-input so buffered bytes are decoded.
    chunk = decoder.decode(b"", True)
    if chunk:
        yield chunk
1073
1074 ### Helpers for charmap-based codecs
1075
def make_identity_dict(rng):
    """ make_identity_dict(rng) -> dict

        Return a dictionary where elements of the rng sequence are
        mapped to themselves.
    """
    # rng may be a one-shot iterator, so iterate it exactly once.
    return {element: element for element in rng}
1085
def make_encoding_map(decoding_map):
    """ Creates an encoding map from a decoding map.

        If a target mapping in the decoding map occurs multiple
        times, then that target is mapped to None (undefined mapping),
        causing an exception when encountered by the charmap codec
        during translation.

        One example where this happens is cp875.py which decodes
        multiple character to \\u001a.

    """
    m = {}
    for k, v in decoding_map.items():
        # Idiomatic membership test ("v not in m" rather than "not v in m").
        if v not in m:
            m[v] = k
        else:
            # Ambiguous target: undefine it so the charmap codec raises
            # instead of silently picking one of the candidates.
            m[v] = None
    return m
1106
1107 ### error handlers
1108
# Pre-fetch the standard error handler callables so they can be passed
# around directly (e.g. codecs.strict_errors) without a registry lookup.
try:
    strict_errors = lookup_error("strict")
    ignore_errors = lookup_error("ignore")
    replace_errors = lookup_error("replace")
    xmlcharrefreplace_errors = lookup_error("xmlcharrefreplace")
    backslashreplace_errors = lookup_error("backslashreplace")
    namereplace_errors = lookup_error("namereplace")
except LookupError:
    # In --disable-unicode builds, these error handlers are missing
    strict_errors = None
    ignore_errors = None
    replace_errors = None
    xmlcharrefreplace_errors = None
    backslashreplace_errors = None
    namereplace_errors = None
1124
# Tell modulefinder that using codecs probably needs the encodings
# package.  The import is guarded by a constant false flag so it is
# never executed at runtime, yet static import scanners still see it.
_false = 0
if _false:
    import encodings

### Tests

if __name__ == '__main__':

    # Make stdout translate Latin-1 output into UTF-8 output
    sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8')

    # Have stdin translate Latin-1 input into UTF-8 input
    sys.stdin = EncodedFile(sys.stdin, 'utf-8', 'latin-1')