1 """Functions that read and write gzipped files.
2
3 The user of the file doesn't have to worry about the compression,
4 but random access is not allowed."""
5
6 # based on Andrew Kuchling's minigzip.py distributed with the zlib module
7
8 import struct, sys, time, os
9 import zlib
10 import builtins
11 import io
12 import _compression
13
# Public names of this module.
__all__ = ["BadGzipFile", "GzipFile", "open", "compress", "decompress"]

# Bit masks that are tested against the flag byte of a gzip member header.
FTEXT, FHCRC, FEXTRA, FNAME, FCOMMENT = 1, 2, 4, 8, 16

# Values stored in GzipFile.mode to record whether it reads or writes.
READ, WRITE = 1, 2

# Named compression levels.  FAST and BEST also select the extra-flags byte
# written into the gzip header; TRADEOFF is the command line default.
_COMPRESS_LEVEL_FAST = 1
_COMPRESS_LEVEL_TRADEOFF = 6
_COMPRESS_LEVEL_BEST = 9
23
24
def open(filename, mode="rb", compresslevel=_COMPRESS_LEVEL_BEST,
         encoding=None, errors=None, newline=None):
    """Open a gzip-compressed file in binary or text mode.

    *filename* is either a path (str, bytes or os.PathLike) or an already
    open file object that is read from or written to.

    *mode* is one of "r", "rb", "w", "wb", "x", "xb", "a" or "ab" for binary
    access, or "rt", "wt", "xt" or "at" for text access.  It defaults to
    "rb"; *compresslevel* defaults to 9.

    In binary mode the result is the GzipFile built from
    (filename, mode, compresslevel), and *encoding*, *errors* and *newline*
    must all be left as None.

    In text mode the GzipFile is wrapped in an io.TextIOWrapper configured
    with the given encoding, error handling and line ending(s).

    Raises ValueError for a contradictory mode or for text-only arguments in
    binary mode, and TypeError when *filename* is neither a path nor a
    file-like object.
    """
    text_mode = "t" in mode
    if text_mode:
        if "b" in mode:
            raise ValueError("Invalid mode: %r" % (mode,))
    elif encoding is not None:
        raise ValueError("Argument 'encoding' not supported in binary mode")
    elif errors is not None:
        raise ValueError("Argument 'errors' not supported in binary mode")
    elif newline is not None:
        raise ValueError("Argument 'newline' not supported in binary mode")

    # GzipFile itself only understands binary modes.
    gz_mode = mode.replace("t", "")
    if isinstance(filename, (str, bytes, os.PathLike)):
        binary_file = GzipFile(filename, gz_mode, compresslevel)
    elif hasattr(filename, "read") or hasattr(filename, "write"):
        # An existing file object: pass it as fileobj, with no filename.
        binary_file = GzipFile(None, gz_mode, compresslevel, filename)
    else:
        raise TypeError("filename must be a str or bytes object, or a file")

    if not text_mode:
        return binary_file
    encoding = io.text_encoding(encoding)
    return io.TextIOWrapper(binary_file, encoding, errors, newline)
69
def write32u(output, value):
    """Write *value* to *output* as a 4-byte little-endian integer."""
    # The L format yields the same bit pattern for signed and unsigned use.
    packed = struct.pack("<L", value)
    output.write(packed)
74
75 class ESC[4;38;5;81m_PaddedFile:
76 """Minimal read-only file object that prepends a string to the contents
77 of an actual file. Shouldn't be used outside of gzip.py, as it lacks
78 essential functionality."""
79
80 def __init__(self, f, prepend=b''):
81 self._buffer = prepend
82 self._length = len(prepend)
83 self.file = f
84 self._read = 0
85
86 def read(self, size):
87 if self._read is None:
88 return self.file.read(size)
89 if self._read + size <= self._length:
90 read = self._read
91 self._read += size
92 return self._buffer[read:self._read]
93 else:
94 read = self._read
95 self._read = None
96 return self._buffer[read:] + \
97 self.file.read(size-self._length+read)
98
99 def prepend(self, prepend=b''):
100 if self._read is None:
101 self._buffer = prepend
102 else: # Assume data was read since the last prepend() call
103 self._read -= len(prepend)
104 return
105 self._length = len(self._buffer)
106 self._read = 0
107
108 def seek(self, off):
109 self._read = None
110 self._buffer = None
111 return self.file.seek(off)
112
113 def seekable(self):
114 return True # Allows fast-forwarding even in unseekable streams
115
116
class BadGzipFile(OSError):
    """Raised in some cases when the data is not a valid gzip file."""
119
120
class GzipFile(_compression.BaseStream):
    """The GzipFile class simulates most of the methods of a file object with
    the exception of the truncate() method.

    This class only supports opening files in binary mode. If you need to open a
    compressed file in text mode, use the gzip.open() function.

    """

    # Overridden with internal file object to be closed, if only a filename
    # is passed in
    myfileobj = None

    def __init__(self, filename=None, mode=None,
                 compresslevel=_COMPRESS_LEVEL_BEST, fileobj=None, mtime=None):
        """Constructor for the GzipFile class.

        At least one of fileobj and filename must be given a
        non-trivial value.

        The new class instance is based on fileobj, which can be a regular
        file, an io.BytesIO object, or any other object which simulates a file.
        It defaults to None, in which case filename is opened to provide
        a file object.

        When fileobj is not None, the filename argument is only used to be
        included in the gzip file header, which may include the original
        filename of the uncompressed file.  It defaults to the filename of
        fileobj, if discernible; otherwise, it defaults to the empty string,
        and in this case the original filename is not included in the header.

        The mode argument can be any of 'r', 'rb', 'a', 'ab', 'w', 'wb', 'x', or
        'xb' depending on whether the file will be read or written.  The default
        is the mode of fileobj if discernible; otherwise, the default is 'rb'.
        A mode of 'r' is equivalent to one of 'rb', and similarly for 'w' and
        'wb', 'a' and 'ab', and 'x' and 'xb'.

        The compresslevel argument is an integer from 0 to 9 controlling the
        level of compression; 1 is fastest and produces the least compression,
        and 9 is slowest and produces the most compression. 0 is no compression
        at all. The default is 9.

        The mtime argument is an optional numeric timestamp to be written
        to the last modification time field in the stream when compressing.
        If omitted or None, the current time is used.

        If construction fails after this constructor opened *filename*
        itself, that file is closed again before the exception propagates.

        """

        if mode and ('t' in mode or 'U' in mode):
            raise ValueError("Invalid mode: {!r}".format(mode))
        if mode and 'b' not in mode:
            mode += 'b'
        if fileobj is None:
            fileobj = self.myfileobj = builtins.open(filename, mode or 'rb')
        try:
            if filename is None:
                filename = getattr(fileobj, 'name', '')
                if not isinstance(filename, (str, bytes)):
                    filename = ''
            else:
                filename = os.fspath(filename)
            origmode = mode
            if mode is None:
                mode = getattr(fileobj, 'mode', 'rb')

            if mode.startswith('r'):
                self.mode = READ
                raw = _GzipReader(fileobj)
                self._buffer = io.BufferedReader(raw)
                self.name = filename

            elif mode.startswith(('w', 'a', 'x')):
                if origmode is None:
                    import warnings
                    warnings.warn(
                        "GzipFile was opened for writing, but this will "
                        "change in future Python releases.  "
                        "Specify the mode argument for opening it for writing.",
                        FutureWarning, 2)
                self.mode = WRITE
                self._init_write(filename)
                self.compress = zlib.compressobj(compresslevel,
                                                 zlib.DEFLATED,
                                                 -zlib.MAX_WBITS,
                                                 zlib.DEF_MEM_LEVEL,
                                                 0)
                self._write_mtime = mtime
            else:
                raise ValueError("Invalid mode: {!r}".format(mode))

            self.fileobj = fileobj

            if self.mode == WRITE:
                self._write_gzip_header(compresslevel)
        except BaseException:
            # Construction failed (e.g. invalid compresslevel or a failing
            # header write).  Mark the object closed and release the file we
            # opened ourselves so it is not leaked; a caller-supplied
            # fileobj is left untouched.
            self.fileobj = None
            myfileobj = self.myfileobj
            if myfileobj is not None:
                self.myfileobj = None
                myfileobj.close()
            raise

    @property
    def filename(self):
        """Deprecated alias of ``name``; appends ".gz" in write mode."""
        import warnings
        warnings.warn("use the name attribute", DeprecationWarning, 2)
        if self.mode == WRITE and self.name[-3:] != ".gz":
            return self.name + ".gz"
        return self.name

    @property
    def mtime(self):
        """Last modification time read from stream, or None"""
        # _buffer is only created by __init__ when the file is opened for
        # reading.
        return self._buffer.raw._last_mtime

    def __repr__(self):
        s = repr(self.fileobj)
        return '<gzip ' + s[1:-1] + ' ' + hex(id(self)) + '>'

    def _init_write(self, filename):
        """Reset the per-stream state used while compressing."""
        self.name = filename
        self.crc = zlib.crc32(b"")
        self.size = 0
        self.writebuf = []
        self.bufsize = 0
        self.offset = 0  # Current file offset for seek(), tell(), etc

    def _write_gzip_header(self, compresslevel):
        """Write the gzip member header to the underlying file object."""
        self.fileobj.write(b'\037\213')             # magic header
        self.fileobj.write(b'\010')                 # compression method
        try:
            # RFC 1952 requires the FNAME field to be Latin-1. Do not
            # include filenames that cannot be represented that way.
            fname = os.path.basename(self.name)
            if not isinstance(fname, bytes):
                fname = fname.encode('latin-1')
            if fname.endswith(b'.gz'):
                fname = fname[:-3]
        except UnicodeEncodeError:
            fname = b''
        flags = 0
        if fname:
            flags = FNAME
        self.fileobj.write(chr(flags).encode('latin-1'))
        mtime = self._write_mtime
        if mtime is None:
            mtime = time.time()
        write32u(self.fileobj, int(mtime))
        # Extra-flags byte, chosen from the compression level.
        if compresslevel == _COMPRESS_LEVEL_BEST:
            xfl = b'\002'
        elif compresslevel == _COMPRESS_LEVEL_FAST:
            xfl = b'\004'
        else:
            xfl = b'\000'
        self.fileobj.write(xfl)
        self.fileobj.write(b'\377')
        if fname:
            self.fileobj.write(fname + b'\000')

    def write(self,data):
        """Compress *data* and write it out.

        Returns the number of uncompressed bytes consumed.  Raises OSError
        (EBADF) if the file was not opened for writing.
        """
        self._check_not_closed()
        if self.mode != WRITE:
            import errno
            raise OSError(errno.EBADF, "write() on read-only GzipFile object")

        if self.fileobj is None:
            raise ValueError("write() on closed GzipFile object")

        if isinstance(data, (bytes, bytearray)):
            length = len(data)
        else:
            # accept any data that supports the buffer protocol
            data = memoryview(data)
            length = data.nbytes

        if length > 0:
            self.fileobj.write(self.compress.compress(data))
            self.size += length
            self.crc = zlib.crc32(data, self.crc)
            self.offset += length

        return length

    def read(self, size=-1):
        """Read *size* uncompressed bytes via the internal buffered reader.

        Raises OSError (EBADF) if the file was not opened for reading.
        """
        self._check_not_closed()
        if self.mode != READ:
            import errno
            raise OSError(errno.EBADF, "read() on write-only GzipFile object")
        return self._buffer.read(size)

    def read1(self, size=-1):
        """Implements BufferedIOBase.read1()

        Reads up to a buffer's worth of data if size is negative."""
        self._check_not_closed()
        if self.mode != READ:
            import errno
            raise OSError(errno.EBADF, "read1() on write-only GzipFile object")

        if size < 0:
            size = io.DEFAULT_BUFFER_SIZE
        return self._buffer.read1(size)

    def peek(self, n):
        """Delegate to the buffered reader's peek().

        Raises OSError (EBADF) if the file was not opened for reading.
        """
        self._check_not_closed()
        if self.mode != READ:
            import errno
            raise OSError(errno.EBADF, "peek() on write-only GzipFile object")
        return self._buffer.peek(n)

    @property
    def closed(self):
        """True once the underlying file object has been detached."""
        return self.fileobj is None

    def close(self):
        """Finish the stream and close any file this object opened itself.

        In write mode the remaining compressed data and the CRC/size trailer
        are written first.  Calling close() again is a no-op.
        """
        fileobj = self.fileobj
        if fileobj is None:
            return
        self.fileobj = None
        try:
            if self.mode == WRITE:
                fileobj.write(self.compress.flush())
                write32u(fileobj, self.crc)
                # self.size may exceed 2 GiB, or even 4 GiB
                write32u(fileobj, self.size & 0xffffffff)
            elif self.mode == READ:
                self._buffer.close()
        finally:
            myfileobj = self.myfileobj
            if myfileobj:
                self.myfileobj = None
                myfileobj.close()

    def flush(self,zlib_mode=zlib.Z_SYNC_FLUSH):
        """In write mode, flush the compressor and the underlying file."""
        self._check_not_closed()
        if self.mode == WRITE:
            # Ensure the compressor's buffer is flushed
            self.fileobj.write(self.compress.flush(zlib_mode))
            self.fileobj.flush()

    def fileno(self):
        """Invoke the underlying file object's fileno() method.

        This will raise AttributeError if the underlying file object
        doesn't support fileno().
        """
        return self.fileobj.fileno()

    def rewind(self):
        '''Return the uncompressed stream file position indicator to the
        beginning of the file'''
        if self.mode != READ:
            raise OSError("Can't rewind in write mode")
        self._buffer.seek(0)

    def readable(self):
        return self.mode == READ

    def writable(self):
        return self.mode == WRITE

    def seekable(self):
        return True

    def seek(self, offset, whence=io.SEEK_SET):
        """Move to *offset* in the uncompressed stream and return the position.

        In write mode only forward seeks are possible; the gap is filled by
        writing zero bytes.  In read mode the call is delegated to the
        buffered reader.
        """
        if self.mode == WRITE:
            if whence != io.SEEK_SET:
                if whence == io.SEEK_CUR:
                    offset = self.offset + offset
                else:
                    raise ValueError('Seek from end not supported')
            if offset < self.offset:
                raise OSError('Negative seek in write mode')
            count = offset - self.offset
            chunk = b'\0' * 1024
            for i in range(count // 1024):
                self.write(chunk)
            self.write(b'\0' * (count % 1024))
        elif self.mode == READ:
            self._check_not_closed()
            return self._buffer.seek(offset, whence)

        return self.offset

    def readline(self, size=-1):
        """Read one line through the internal buffered reader."""
        self._check_not_closed()
        return self._buffer.readline(size)
400
401
402 def _read_exact(fp, n):
403 '''Read exactly *n* bytes from `fp`
404
405 This method is required because fp may be unbuffered,
406 i.e. return short reads.
407 '''
408 data = fp.read(n)
409 while len(data) < n:
410 b = fp.read(n - len(data))
411 if not b:
412 raise EOFError("Compressed file ended before the "
413 "end-of-stream marker was reached")
414 data += b
415 return data
416
417
def _read_gzip_header(fp):
    '''Consume one gzip member header from `fp`.

    Returns the header's last-modification time, or None when `fp` is
    already at end of file.  Raises BadGzipFile for a wrong magic number or
    an unknown compression method.
    '''
    magic = fp.read(2)
    if magic == b'':
        return None
    if magic != b'\037\213':
        raise BadGzipFile('Not a gzipped file (%r)' % magic)

    fixed_part = _read_exact(fp, 8)
    method, flag, last_mtime = struct.unpack("<BBIxx", fixed_part)
    if method != 8:
        raise BadGzipFile('Unknown compression method')

    if flag & FEXTRA:
        # Skip the extra field: a 16-bit length followed by that many bytes.
        (extra_len,) = struct.unpack("<H", _read_exact(fp, 2))
        _read_exact(fp, extra_len)
    # The file name and then the comment are null-terminated strings; both
    # are read and thrown away.
    for text_field in (FNAME, FCOMMENT):
        if flag & text_field:
            while True:
                byte = fp.read(1)
                if not byte or byte == b'\000':
                    break
    if flag & FHCRC:
        # Skip the 16-bit header CRC.
        _read_exact(fp, 2)
    return last_mtime
453
454
class _GzipReader(_compression.DecompressReader):
    """Raw reader that decompresses a gzip stream made of one or more members.

    The source file is wrapped in a _PaddedFile so that bytes read past the
    end of the deflate data can be pushed back and re-read as the member
    trailer or as the next member's header.
    """

    def __init__(self, fp):
        super().__init__(_PaddedFile(fp), zlib.decompressobj,
                         wbits=-zlib.MAX_WBITS)
        # Set flag indicating start of a new member
        self._new_member = True
        self._last_mtime = None

    def _init_read(self):
        # Reset the running CRC and size before decoding a member.
        self._crc = zlib.crc32(b"")
        self._stream_size = 0  # Decompressed size of unconcatenated stream

    def _read_gzip_header(self):
        # Read the next member header.  Returns False at end of file,
        # otherwise records the header's mtime and returns True.
        last_mtime = _read_gzip_header(self._fp)
        if last_mtime is None:
            return False
        self._last_mtime = last_mtime
        return True

    def read(self, size=-1):
        # Return up to *size* decompressed bytes, b"" at end of file, or
        # everything that is left when *size* is negative.
        if size < 0:
            return self.readall()
        # size=0 is special because decompress(max_length=0) is not supported
        if not size:
            return b""

        # For certain input data, a single
        # call to decompress() may not return
        # any data. In this case, retry until we get some data or reach EOF.
        while True:
            if self._decompressor.eof:
                # Ending case: we've come to the end of a member in the file,
                # so finish up this member, and read a new gzip header.
                # Check the CRC and file size, and set the flag so we read
                # a new member
                self._read_eof()
                self._new_member = True
                self._decompressor = self._decomp_factory(
                    **self._decomp_args)

            if self._new_member:
                # If the _new_member flag is set, we have to
                # jump to the next member, if there is one.
                self._init_read()
                if not self._read_gzip_header():
                    self._size = self._pos
                    return b""
                self._new_member = False

            # Read a chunk of data from the file
            buf = self._fp.read(io.DEFAULT_BUFFER_SIZE)

            uncompress = self._decompressor.decompress(buf, size)
            if self._decompressor.unconsumed_tail != b"":
                # Input the decompressor did not consume is pushed back so
                # that the next read from the file returns it again.
                self._fp.prepend(self._decompressor.unconsumed_tail)
            elif self._decompressor.unused_data != b"":
                # Prepend the already read bytes to the fileobj so they can
                # be seen by _read_eof() and _read_gzip_header()
                self._fp.prepend(self._decompressor.unused_data)

            if uncompress != b"":
                break
            if buf == b"":
                raise EOFError("Compressed file ended before the "
                               "end-of-stream marker was reached")

        self._add_read_data( uncompress )
        self._pos += len(uncompress)
        return uncompress

    def _add_read_data(self, data):
        # Fold *data* into the running CRC and size of the current member.
        self._crc = zlib.crc32(data, self._crc)
        self._stream_size = self._stream_size + len(data)

    def _read_eof(self):
        # We've read to the end of the file
        # We check that the computed CRC and size of the
        # uncompressed data matches the stored values.  Note that the size
        # stored is the true file size mod 2**32.
        crc32, isize = struct.unpack("<II", _read_exact(self._fp, 8))
        if crc32 != self._crc:
            raise BadGzipFile("CRC check failed %s != %s" % (hex(crc32),
                                                             hex(self._crc)))
        elif isize != (self._stream_size & 0xffffffff):
            raise BadGzipFile("Incorrect length of data produced")

        # Gzip files can be padded with zeroes and still have archives.
        # Consume all zero bytes and set the file position to the first
        # non-zero byte. See http://www.gzip.org/#faq8
        c = b"\x00"
        while c == b"\x00":
            c = self._fp.read(1)
        if c:
            # Push the first non-zero byte back for the next header read.
            self._fp.prepend(c)

    def _rewind(self):
        # After the base class rewinds, a member header must be read again.
        super()._rewind()
        self._new_member = True
553
554
def _create_simple_gzip_header(compresslevel: int,
                               mtime = None) -> bytes:
    """
    Build a minimal gzip header that carries no optional fields.
    :param compresslevel: Compression level; selects the xfl byte.
    :param mtime: Timestamp convertible to a 32-bit integer; the current
                  time is used when it is None.
    :return: The 10 header bytes.
    """
    timestamp = time.time() if mtime is None else mtime
    xfl = (2 if compresslevel == _COMPRESS_LEVEL_BEST
           else 4 if compresslevel == _COMPRESS_LEVEL_FAST
           else 0)
    # Fields in order: the two magic bytes, method (8=deflate), header flags
    # (none set), mtime, xfl and the OS byte (255 for unknown OS).
    return struct.pack("<BBBBLBB", 0x1f, 0x8b, 8, 0, int(timestamp), xfl, 255)
574
575
def compress(data, compresslevel=_COMPRESS_LEVEL_BEST, *, mtime=None):
    """Compress data in one shot and return the compressed string.

    compresslevel sets the compression level in range of 0-9.
    mtime can be used to set the modification time. The modification time is
    set to the current time by default.

    The header is always produced by _create_simple_gzip_header(), also for
    mtime=0, so the output for given arguments does not depend on the
    header that the underlying zlib library would write (its OS byte varies
    between platforms).
    """
    header = _create_simple_gzip_header(compresslevel, mtime)
    if isinstance(data, (bytes, bytearray)):
        size = len(data)
    else:
        # Any object supporting the buffer protocol is accepted; the trailer
        # must record its size in bytes, not its number of items.
        size = memoryview(data).nbytes
    trailer = struct.pack("<LL", zlib.crc32(data), size & 0xffffffff)
    # Wbits=-15 creates a raw deflate block.
    return (header + zlib.compress(data, level=compresslevel, wbits=-15) +
            trailer)
592
593
def decompress(data):
    """Decompress a gzip compressed string in one shot.

    All concatenated members are decoded and their contents joined.
    Raises EOFError for truncated input and BadGzipFile when a member's
    stored CRC or length does not match the decompressed data.
    """
    members = []
    while True:
        stream = io.BytesIO(data)
        if _read_gzip_header(stream) is None:
            # Nothing left: every member has been decoded.
            return b"".join(members)
        # Raw deflate decoder; the gzip header has been consumed above.
        inflater = zlib.decompressobj(wbits=-zlib.MAX_WBITS)
        payload = inflater.decompress(data[stream.tell():])
        leftover = inflater.unused_data
        if not inflater.eof or len(leftover) < 8:
            raise EOFError("Compressed file ended before the end-of-stream "
                           "marker was reached")
        # The 8-byte trailer holds the CRC32 and the length modulo 2**32.
        stored_crc, stored_length = struct.unpack("<II", leftover[:8])
        if stored_crc != zlib.crc32(payload):
            raise BadGzipFile("CRC check failed")
        if stored_length != (len(payload) & 0xffffffff):
            raise BadGzipFile("Incorrect length of data produced")
        members.append(payload)
        # Skip zero padding before a possible next member.
        data = leftover[8:].lstrip(b"\x00")
617
618
def main():
    """Command line interface: compress or decompress the given files.

    Each argument is processed in turn; "-" stands for standard input and
    output.  Input files are kept.  With -d a file name must end in ".gz"
    and is written without that suffix; otherwise ".gz" is appended.
    """
    from argparse import ArgumentParser
    parser = ArgumentParser(description=
        "A simple command line interface for the gzip module: act like gzip, "
        "but do not delete the input file.")
    group = parser.add_mutually_exclusive_group()
    group.add_argument('--fast', action='store_true', help='compress faster')
    group.add_argument('--best', action='store_true', help='compress better')
    group.add_argument("-d", "--decompress", action="store_true",
                        help="act like gunzip instead of gzip")

    parser.add_argument("args", nargs="*", default=["-"], metavar='file')
    args = parser.parse_args()

    compresslevel = _COMPRESS_LEVEL_TRADEOFF
    if args.fast:
        compresslevel = _COMPRESS_LEVEL_FAST
    elif args.best:
        compresslevel = _COMPRESS_LEVEL_BEST

    for arg in args.args:
        f = g = None
        try:
            if args.decompress:
                if arg == "-":
                    f = GzipFile(filename="", mode="rb",
                                 fileobj=sys.stdin.buffer)
                    g = sys.stdout.buffer
                else:
                    if arg[-3:] != ".gz":
                        sys.exit(f"filename doesn't end in .gz: {arg!r}")
                    f = open(arg, "rb")
                    g = builtins.open(arg[:-3], "wb")
            else:
                if arg == "-":
                    f = sys.stdin.buffer
                    g = GzipFile(filename="", mode="wb",
                                 fileobj=sys.stdout.buffer,
                                 compresslevel=compresslevel)
                else:
                    f = builtins.open(arg, "rb")
                    # Pass the selected level on; without it --fast/--best
                    # and the default level would be ignored for files.
                    g = open(arg + ".gz", "wb", compresslevel=compresslevel)
            while True:
                chunk = f.read(io.DEFAULT_BUFFER_SIZE)
                if not chunk:
                    break
                g.write(chunk)
        finally:
            # Release whatever was opened, even if opening the second
            # stream or copying failed.  The standard streams stay open.
            try:
                if g is not None and g is not sys.stdout.buffer:
                    g.close()
            finally:
                if f is not None and f is not sys.stdin.buffer:
                    f.close()
666
# Run the command line interface when this file is executed as a script.
if __name__ == '__main__':
    main()