(root)/
binutils-2.41/
zlib/
examples/
zran.c
       1  /* zran.c -- example of zlib/gzip stream indexing and random access
       2   * Copyright (C) 2005, 2012, 2018 Mark Adler
       3   * For conditions of distribution and use, see copyright notice in zlib.h
       4   * Version 1.2  14 Oct 2018  Mark Adler */
       5  
       6  /* Version History:
       7   1.0  29 May 2005  First version
       8   1.1  29 Sep 2012  Fix memory reallocation error
       9   1.2  14 Oct 2018  Handle gzip streams with multiple members
      10                     Add a header file to facilitate usage in applications
      11   */
      12  
      13  /* Illustrate the use of Z_BLOCK, inflatePrime(), and inflateSetDictionary()
      14     for random access of a compressed file.  A file containing a zlib or gzip
      15     stream is provided on the command line.  The compressed stream is decoded in
      16     its entirety, and an index built with access points about every SPAN bytes
      17     in the uncompressed output.  The compressed file is left open, and can then
      18     be read randomly, having to decompress on the average SPAN/2 uncompressed
      19     bytes before getting to the desired block of data.
      20  
      21     An access point can be created at the start of any deflate block, by saving
      22     the starting file offset and bit of that block, and the 32K bytes of
      23     uncompressed data that precede that block.  Also the uncompressed offset of
      24     that block is saved to provide a referece for locating a desired starting
      25     point in the uncompressed stream.  deflate_index_build() works by
      26     decompressing the input zlib or gzip stream a block at a time, and at the
      27     end of each block deciding if enough uncompressed data has gone by to
      28     justify the creation of a new access point.  If so, that point is saved in a
      29     data structure that grows as needed to accommodate the points.
      30  
      31     To use the index, an offset in the uncompressed data is provided, for which
      32     the latest access point at or preceding that offset is located in the index.
      33     The input file is positioned to the specified location in the index, and if
      34     necessary the first few bits of the compressed data is read from the file.
      35     inflate is initialized with those bits and the 32K of uncompressed data, and
      36     the decompression then proceeds until the desired offset in the file is
      37     reached.  Then the decompression continues to read the desired uncompressed
      38     data from the file.
      39  
      40     Another approach would be to generate the index on demand.  In that case,
      41     requests for random access reads from the compressed data would try to use
      42     the index, but if a read far enough past the end of the index is required,
      43     then further index entries would be generated and added.
      44  
      45     There is some fair bit of overhead to starting inflation for the random
      46     access, mainly copying the 32K byte dictionary.  So if small pieces of the
      47     file are being accessed, it would make sense to implement a cache to hold
      48     some lookahead and avoid many calls to deflate_index_extract() for small
      49     lengths.
      50  
      51     Another way to build an index would be to use inflateCopy().  That would
      52     not be constrained to have access points at block boundaries, but requires
      53     more memory per access point, and also cannot be saved to file due to the
      54     use of pointers in the state.  The approach here allows for storage of the
      55     index in a file.
      56   */
      57  
      58  #include <stdio.h>
      59  #include <stdlib.h>
      60  #include <string.h>
      61  #include "zlib.h"
      62  #include "zran.h"
      63  
      64  #define WINSIZE 32768U      /* sliding window size */
      65  #define CHUNK 16384         /* file input buffer size */
      66  
      67  /* Access point entry. */
      68  struct point {
      69      off_t out;          /* corresponding offset in uncompressed data */
      70      off_t in;           /* offset in input file of first full byte */
      71      int bits;           /* number of bits (1-7) from byte at in-1, or 0 */
      72      unsigned char window[WINSIZE];  /* preceding 32K of uncompressed data */
      73  };
      74  
      75  /* See comments in zran.h. */
      76  void deflate_index_free(struct deflate_index *index)
      77  {
      78      if (index != NULL) {
      79          free(index->list);
      80          free(index);
      81      }
      82  }
      83  
      84  /* Add an entry to the access point list. If out of memory, deallocate the
      85     existing list and return NULL. index->gzip is the allocated size of the
      86     index in point entries, until it is time for deflate_index_build() to
      87     return, at which point gzip is set to indicate a gzip file or not.
      88   */
      89  static struct deflate_index *addpoint(struct deflate_index *index, int bits,
      90                                        off_t in, off_t out, unsigned left,
      91                                        unsigned char *window)
      92  {
      93      struct point *next;
      94  
      95      /* if list is empty, create it (start with eight points) */
      96      if (index == NULL) {
      97          index = malloc(sizeof(struct deflate_index));
      98          if (index == NULL) return NULL;
      99          index->list = malloc(sizeof(struct point) << 3);
     100          if (index->list == NULL) {
     101              free(index);
     102              return NULL;
     103          }
     104          index->gzip = 8;
     105          index->have = 0;
     106      }
     107  
     108      /* if list is full, make it bigger */
     109      else if (index->have == index->gzip) {
     110          index->gzip <<= 1;
     111          next = realloc(index->list, sizeof(struct point) * index->gzip);
     112          if (next == NULL) {
     113              deflate_index_free(index);
     114              return NULL;
     115          }
     116          index->list = next;
     117      }
     118  
     119      /* fill in entry and increment how many we have */
     120      next = (struct point *)(index->list) + index->have;
     121      next->bits = bits;
     122      next->in = in;
     123      next->out = out;
     124      if (left)
     125          memcpy(next->window, window + WINSIZE - left, left);
     126      if (left < WINSIZE)
     127          memcpy(next->window + left, window, WINSIZE - left);
     128      index->have++;
     129  
     130      /* return list, possibly reallocated */
     131      return index;
     132  }
     133  
     134  /* See comments in zran.h. */
     135  int deflate_index_build(FILE *in, off_t span, struct deflate_index **built)
     136  {
     137      int ret;
     138      int gzip = 0;               /* true if reading a gzip file */
     139      off_t totin, totout;        /* our own total counters to avoid 4GB limit */
     140      off_t last;                 /* totout value of last access point */
     141      struct deflate_index *index;    /* access points being generated */
     142      z_stream strm;
     143      unsigned char input[CHUNK];
     144      unsigned char window[WINSIZE];
     145  
     146      /* initialize inflate */
     147      strm.zalloc = Z_NULL;
     148      strm.zfree = Z_NULL;
     149      strm.opaque = Z_NULL;
     150      strm.avail_in = 0;
     151      strm.next_in = Z_NULL;
     152      ret = inflateInit2(&strm, 47);      /* automatic zlib or gzip decoding */
     153      if (ret != Z_OK)
     154          return ret;
     155  
     156      /* inflate the input, maintain a sliding window, and build an index -- this
     157         also validates the integrity of the compressed data using the check
     158         information in the gzip or zlib stream */
     159      totin = totout = last = 0;
     160      index = NULL;               /* will be allocated by first addpoint() */
     161      strm.avail_out = 0;
     162      do {
     163          /* get some compressed data from input file */
     164          strm.avail_in = fread(input, 1, CHUNK, in);
     165          if (ferror(in)) {
     166              ret = Z_ERRNO;
     167              goto deflate_index_build_error;
     168          }
     169          if (strm.avail_in == 0) {
     170              ret = Z_DATA_ERROR;
     171              goto deflate_index_build_error;
     172          }
     173          strm.next_in = input;
     174  
     175          /* check for a gzip stream */
     176          if (totin == 0 && strm.avail_in >= 3 &&
     177              input[0] == 31 && input[1] == 139 && input[2] == 8)
     178              gzip = 1;
     179  
     180          /* process all of that, or until end of stream */
     181          do {
     182              /* reset sliding window if necessary */
     183              if (strm.avail_out == 0) {
     184                  strm.avail_out = WINSIZE;
     185                  strm.next_out = window;
     186              }
     187  
     188              /* inflate until out of input, output, or at end of block --
     189                 update the total input and output counters */
     190              totin += strm.avail_in;
     191              totout += strm.avail_out;
     192              ret = inflate(&strm, Z_BLOCK);      /* return at end of block */
     193              totin -= strm.avail_in;
     194              totout -= strm.avail_out;
     195              if (ret == Z_NEED_DICT)
     196                  ret = Z_DATA_ERROR;
     197              if (ret == Z_MEM_ERROR || ret == Z_DATA_ERROR)
     198                  goto deflate_index_build_error;
     199              if (ret == Z_STREAM_END) {
     200                  if (gzip &&
     201                      (strm.avail_in || ungetc(getc(in), in) != EOF)) {
     202                      ret = inflateReset(&strm);
     203                      if (ret != Z_OK)
     204                          goto deflate_index_build_error;
     205                      continue;
     206                  }
     207                  break;
     208              }
     209  
     210              /* if at end of block, consider adding an index entry (note that if
     211                 data_type indicates an end-of-block, then all of the
     212                 uncompressed data from that block has been delivered, and none
     213                 of the compressed data after that block has been consumed,
     214                 except for up to seven bits) -- the totout == 0 provides an
     215                 entry point after the zlib or gzip header, and assures that the
     216                 index always has at least one access point; we avoid creating an
     217                 access point after the last block by checking bit 6 of data_type
     218               */
     219              if ((strm.data_type & 128) && !(strm.data_type & 64) &&
     220                  (totout == 0 || totout - last > span)) {
     221                  index = addpoint(index, strm.data_type & 7, totin,
     222                                   totout, strm.avail_out, window);
     223                  if (index == NULL) {
     224                      ret = Z_MEM_ERROR;
     225                      goto deflate_index_build_error;
     226                  }
     227                  last = totout;
     228              }
     229          } while (strm.avail_in != 0);
     230      } while (ret != Z_STREAM_END);
     231  
     232      /* clean up and return index (release unused entries in list) */
     233      (void)inflateEnd(&strm);
     234      index->list = realloc(index->list, sizeof(struct point) * index->have);
     235      index->gzip = gzip;
     236      index->length = totout;
     237      *built = index;
     238      return index->have;
     239  
     240      /* return error */
     241    deflate_index_build_error:
     242      (void)inflateEnd(&strm);
     243      deflate_index_free(index);
     244      return ret;
     245  }
     246  
     247  /* See comments in zran.h. */
     248  int deflate_index_extract(FILE *in, struct deflate_index *index, off_t offset,
     249                            unsigned char *buf, int len)
     250  {
     251      int ret, skip;
     252      z_stream strm;
     253      struct point *here;
     254      unsigned char input[CHUNK];
     255      unsigned char discard[WINSIZE];
     256  
     257      /* proceed only if something reasonable to do */
     258      if (len < 0)
     259          return 0;
     260  
     261      /* find where in stream to start */
     262      here = index->list;
     263      ret = index->have;
     264      while (--ret && here[1].out <= offset)
     265          here++;
     266  
     267      /* initialize file and inflate state to start there */
     268      strm.zalloc = Z_NULL;
     269      strm.zfree = Z_NULL;
     270      strm.opaque = Z_NULL;
     271      strm.avail_in = 0;
     272      strm.next_in = Z_NULL;
     273      ret = inflateInit2(&strm, -15);         /* raw inflate */
     274      if (ret != Z_OK)
     275          return ret;
     276      ret = fseeko(in, here->in - (here->bits ? 1 : 0), SEEK_SET);
     277      if (ret == -1)
     278          goto deflate_index_extract_ret;
     279      if (here->bits) {
     280          ret = getc(in);
     281          if (ret == -1) {
     282              ret = ferror(in) ? Z_ERRNO : Z_DATA_ERROR;
     283              goto deflate_index_extract_ret;
     284          }
     285          (void)inflatePrime(&strm, here->bits, ret >> (8 - here->bits));
     286      }
     287      (void)inflateSetDictionary(&strm, here->window, WINSIZE);
     288  
     289      /* skip uncompressed bytes until offset reached, then satisfy request */
     290      offset -= here->out;
     291      strm.avail_in = 0;
     292      skip = 1;                               /* while skipping to offset */
     293      do {
     294          /* define where to put uncompressed data, and how much */
     295          if (offset > WINSIZE) {             /* skip WINSIZE bytes */
     296              strm.avail_out = WINSIZE;
     297              strm.next_out = discard;
     298              offset -= WINSIZE;
     299          }
     300          else if (offset > 0) {              /* last skip */
     301              strm.avail_out = (unsigned)offset;
     302              strm.next_out = discard;
     303              offset = 0;
     304          }
     305          else if (skip) {                    /* at offset now */
     306              strm.avail_out = len;
     307              strm.next_out = buf;
     308              skip = 0;                       /* only do this once */
     309          }
     310  
     311          /* uncompress until avail_out filled, or end of stream */
     312          do {
     313              if (strm.avail_in == 0) {
     314                  strm.avail_in = fread(input, 1, CHUNK, in);
     315                  if (ferror(in)) {
     316                      ret = Z_ERRNO;
     317                      goto deflate_index_extract_ret;
     318                  }
     319                  if (strm.avail_in == 0) {
     320                      ret = Z_DATA_ERROR;
     321                      goto deflate_index_extract_ret;
     322                  }
     323                  strm.next_in = input;
     324              }
     325              ret = inflate(&strm, Z_NO_FLUSH);       /* normal inflate */
     326              if (ret == Z_NEED_DICT)
     327                  ret = Z_DATA_ERROR;
     328              if (ret == Z_MEM_ERROR || ret == Z_DATA_ERROR)
     329                  goto deflate_index_extract_ret;
     330              if (ret == Z_STREAM_END) {
     331                  /* the raw deflate stream has ended */
     332                  if (index->gzip == 0)
     333                      /* this is a zlib stream that has ended -- done */
     334                      break;
     335  
     336                  /* near the end of a gzip member, which might be followed by
     337                     another gzip member -- skip the gzip trailer and see if
     338                     there is more input after it */
     339                  if (strm.avail_in < 8) {
     340                      fseeko(in, 8 - strm.avail_in, SEEK_CUR);
     341                      strm.avail_in = 0;
     342                  }
     343                  else {
     344                      strm.avail_in -= 8;
     345                      strm.next_in += 8;
     346                  }
     347                  if (strm.avail_in == 0 && ungetc(getc(in), in) == EOF)
     348                      /* the input ended after the gzip trailer -- done */
     349                      break;
     350  
     351                  /* there is more input, so another gzip member should follow --
     352                     validate and skip the gzip header */
     353                  ret = inflateReset2(&strm, 31);
     354                  if (ret != Z_OK)
     355                      goto deflate_index_extract_ret;
     356                  do {
     357                      if (strm.avail_in == 0) {
     358                          strm.avail_in = fread(input, 1, CHUNK, in);
     359                          if (ferror(in)) {
     360                              ret = Z_ERRNO;
     361                              goto deflate_index_extract_ret;
     362                          }
     363                          if (strm.avail_in == 0) {
     364                              ret = Z_DATA_ERROR;
     365                              goto deflate_index_extract_ret;
     366                          }
     367                          strm.next_in = input;
     368                      }
     369                      ret = inflate(&strm, Z_BLOCK);
     370                      if (ret == Z_MEM_ERROR || ret == Z_DATA_ERROR)
     371                          goto deflate_index_extract_ret;
     372                  } while ((strm.data_type & 128) == 0);
     373  
     374                  /* set up to continue decompression of the raw deflate stream
     375                     that follows the gzip header */
     376                  ret = inflateReset2(&strm, -15);
     377                  if (ret != Z_OK)
     378                      goto deflate_index_extract_ret;
     379              }
     380  
     381              /* continue to process the available input before reading more */
     382          } while (strm.avail_out != 0);
     383  
     384          if (ret == Z_STREAM_END)
     385              /* reached the end of the compressed data -- return the data that
     386                 was available, possibly less than requested */
     387              break;
     388  
     389          /* do until offset reached and requested data read */
     390      } while (skip);
     391  
     392      /* compute the number of uncompressed bytes read after the offset */
     393      ret = skip ? 0 : len - strm.avail_out;
     394  
     395      /* clean up and return the bytes read, or the negative error */
     396    deflate_index_extract_ret:
     397      (void)inflateEnd(&strm);
     398      return ret;
     399  }
     400  
     401  #ifdef TEST
     402  
     403  #define SPAN 1048576L       /* desired distance between access points */
     404  #define LEN 16384           /* number of bytes to extract */
     405  
     406  /* Demonstrate the use of deflate_index_build() and deflate_index_extract() by
     407     processing the file provided on the command line, and extracting LEN bytes
     408     from 2/3rds of the way through the uncompressed output, writing that to
     409     stdout. An offset can be provided as the second argument, in which case the
     410     data is extracted from there instead. */
     411  int main(int argc, char **argv)
     412  {
     413      int len;
     414      off_t offset = -1;
     415      FILE *in;
     416      struct deflate_index *index = NULL;
     417      unsigned char buf[LEN];
     418  
     419      /* open input file */
     420      if (argc < 2 || argc > 3) {
     421          fprintf(stderr, "usage: zran file.gz [offset]\n");
     422          return 1;
     423      }
     424      in = fopen(argv[1], "rb");
     425      if (in == NULL) {
     426          fprintf(stderr, "zran: could not open %s for reading\n", argv[1]);
     427          return 1;
     428      }
     429  
     430      /* get optional offset */
     431      if (argc == 3) {
     432          char *end;
     433          offset = strtoll(argv[2], &end, 10);
     434          if (*end || offset < 0) {
     435              fprintf(stderr, "zran: %s is not a valid offset\n", argv[2]);
     436              return 1;
     437          }
     438      }
     439  
     440      /* build index */
     441      len = deflate_index_build(in, SPAN, &index);
     442      if (len < 0) {
     443          fclose(in);
     444          switch (len) {
     445          case Z_MEM_ERROR:
     446              fprintf(stderr, "zran: out of memory\n");
     447              break;
     448          case Z_DATA_ERROR:
     449              fprintf(stderr, "zran: compressed data error in %s\n", argv[1]);
     450              break;
     451          case Z_ERRNO:
     452              fprintf(stderr, "zran: read error on %s\n", argv[1]);
     453              break;
     454          default:
     455              fprintf(stderr, "zran: error %d while building index\n", len);
     456          }
     457          return 1;
     458      }
     459      fprintf(stderr, "zran: built index with %d access points\n", len);
     460  
     461      /* use index by reading some bytes from an arbitrary offset */
     462      if (offset == -1)
     463          offset = (index->length << 1) / 3;
     464      len = deflate_index_extract(in, index, offset, buf, LEN);
     465      if (len < 0)
     466          fprintf(stderr, "zran: extraction failed: %s error\n",
     467                  len == Z_MEM_ERROR ? "out of memory" : "input corrupted");
     468      else {
     469          fwrite(buf, 1, len, stdout);
     470          fprintf(stderr, "zran: extracted %d bytes at %llu\n", len, offset);
     471      }
     472  
     473      /* clean up and exit */
     474      deflate_index_free(index);
     475      fclose(in);
     476      return 0;
     477  }
     478  
     479  #endif