(root)/
freetype-2.13.2/
src/
gzip/
crc32.c
       1  /* crc32.c -- compute the CRC-32 of a data stream
       2   * Copyright (C) 1995-2022 Mark Adler
       3   * For conditions of distribution and use, see copyright notice in zlib.h
       4   *
       5   * This interleaved implementation of a CRC makes use of pipelined multiple
       6   * arithmetic-logic units, commonly found in modern CPU cores. It is due to
       7   * Kadatch and Jenkins (2010). See doc/crc-doc.1.0.pdf in this distribution.
       8   */
       9  
      10  /* @(#) $Id$ */
      11  
      12  /*
      13    Note on the use of DYNAMIC_CRC_TABLE: there is no mutex or semaphore
      14    protection on the static variables used to control the first-use generation
      15    of the crc tables. Therefore, if you #define DYNAMIC_CRC_TABLE, you should
      16    first call get_crc_table() to initialize the tables before allowing more than
      17    one thread to use crc32().
      18  
      19    MAKECRCH can be #defined to write out crc32.h. A main() routine is also
      20    produced, so that this one source file can be compiled to an executable.
      21   */
      22  
      23  #ifdef MAKECRCH
      24  #  include <stdio.h>
      25  #  ifndef DYNAMIC_CRC_TABLE
      26  #    define DYNAMIC_CRC_TABLE
      27  #  endif /* !DYNAMIC_CRC_TABLE */
      28  #endif /* MAKECRCH */
      29  
      30  #include "zutil.h"      /* for Z_U4, Z_U8, z_crc_t, and FAR definitions */
      31  
      32   /*
      33    A CRC of a message is computed on N braids of words in the message, where
      34    each word consists of W bytes (4 or 8). If N is 3, for example, then three
      35    running sparse CRCs are calculated respectively on each braid, at these
      36    indices in the array of words: 0, 3, 6, ..., 1, 4, 7, ..., and 2, 5, 8, ...
      37    This is done starting at a word boundary, and continues until as many blocks
      38    of N * W bytes as are available have been processed. The results are combined
      39    into a single CRC at the end. For this code, N must be in the range 1..6 and
      40    W must be 4 or 8. The upper limit on N can be increased if desired by adding
      41    more #if blocks, extending the patterns apparent in the code. In addition,
      42    crc32.h would need to be regenerated, if the maximum N value is increased.
      43  
      44    N and W are chosen empirically by benchmarking the execution time on a given
      45    processor. The choices for N and W below were based on testing on Intel Kaby
      46    Lake i7, AMD Ryzen 7, ARM Cortex-A57, Sparc64-VII, PowerPC POWER9, and MIPS64
      47    Octeon II processors. The Intel, AMD, and ARM processors were all fastest
      48    with N=5, W=8. The Sparc, PowerPC, and MIPS64 were all fastest at N=5, W=4.
      49    They were all tested with either gcc or clang, all using the -O3 optimization
      50    level. Your mileage may vary.
      51   */
      52  
      53  /* Define N */
      54  #ifdef Z_TESTN
      55  #  define N Z_TESTN
      56  #else
      57  #  define N 5
      58  #endif
      59  #if N < 1 || N > 6
      60  #  error N must be in 1..6
      61  #endif
      62  
      63  /*
      64    z_crc_t must be at least 32 bits. z_word_t must be at least as long as
      65    z_crc_t. It is assumed here that z_word_t is either 32 bits or 64 bits, and
      66    that bytes are eight bits.
      67   */
      68  
      69  /*
      70    Define W and the associated z_word_t type. If W is not defined, then a
      71    braided calculation is not used, and the associated tables and code are not
      72    compiled.
      73   */
      74  #ifdef Z_TESTW
      75  #  if Z_TESTW-1 != -1
      76  #    define W Z_TESTW
      77  #  endif
      78  #else
      79  #  ifdef MAKECRCH
      80  #    define W 8         /* required for MAKECRCH */
      81  #  else
      82  #    if defined(__x86_64__) || defined(__aarch64__)
      83  #      define W 8
      84  #    else
      85  #      define W 4
      86  #    endif
      87  #  endif
      88  #endif
      89  #ifdef W
      90  #  if W == 8 && defined(Z_U8)
      91       typedef Z_U8 z_word_t;
      92  #  elif defined(Z_U4)
      93  #    undef W
      94  #    define W 4
      95       typedef Z_U4 z_word_t;
      96  #  else
      97  #    undef W
      98  #  endif
      99  #endif
     100  
     101  /* If available, use the ARM processor CRC32 instruction. */
     102  #if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) && W == 8
     103  #  define ARMCRC32
     104  #endif
     105  
     106  #ifndef Z_FREETYPE
     107  /* Local functions. */
     108  local z_crc_t multmodp OF((z_crc_t a, z_crc_t b));
     109  local z_crc_t x2nmodp OF((z_off64_t n, unsigned k));
     110  #endif  /* Z_FREETYPE */
     111  
     112  #if defined(W) && (!defined(ARMCRC32) || defined(DYNAMIC_CRC_TABLE))
     113      local z_word_t byte_swap OF((z_word_t word));
     114  #endif
     115  
     116  #if defined(W) && !defined(ARMCRC32)
     117      local z_crc_t crc_word OF((z_word_t data));
     118      local z_word_t crc_word_big OF((z_word_t data));
     119  #endif
     120  
     121  #if defined(W) && (!defined(ARMCRC32) || defined(DYNAMIC_CRC_TABLE))
     122  /*
     123    Swap the bytes in a z_word_t to convert between little and big endian. Any
     124    self-respecting compiler will optimize this to a single machine byte-swap
     125    instruction, if one is available. This assumes that word_t is either 32 bits
     126    or 64 bits.
     127   */
     128  local z_word_t byte_swap(
     129      z_word_t word)
     130  {
     131  #  if W == 8
     132      return
     133          (word & 0xff00000000000000) >> 56 |
     134          (word & 0xff000000000000) >> 40 |
     135          (word & 0xff0000000000) >> 24 |
     136          (word & 0xff00000000) >> 8 |
     137          (word & 0xff000000) << 8 |
     138          (word & 0xff0000) << 24 |
     139          (word & 0xff00) << 40 |
     140          (word & 0xff) << 56;
     141  #  else   /* W == 4 */
     142      return
     143          (word & 0xff000000) >> 24 |
     144          (word & 0xff0000) >> 8 |
     145          (word & 0xff00) << 8 |
     146          (word & 0xff) << 24;
     147  #  endif
     148  }
     149  #endif
     150  
     151  /* CRC polynomial. */
     152  #define POLY 0xedb88320         /* p(x) reflected, with x^32 implied */
     153  
     154  #ifdef DYNAMIC_CRC_TABLE
     155  
     156  local z_crc_t FAR crc_table[256];
     157  local z_crc_t FAR x2n_table[32];
     158  local void make_crc_table OF((void));
     159  #ifdef W
     160     local z_word_t FAR crc_big_table[256];
     161     local z_crc_t FAR crc_braid_table[W][256];
     162     local z_word_t FAR crc_braid_big_table[W][256];
     163     local void braid OF((z_crc_t [][256], z_word_t [][256], int, int));
     164  #endif
     165  #ifdef MAKECRCH
     166     local void write_table OF((FILE *, const z_crc_t FAR *, int));
     167     local void write_table32hi OF((FILE *, const z_word_t FAR *, int));
     168     local void write_table64 OF((FILE *, const z_word_t FAR *, int));
     169  #endif /* MAKECRCH */
     170  
     171  /*
     172    Define a once() function depending on the availability of atomics. If this is
     173    compiled with DYNAMIC_CRC_TABLE defined, and if CRCs will be computed in
     174    multiple threads, and if atomics are not available, then get_crc_table() must
     175    be called to initialize the tables and must return before any threads are
     176    allowed to compute or combine CRCs.
     177   */
     178  
     179  /* Definition of once functionality. */
     180  typedef struct once_s once_t;
     181  local void once OF((once_t *, void (*)(void)));
     182  
     183  /* Check for the availability of atomics. */
     184  #if defined(__STDC__) && __STDC_VERSION__ >= 201112L && \
     185      !defined(__STDC_NO_ATOMICS__)
     186  
     187  #include <stdatomic.h>
     188  
     189  /* Structure for once(), which must be initialized with ONCE_INIT. */
     190  struct once_s {
     191      atomic_flag begun;
     192      atomic_int done;
     193  };
     194  #define ONCE_INIT {ATOMIC_FLAG_INIT, 0}
     195  
     196  /*
     197    Run the provided init() function exactly once, even if multiple threads
     198    invoke once() at the same time. The state must be a once_t initialized with
     199    ONCE_INIT.
     200   */
     201  local void once(state, init)
     202      once_t *state;
     203      void (*init)(void);
     204  {
     205      if (!atomic_load(&state->done)) {
     206          if (atomic_flag_test_and_set(&state->begun))
     207              while (!atomic_load(&state->done))
     208                  ;
     209          else {
     210              init();
     211              atomic_store(&state->done, 1);
     212          }
     213      }
     214  }
     215  
     216  #else   /* no atomics */
     217  
     218  /* Structure for once(), which must be initialized with ONCE_INIT. */
     219  struct once_s {
     220      volatile int begun;
     221      volatile int done;
     222  };
     223  #define ONCE_INIT {0, 0}
     224  
     225  /* Test and set. Alas, not atomic, but tries to minimize the period of
     226     vulnerability. */
     227  local int test_and_set OF((int volatile *));
     228  local int test_and_set(
     229      int volatile *flag)
     230  {
     231      int was;
     232  
     233      was = *flag;
     234      *flag = 1;
     235      return was;
     236  }
     237  
     238  /* Run the provided init() function once. This is not thread-safe. */
     239  local void once(state, init)
     240      once_t *state;
     241      void (*init)(void);
     242  {
     243      if (!state->done) {
     244          if (test_and_set(&state->begun))
     245              while (!state->done)
     246                  ;
     247          else {
     248              init();
     249              state->done = 1;
     250          }
     251      }
     252  }
     253  
     254  #endif
     255  
     256  /* State for once(). */
     257  local once_t made = ONCE_INIT;
     258  
     259  /*
     260    Generate tables for a byte-wise 32-bit CRC calculation on the polynomial:
     261    x^32+x^26+x^23+x^22+x^16+x^12+x^11+x^10+x^8+x^7+x^5+x^4+x^2+x+1.
     262  
     263    Polynomials over GF(2) are represented in binary, one bit per coefficient,
     264    with the lowest powers in the most significant bit. Then adding polynomials
     265    is just exclusive-or, and multiplying a polynomial by x is a right shift by
     266    one. If we call the above polynomial p, and represent a byte as the
     267    polynomial q, also with the lowest power in the most significant bit (so the
     268    byte 0xb1 is the polynomial x^7+x^3+x^2+1), then the CRC is (q*x^32) mod p,
     269    where a mod b means the remainder after dividing a by b.
     270  
     271    This calculation is done using the shift-register method of multiplying and
     272    taking the remainder. The register is initialized to zero, and for each
     273    incoming bit, x^32 is added mod p to the register if the bit is a one (where
     274    x^32 mod p is p+x^32 = x^26+...+1), and the register is multiplied mod p by x
     275    (which is shifting right by one and adding x^32 mod p if the bit shifted out
     276    is a one). We start with the highest power (least significant bit) of q and
     277    repeat for all eight bits of q.
     278  
     279    The table is simply the CRC of all possible eight bit values. This is all the
     280    information needed to generate CRCs on data a byte at a time for all
     281    combinations of CRC register values and incoming bytes.
     282   */
     283  
     284  local void make_crc_table()
     285  {
     286      unsigned i, j, n;
     287      z_crc_t p;
     288  
     289      /* initialize the CRC of bytes tables */
     290      for (i = 0; i < 256; i++) {
     291          p = i;
     292          for (j = 0; j < 8; j++)
     293              p = p & 1 ? (p >> 1) ^ POLY : p >> 1;
     294          crc_table[i] = p;
     295  #ifdef W
     296          crc_big_table[i] = byte_swap(p);
     297  #endif
     298      }
     299  
     300      /* initialize the x^2^n mod p(x) table */
     301      p = (z_crc_t)1 << 30;         /* x^1 */
     302      x2n_table[0] = p;
     303      for (n = 1; n < 32; n++)
     304          x2n_table[n] = p = multmodp(p, p);
     305  
     306  #ifdef W
     307      /* initialize the braiding tables -- needs x2n_table[] */
     308      braid(crc_braid_table, crc_braid_big_table, N, W);
     309  #endif
     310  
     311  #ifdef MAKECRCH
     312      {
     313          /*
     314            The crc32.h header file contains tables for both 32-bit and 64-bit
     315            z_word_t's, and so requires a 64-bit type be available. In that case,
     316            z_word_t must be defined to be 64-bits. This code then also generates
     317            and writes out the tables for the case that z_word_t is 32 bits.
     318           */
     319  #if !defined(W) || W != 8
     320  #  error Need a 64-bit integer type in order to generate crc32.h.
     321  #endif
     322          FILE *out;
     323          int k, n;
     324          z_crc_t ltl[8][256];
     325          z_word_t big[8][256];
     326  
     327          out = fopen("crc32.h", "w");
     328          if (out == NULL) return;
     329  
     330          /* write out little-endian CRC table to crc32.h */
     331          fprintf(out,
     332              "/* crc32.h -- tables for rapid CRC calculation\n"
     333              " * Generated automatically by crc32.c\n */\n"
     334              "\n"
     335              "local const z_crc_t FAR crc_table[] = {\n"
     336              "    ");
     337          write_table(out, crc_table, 256);
     338          fprintf(out,
     339              "};\n");
     340  
     341          /* write out big-endian CRC table for 64-bit z_word_t to crc32.h */
     342          fprintf(out,
     343              "\n"
     344              "#ifdef W\n"
     345              "\n"
     346              "#if W == 8\n"
     347              "\n"
     348              "local const z_word_t FAR crc_big_table[] = {\n"
     349              "    ");
     350          write_table64(out, crc_big_table, 256);
     351          fprintf(out,
     352              "};\n");
     353  
     354          /* write out big-endian CRC table for 32-bit z_word_t to crc32.h */
     355          fprintf(out,
     356              "\n"
     357              "#else /* W == 4 */\n"
     358              "\n"
     359              "local const z_word_t FAR crc_big_table[] = {\n"
     360              "    ");
     361          write_table32hi(out, crc_big_table, 256);
     362          fprintf(out,
     363              "};\n"
     364              "\n"
     365              "#endif\n");
     366  
     367          /* write out braid tables for each value of N */
     368          for (n = 1; n <= 6; n++) {
     369              fprintf(out,
     370              "\n"
     371              "#if N == %d\n", n);
     372  
     373              /* compute braid tables for this N and 64-bit word_t */
     374              braid(ltl, big, n, 8);
     375  
     376              /* write out braid tables for 64-bit z_word_t to crc32.h */
     377              fprintf(out,
     378              "\n"
     379              "#if W == 8\n"
     380              "\n"
     381              "local const z_crc_t FAR crc_braid_table[][256] = {\n");
     382              for (k = 0; k < 8; k++) {
     383                  fprintf(out, "   {");
     384                  write_table(out, ltl[k], 256);
     385                  fprintf(out, "}%s", k < 7 ? ",\n" : "");
     386              }
     387              fprintf(out,
     388              "};\n"
     389              "\n"
     390              "local const z_word_t FAR crc_braid_big_table[][256] = {\n");
     391              for (k = 0; k < 8; k++) {
     392                  fprintf(out, "   {");
     393                  write_table64(out, big[k], 256);
     394                  fprintf(out, "}%s", k < 7 ? ",\n" : "");
     395              }
     396              fprintf(out,
     397              "};\n");
     398  
     399              /* compute braid tables for this N and 32-bit word_t */
     400              braid(ltl, big, n, 4);
     401  
     402              /* write out braid tables for 32-bit z_word_t to crc32.h */
     403              fprintf(out,
     404              "\n"
     405              "#else /* W == 4 */\n"
     406              "\n"
     407              "local const z_crc_t FAR crc_braid_table[][256] = {\n");
     408              for (k = 0; k < 4; k++) {
     409                  fprintf(out, "   {");
     410                  write_table(out, ltl[k], 256);
     411                  fprintf(out, "}%s", k < 3 ? ",\n" : "");
     412              }
     413              fprintf(out,
     414              "};\n"
     415              "\n"
     416              "local const z_word_t FAR crc_braid_big_table[][256] = {\n");
     417              for (k = 0; k < 4; k++) {
     418                  fprintf(out, "   {");
     419                  write_table32hi(out, big[k], 256);
     420                  fprintf(out, "}%s", k < 3 ? ",\n" : "");
     421              }
     422              fprintf(out,
     423              "};\n"
     424              "\n"
     425              "#endif\n"
     426              "\n"
     427              "#endif\n");
     428          }
     429          fprintf(out,
     430              "\n"
     431              "#endif\n");
     432  
     433          /* write out zeros operator table to crc32.h */
     434          fprintf(out,
     435              "\n"
     436              "local const z_crc_t FAR x2n_table[] = {\n"
     437              "    ");
     438          write_table(out, x2n_table, 32);
     439          fprintf(out,
     440              "};\n");
     441          fclose(out);
     442      }
     443  #endif /* MAKECRCH */
     444  }
     445  
     446  #ifdef MAKECRCH
     447  
     448  /*
     449     Write the 32-bit values in table[0..k-1] to out, five per line in
     450     hexadecimal separated by commas.
     451   */
     452  local void write_table(
     453      FILE *out,
     454      const z_crc_t FAR *table,
     455      int k)
     456  {
     457      int n;
     458  
     459      for (n = 0; n < k; n++)
     460          fprintf(out, "%s0x%08lx%s", n == 0 || n % 5 ? "" : "    ",
     461                  (unsigned long)(table[n]),
     462                  n == k - 1 ? "" : (n % 5 == 4 ? ",\n" : ", "));
     463  }
     464  
     465  /*
     466     Write the high 32-bits of each value in table[0..k-1] to out, five per line
     467     in hexadecimal separated by commas.
     468   */
     469  local void write_table32hi(
     470      FILE *out,
     471      const z_word_t FAR *table,
     472      int k)
     473  {
     474      int n;
     475  
     476      for (n = 0; n < k; n++)
     477          fprintf(out, "%s0x%08lx%s", n == 0 || n % 5 ? "" : "    ",
     478                  (unsigned long)(table[n] >> 32),
     479                  n == k - 1 ? "" : (n % 5 == 4 ? ",\n" : ", "));
     480  }
     481  
     482  /*
     483    Write the 64-bit values in table[0..k-1] to out, three per line in
     484    hexadecimal separated by commas. This assumes that if there is a 64-bit
     485    type, then there is also a long long integer type, and it is at least 64
     486    bits. If not, then the type cast and format string can be adjusted
     487    accordingly.
     488   */
     489  local void write_table64(
     490      FILE *out,
     491      const z_word_t FAR *table,
     492      int k)
     493  {
     494      int n;
     495  
     496      for (n = 0; n < k; n++)
     497          fprintf(out, "%s0x%016llx%s", n == 0 || n % 3 ? "" : "    ",
     498                  (unsigned long long)(table[n]),
     499                  n == k - 1 ? "" : (n % 3 == 2 ? ",\n" : ", "));
     500  }
     501  
     502  /* Actually do the deed. */
     503  int main()
     504  {
     505      make_crc_table();
     506      return 0;
     507  }
     508  
     509  #endif /* MAKECRCH */
     510  
     511  #ifdef W
     512  /*
     513    Generate the little and big-endian braid tables for the given n and z_word_t
     514    size w. Each array must have room for w blocks of 256 elements.
     515   */
     516  local void braid(ltl, big, n, w)
     517      z_crc_t ltl[][256];
     518      z_word_t big[][256];
     519      int n;
     520      int w;
     521  {
     522      int k;
     523      z_crc_t i, p, q;
     524      for (k = 0; k < w; k++) {
     525          p = x2nmodp((n * w + 3 - k) << 3, 0);
     526          ltl[k][0] = 0;
     527          big[w - 1 - k][0] = 0;
     528          for (i = 1; i < 256; i++) {
     529              ltl[k][i] = q = multmodp(i << 24, p);
     530              big[w - 1 - k][i] = byte_swap(q);
     531          }
     532      }
     533  }
     534  #endif
     535  
     536  #else /* !DYNAMIC_CRC_TABLE */
     537  /* ========================================================================
     538   * Tables for byte-wise and braided CRC-32 calculations, and a table of powers
     539   * of x for combining CRC-32s, all made by make_crc_table().
     540   */
     541  #include "crc32.h"
     542  #endif /* DYNAMIC_CRC_TABLE */
     543  
     544  /* ========================================================================
     545   * Routines used for CRC calculation. Some are also required for the table
     546   * generation above.
     547   */
     548  
     549  #ifndef Z_FREETYPE
     550  
     551  /*
     552    Return a(x) multiplied by b(x) modulo p(x), where p(x) is the CRC polynomial,
     553    reflected. For speed, this requires that a not be zero.
     554   */
     555  local z_crc_t multmodp(
     556      z_crc_t a,
     557      z_crc_t b)
     558  {
     559      z_crc_t m, p;
     560  
     561      m = (z_crc_t)1 << 31;
     562      p = 0;
     563      for (;;) {
     564          if (a & m) {
     565              p ^= b;
     566              if ((a & (m - 1)) == 0)
     567                  break;
     568          }
     569          m >>= 1;
     570          b = b & 1 ? (b >> 1) ^ POLY : b >> 1;
     571      }
     572      return p;
     573  }
     574  
     575  /*
     576    Return x^(n * 2^k) modulo p(x). Requires that x2n_table[] has been
     577    initialized.
     578   */
     579  local z_crc_t x2nmodp(
     580      z_off64_t n,
     581      unsigned k)
     582  {
     583      z_crc_t p;
     584  
     585      p = (z_crc_t)1 << 31;           /* x^0 == 1 */
     586      while (n) {
     587          if (n & 1)
     588              p = multmodp(x2n_table[k & 31], p);
     589          n >>= 1;
     590          k++;
     591      }
     592      return p;
     593  }
     594  
     595  /* =========================================================================
     596   * This function can be used by asm versions of crc32(), and to force the
     597   * generation of the CRC tables in a threaded application.
     598   */
     599  const z_crc_t FAR * ZEXPORT get_crc_table()
     600  {
     601  #ifdef DYNAMIC_CRC_TABLE
     602      once(&made, make_crc_table);
     603  #endif /* DYNAMIC_CRC_TABLE */
     604      return (const z_crc_t FAR *)crc_table;
     605  }
     606  
     607  #endif  /* Z_FREETYPE */
     608  
     609  /* =========================================================================
     610   * Use ARM machine instructions if available. This will compute the CRC about
     611   * ten times faster than the braided calculation. This code does not check for
     612   * the presence of the CRC instruction at run time. __ARM_FEATURE_CRC32 will
     613   * only be defined if the compilation specifies an ARM processor architecture
     614   * that has the instructions. For example, compiling with -march=armv8.1-a or
     615   * -march=armv8-a+crc, or -march=native if the compile machine has the crc32
     616   * instructions.
     617   */
     618  #ifdef ARMCRC32
     619  
     620  /*
     621     Constants empirically determined to maximize speed. These values are from
     622     measurements on a Cortex-A57. Your mileage may vary.
     623   */
     624  #define Z_BATCH 3990                /* number of words in a batch */
     625  #define Z_BATCH_ZEROS 0xa10d3d0c    /* computed from Z_BATCH = 3990 */
     626  #define Z_BATCH_MIN 800             /* fewest words in a final batch */
     627  
     628  unsigned long ZEXPORT crc32_z(
     629      unsigned long crc,
     630      const unsigned char FAR *buf,
     631      z_size_t len)
     632  {
     633      z_crc_t val;
     634      z_word_t crc1, crc2;
     635      const z_word_t *word;
     636      z_word_t val0, val1, val2;
     637      z_size_t last, last2, i;
     638      z_size_t num;
     639  
     640      /* Return initial CRC, if requested. */
     641      if (buf == Z_NULL) return 0;
     642  
     643  #ifdef DYNAMIC_CRC_TABLE
     644      once(&made, make_crc_table);
     645  #endif /* DYNAMIC_CRC_TABLE */
     646  
     647      /* Pre-condition the CRC */
     648      crc = (~crc) & 0xffffffff;
     649  
     650      /* Compute the CRC up to a word boundary. */
     651      while (len && ((z_size_t)buf & 7) != 0) {
     652          len--;
     653          val = *buf++;
     654          __asm__ volatile("crc32b %w0, %w0, %w1" : "+r"(crc) : "r"(val));
     655      }
     656  
     657      /* Prepare to compute the CRC on full 64-bit words word[0..num-1]. */
     658      word = (z_word_t const *)buf;
     659      num = len >> 3;
     660      len &= 7;
     661  
     662      /* Do three interleaved CRCs to realize the throughput of one crc32x
     663         instruction per cycle. Each CRC is calculated on Z_BATCH words. The
     664         three CRCs are combined into a single CRC after each set of batches. */
     665      while (num >= 3 * Z_BATCH) {
     666          crc1 = 0;
     667          crc2 = 0;
     668          for (i = 0; i < Z_BATCH; i++) {
     669              val0 = word[i];
     670              val1 = word[i + Z_BATCH];
     671              val2 = word[i + 2 * Z_BATCH];
     672              __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc) : "r"(val0));
     673              __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc1) : "r"(val1));
     674              __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc2) : "r"(val2));
     675          }
     676          word += 3 * Z_BATCH;
     677          num -= 3 * Z_BATCH;
     678          crc = multmodp(Z_BATCH_ZEROS, crc) ^ crc1;
     679          crc = multmodp(Z_BATCH_ZEROS, crc) ^ crc2;
     680      }
     681  
     682      /* Do one last smaller batch with the remaining words, if there are enough
     683         to pay for the combination of CRCs. */
     684      last = num / 3;
     685      if (last >= Z_BATCH_MIN) {
     686          last2 = last << 1;
     687          crc1 = 0;
     688          crc2 = 0;
     689          for (i = 0; i < last; i++) {
     690              val0 = word[i];
     691              val1 = word[i + last];
     692              val2 = word[i + last2];
     693              __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc) : "r"(val0));
     694              __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc1) : "r"(val1));
     695              __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc2) : "r"(val2));
     696          }
     697          word += 3 * last;
     698          num -= 3 * last;
     699          val = x2nmodp(last, 6);
     700          crc = multmodp(val, crc) ^ crc1;
     701          crc = multmodp(val, crc) ^ crc2;
     702      }
     703  
     704      /* Compute the CRC on any remaining words. */
     705      for (i = 0; i < num; i++) {
     706          val0 = word[i];
     707          __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc) : "r"(val0));
     708      }
     709      word += num;
     710  
     711      /* Complete the CRC on any remaining bytes. */
     712      buf = (const unsigned char FAR *)word;
     713      while (len) {
     714          len--;
     715          val = *buf++;
     716          __asm__ volatile("crc32b %w0, %w0, %w1" : "+r"(crc) : "r"(val));
     717      }
     718  
     719      /* Return the CRC, post-conditioned. */
     720      return crc ^ 0xffffffff;
     721  }
     722  
     723  #else
     724  
     725  #ifdef W
     726  
     727  /*
     728    Return the CRC of the W bytes in the word_t data, taking the
     729    least-significant byte of the word as the first byte of data, without any pre
     730    or post conditioning. This is used to combine the CRCs of each braid.
     731   */
     732  local z_crc_t crc_word(
     733      z_word_t data)
     734  {
     735      int k;
     736      for (k = 0; k < W; k++)
     737          data = (data >> 8) ^ crc_table[data & 0xff];
     738      return (z_crc_t)data;
     739  }
     740  
     741  local z_word_t crc_word_big(
     742      z_word_t data)
     743  {
     744      int k;
     745      for (k = 0; k < W; k++)
     746          data = (data << 8) ^
     747              crc_big_table[(data >> ((W - 1) << 3)) & 0xff];
     748      return data;
     749  }
     750  
     751  #endif
     752  
     753  /* ========================================================================= */
     754  unsigned long ZEXPORT crc32_z(
     755      unsigned long crc,
     756      const unsigned char FAR *buf,
     757      z_size_t len)
     758  {
     759      /* Return initial CRC, if requested. */
     760      if (buf == Z_NULL) return 0;
     761  
     762  #ifdef DYNAMIC_CRC_TABLE
     763      once(&made, make_crc_table);
     764  #endif /* DYNAMIC_CRC_TABLE */
     765  
     766      /* Pre-condition the CRC */
     767      crc = (~crc) & 0xffffffff;
     768  
     769  #ifdef W
     770  
     771      /* If provided enough bytes, do a braided CRC calculation. */
     772      if (len >= N * W + W - 1) {
     773          z_size_t blks;
     774          z_word_t const *words;
     775          unsigned endian;
     776          int k;
     777  
     778          /* Compute the CRC up to a z_word_t boundary. */
     779          while (len && ((z_size_t)buf & (W - 1)) != 0) {
     780              len--;
     781              crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
     782          }
     783  
     784          /* Compute the CRC on as many N z_word_t blocks as are available. */
     785          blks = len / (N * W);
     786          len -= blks * N * W;
     787          words = (z_word_t const *)buf;
     788  
     789          /* Do endian check at execution time instead of compile time, since ARM
     790             processors can change the endianess at execution time. If the
     791             compiler knows what the endianess will be, it can optimize out the
     792             check and the unused branch. */
     793          endian = 1;
     794          if (*(unsigned char *)&endian) {
     795              /* Little endian. */
     796  
     797              z_crc_t crc0;
     798              z_word_t word0;
     799  #if N > 1
     800              z_crc_t crc1;
     801              z_word_t word1;
     802  #if N > 2
     803              z_crc_t crc2;
     804              z_word_t word2;
     805  #if N > 3
     806              z_crc_t crc3;
     807              z_word_t word3;
     808  #if N > 4
     809              z_crc_t crc4;
     810              z_word_t word4;
     811  #if N > 5
     812              z_crc_t crc5;
     813              z_word_t word5;
     814  #endif
     815  #endif
     816  #endif
     817  #endif
     818  #endif
     819  
     820              /* Initialize the CRC for each braid. */
     821              crc0 = crc;
     822  #if N > 1
     823              crc1 = 0;
     824  #if N > 2
     825              crc2 = 0;
     826  #if N > 3
     827              crc3 = 0;
     828  #if N > 4
     829              crc4 = 0;
     830  #if N > 5
     831              crc5 = 0;
     832  #endif
     833  #endif
     834  #endif
     835  #endif
     836  #endif
     837  
     838              /*
     839                Process the first blks-1 blocks, computing the CRCs on each braid
     840                independently.
     841               */
     842              while (--blks) {
     843                  /* Load the word for each braid into registers. */
     844                  word0 = crc0 ^ words[0];
     845  #if N > 1
     846                  word1 = crc1 ^ words[1];
     847  #if N > 2
     848                  word2 = crc2 ^ words[2];
     849  #if N > 3
     850                  word3 = crc3 ^ words[3];
     851  #if N > 4
     852                  word4 = crc4 ^ words[4];
     853  #if N > 5
     854                  word5 = crc5 ^ words[5];
     855  #endif
     856  #endif
     857  #endif
     858  #endif
     859  #endif
     860                  words += N;
     861  
     862                  /* Compute and update the CRC for each word. The loop should
     863                     get unrolled. */
     864                  crc0 = crc_braid_table[0][word0 & 0xff];
     865  #if N > 1
     866                  crc1 = crc_braid_table[0][word1 & 0xff];
     867  #if N > 2
     868                  crc2 = crc_braid_table[0][word2 & 0xff];
     869  #if N > 3
     870                  crc3 = crc_braid_table[0][word3 & 0xff];
     871  #if N > 4
     872                  crc4 = crc_braid_table[0][word4 & 0xff];
     873  #if N > 5
     874                  crc5 = crc_braid_table[0][word5 & 0xff];
     875  #endif
     876  #endif
     877  #endif
     878  #endif
     879  #endif
     880                  for (k = 1; k < W; k++) {
     881                      crc0 ^= crc_braid_table[k][(word0 >> (k << 3)) & 0xff];
     882  #if N > 1
     883                      crc1 ^= crc_braid_table[k][(word1 >> (k << 3)) & 0xff];
     884  #if N > 2
     885                      crc2 ^= crc_braid_table[k][(word2 >> (k << 3)) & 0xff];
     886  #if N > 3
     887                      crc3 ^= crc_braid_table[k][(word3 >> (k << 3)) & 0xff];
     888  #if N > 4
     889                      crc4 ^= crc_braid_table[k][(word4 >> (k << 3)) & 0xff];
     890  #if N > 5
     891                      crc5 ^= crc_braid_table[k][(word5 >> (k << 3)) & 0xff];
     892  #endif
     893  #endif
     894  #endif
     895  #endif
     896  #endif
     897                  }
     898              }
     899  
     900              /*
     901                Process the last block, combining the CRCs of the N braids at the
     902                same time.
     903               */
     904              crc = crc_word(crc0 ^ words[0]);
     905  #if N > 1
     906              crc = crc_word(crc1 ^ words[1] ^ crc);
     907  #if N > 2
     908              crc = crc_word(crc2 ^ words[2] ^ crc);
     909  #if N > 3
     910              crc = crc_word(crc3 ^ words[3] ^ crc);
     911  #if N > 4
     912              crc = crc_word(crc4 ^ words[4] ^ crc);
     913  #if N > 5
     914              crc = crc_word(crc5 ^ words[5] ^ crc);
     915  #endif
     916  #endif
     917  #endif
     918  #endif
     919  #endif
     920              words += N;
     921          }
     922          else {
     923              /* Big endian. */
     924  
     925              z_word_t crc0, word0, comb;
     926  #if N > 1
     927              z_word_t crc1, word1;
     928  #if N > 2
     929              z_word_t crc2, word2;
     930  #if N > 3
     931              z_word_t crc3, word3;
     932  #if N > 4
     933              z_word_t crc4, word4;
     934  #if N > 5
     935              z_word_t crc5, word5;
     936  #endif
     937  #endif
     938  #endif
     939  #endif
     940  #endif
     941  
     942              /* Initialize the CRC for each braid. */
     943              crc0 = byte_swap(crc);
     944  #if N > 1
     945              crc1 = 0;
     946  #if N > 2
     947              crc2 = 0;
     948  #if N > 3
     949              crc3 = 0;
     950  #if N > 4
     951              crc4 = 0;
     952  #if N > 5
     953              crc5 = 0;
     954  #endif
     955  #endif
     956  #endif
     957  #endif
     958  #endif
     959  
     960              /*
     961                Process the first blks-1 blocks, computing the CRCs on each braid
     962                independently.
     963               */
     964              while (--blks) {
     965                  /* Load the word for each braid into registers. */
     966                  word0 = crc0 ^ words[0];
     967  #if N > 1
     968                  word1 = crc1 ^ words[1];
     969  #if N > 2
     970                  word2 = crc2 ^ words[2];
     971  #if N > 3
     972                  word3 = crc3 ^ words[3];
     973  #if N > 4
     974                  word4 = crc4 ^ words[4];
     975  #if N > 5
     976                  word5 = crc5 ^ words[5];
     977  #endif
     978  #endif
     979  #endif
     980  #endif
     981  #endif
     982                  words += N;
     983  
     984                  /* Compute and update the CRC for each word. The loop should
     985                     get unrolled. */
     986                  crc0 = crc_braid_big_table[0][word0 & 0xff];
     987  #if N > 1
     988                  crc1 = crc_braid_big_table[0][word1 & 0xff];
     989  #if N > 2
     990                  crc2 = crc_braid_big_table[0][word2 & 0xff];
     991  #if N > 3
     992                  crc3 = crc_braid_big_table[0][word3 & 0xff];
     993  #if N > 4
     994                  crc4 = crc_braid_big_table[0][word4 & 0xff];
     995  #if N > 5
     996                  crc5 = crc_braid_big_table[0][word5 & 0xff];
     997  #endif
     998  #endif
     999  #endif
    1000  #endif
    1001  #endif
    1002                  for (k = 1; k < W; k++) {
    1003                      crc0 ^= crc_braid_big_table[k][(word0 >> (k << 3)) & 0xff];
    1004  #if N > 1
    1005                      crc1 ^= crc_braid_big_table[k][(word1 >> (k << 3)) & 0xff];
    1006  #if N > 2
    1007                      crc2 ^= crc_braid_big_table[k][(word2 >> (k << 3)) & 0xff];
    1008  #if N > 3
    1009                      crc3 ^= crc_braid_big_table[k][(word3 >> (k << 3)) & 0xff];
    1010  #if N > 4
    1011                      crc4 ^= crc_braid_big_table[k][(word4 >> (k << 3)) & 0xff];
    1012  #if N > 5
    1013                      crc5 ^= crc_braid_big_table[k][(word5 >> (k << 3)) & 0xff];
    1014  #endif
    1015  #endif
    1016  #endif
    1017  #endif
    1018  #endif
    1019                  }
    1020              }
    1021  
    1022              /*
    1023                Process the last block, combining the CRCs of the N braids at the
    1024                same time.
    1025               */
    1026              comb = crc_word_big(crc0 ^ words[0]);
    1027  #if N > 1
    1028              comb = crc_word_big(crc1 ^ words[1] ^ comb);
    1029  #if N > 2
    1030              comb = crc_word_big(crc2 ^ words[2] ^ comb);
    1031  #if N > 3
    1032              comb = crc_word_big(crc3 ^ words[3] ^ comb);
    1033  #if N > 4
    1034              comb = crc_word_big(crc4 ^ words[4] ^ comb);
    1035  #if N > 5
    1036              comb = crc_word_big(crc5 ^ words[5] ^ comb);
    1037  #endif
    1038  #endif
    1039  #endif
    1040  #endif
    1041  #endif
    1042              words += N;
    1043              crc = byte_swap(comb);
    1044          }
    1045  
    1046          /*
    1047            Update the pointer to the remaining bytes to process.
    1048           */
    1049          buf = (unsigned char const *)words;
    1050      }
    1051  
    1052  #endif /* W */
    1053  
    1054      /* Complete the computation of the CRC on any remaining bytes. */
    1055      while (len >= 8) {
    1056          len -= 8;
    1057          crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
    1058          crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
    1059          crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
    1060          crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
    1061          crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
    1062          crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
    1063          crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
    1064          crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
    1065      }
    1066      while (len) {
    1067          len--;
    1068          crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
    1069      }
    1070  
    1071      /* Return the CRC, post-conditioned. */
    1072      return crc ^ 0xffffffff;
    1073  }
    1074  
    1075  #endif
    1076  
    1077  /* ========================================================================= */
    1078  unsigned long ZEXPORT crc32(
    1079      unsigned long crc,
    1080      const unsigned char FAR *buf,
    1081      uInt len)
    1082  {
    1083      return crc32_z(crc, buf, len);
    1084  }
    1085  
    1086  #ifndef Z_FREETYPE
    1087  
    1088  /* ========================================================================= */
    1089  uLong ZEXPORT crc32_combine64(
    1090      uLong crc1,
    1091      uLong crc2,
    1092      z_off64_t len2)
    1093  {
    1094  #ifdef DYNAMIC_CRC_TABLE
    1095      once(&made, make_crc_table);
    1096  #endif /* DYNAMIC_CRC_TABLE */
    1097      return multmodp(x2nmodp(len2, 3), crc1) ^ (crc2 & 0xffffffff);
    1098  }
    1099  
    1100  /* ========================================================================= */
    1101  uLong ZEXPORT crc32_combine(
    1102      uLong crc1,
    1103      uLong crc2,
    1104      z_off_t len2)
    1105  {
    1106      return crc32_combine64(crc1, crc2, (z_off64_t)len2);
    1107  }
    1108  
    1109  /* ========================================================================= */
    1110  uLong ZEXPORT crc32_combine_gen64(
    1111      z_off64_t len2)
    1112  {
    1113  #ifdef DYNAMIC_CRC_TABLE
    1114      once(&made, make_crc_table);
    1115  #endif /* DYNAMIC_CRC_TABLE */
    1116      return x2nmodp(len2, 3);
    1117  }
    1118  
    1119  /* ========================================================================= */
    1120  uLong ZEXPORT crc32_combine_gen(
    1121      z_off_t len2)
    1122  {
    1123      return crc32_combine_gen64((z_off64_t)len2);
    1124  }
    1125  
    1126  /* ========================================================================= */
    1127  uLong ZEXPORT crc32_combine_op(
    1128      uLong crc1,
    1129      uLong crc2,
    1130      uLong op)
    1131  {
    1132      return multmodp(op, crc1) ^ (crc2 & 0xffffffff);
    1133  }
    1134  
    1135  #endif  /* Z_FREETYPE */