(root)/
Python-3.12.0/
Python/
pyhash.c
       1  /* Set of hash utility functions to help maintaining the invariant that
       2      if a==b then hash(a)==hash(b)
       3  
       4     All the utility functions (_Py_Hash*()) return "-1" to signify an error.
       5  */
       6  #include "Python.h"
       7  
       8  #ifdef __APPLE__
       9  #  include <libkern/OSByteOrder.h>
      10  #elif defined(HAVE_LE64TOH) && defined(HAVE_ENDIAN_H)
      11  #  include <endian.h>
      12  #elif defined(HAVE_LE64TOH) && defined(HAVE_SYS_ENDIAN_H)
      13  #  include <sys/endian.h>
      14  #endif
      15  
      16  #ifdef __cplusplus
      17  extern "C" {
      18  #endif
      19  
      20  _Py_HashSecret_t _Py_HashSecret = {{0}};
      21  
      22  #if Py_HASH_ALGORITHM == Py_HASH_EXTERNAL
      23  extern PyHash_FuncDef PyHash_Func;
      24  #else
      25  static PyHash_FuncDef PyHash_Func;
      26  #endif
      27  
      28  /* Count _Py_HashBytes() calls */
      29  #ifdef Py_HASH_STATS
      30  #define Py_HASH_STATS_MAX 32
      31  static Py_ssize_t hashstats[Py_HASH_STATS_MAX + 1] = {0};
      32  #endif
      33  
      34  /* For numeric types, the hash of a number x is based on the reduction
      35     of x modulo the prime P = 2**_PyHASH_BITS - 1.  It's designed so that
      36     hash(x) == hash(y) whenever x and y are numerically equal, even if
      37     x and y have different types.
      38  
      39     A quick summary of the hashing strategy:
      40  
      41     (1) First define the 'reduction of x modulo P' for any rational
      42     number x; this is a standard extension of the usual notion of
      43     reduction modulo P for integers.  If x == p/q (written in lowest
      44     terms), the reduction is interpreted as the reduction of p times
      45     the inverse of the reduction of q, all modulo P; if q is exactly
      46     divisible by P then define the reduction to be infinity.  So we've
      47     got a well-defined map
      48  
      49        reduce : { rational numbers } -> { 0, 1, 2, ..., P-1, infinity }.
      50  
      51     (2) Now for a rational number x, define hash(x) by:
      52  
      53        reduce(x)   if x >= 0
      54        -reduce(-x) if x < 0
      55  
      56     If the result of the reduction is infinity (this is impossible for
      57     integers, floats and Decimals) then use the predefined hash value
      58     _PyHASH_INF for x >= 0, or -_PyHASH_INF for x < 0, instead.
      59     _PyHASH_INF and -_PyHASH_INF are also used for the
      60     hashes of float and Decimal infinities.
      61  
      62     NaNs hash with a pointer hash.  Having distinct hash values prevents
      63     catastrophic pileups from distinct NaN instances which used to always
      64     have the same hash value but would compare unequal.
      65  
      66     A selling point for the above strategy is that it makes it possible
      67     to compute hashes of decimal and binary floating-point numbers
      68     efficiently, even if the exponent of the binary or decimal number
      69     is large.  The key point is that
      70  
      71        reduce(x * y) == reduce(x) * reduce(y) (modulo _PyHASH_MODULUS)
      72  
      73     provided that {reduce(x), reduce(y)} != {0, infinity}.  The reduction of a
      74     binary or decimal float is never infinity, since the denominator is a power
      75     of 2 (for binary) or a divisor of a power of 10 (for decimal).  So we have,
      76     for nonnegative x,
      77  
      78        reduce(x * 2**e) == reduce(x) * reduce(2**e) % _PyHASH_MODULUS
      79  
      80        reduce(x * 10**e) == reduce(x) * reduce(10**e) % _PyHASH_MODULUS
      81  
      82     and reduce(10**e) can be computed efficiently by the usual modular
      83     exponentiation algorithm.  For reduce(2**e) it's even better: since
      84     P is of the form 2**n-1, reduce(2**e) is 2**(e mod n), and multiplication
      85     by 2**(e mod n) modulo 2**n-1 just amounts to a rotation of bits.
      86  
      87     */
      88  
      89  Py_hash_t _Py_HashPointer(const void *);
      90  
      91  Py_hash_t
      92  _Py_HashDouble(PyObject *inst, double v)
      93  {
      94      int e, sign;
      95      double m;
      96      Py_uhash_t x, y;
      97  
      98      if (!Py_IS_FINITE(v)) {
      99          if (Py_IS_INFINITY(v))
     100              return v > 0 ? _PyHASH_INF : -_PyHASH_INF;
     101          else
     102              return _Py_HashPointer(inst);
     103      }
     104  
     105      m = frexp(v, &e);
     106  
     107      sign = 1;
     108      if (m < 0) {
     109          sign = -1;
     110          m = -m;
     111      }
     112  
     113      /* process 28 bits at a time;  this should work well both for binary
     114         and hexadecimal floating point. */
     115      x = 0;
     116      while (m) {
     117          x = ((x << 28) & _PyHASH_MODULUS) | x >> (_PyHASH_BITS - 28);
     118          m *= 268435456.0;  /* 2**28 */
     119          e -= 28;
     120          y = (Py_uhash_t)m;  /* pull out integer part */
     121          m -= y;
     122          x += y;
     123          if (x >= _PyHASH_MODULUS)
     124              x -= _PyHASH_MODULUS;
     125      }
     126  
     127      /* adjust for the exponent;  first reduce it modulo _PyHASH_BITS */
     128      e = e >= 0 ? e % _PyHASH_BITS : _PyHASH_BITS-1-((-1-e) % _PyHASH_BITS);
     129      x = ((x << e) & _PyHASH_MODULUS) | x >> (_PyHASH_BITS - e);
     130  
     131      x = x * sign;
     132      if (x == (Py_uhash_t)-1)
     133          x = (Py_uhash_t)-2;
     134      return (Py_hash_t)x;
     135  }
     136  
     137  Py_hash_t
     138  _Py_HashPointerRaw(const void *p)
     139  {
     140      size_t y = (size_t)p;
     141      /* bottom 3 or 4 bits are likely to be 0; rotate y by 4 to avoid
     142         excessive hash collisions for dicts and sets */
     143      y = (y >> 4) | (y << (8 * SIZEOF_VOID_P - 4));
     144      return (Py_hash_t)y;
     145  }
     146  
     147  Py_hash_t
     148  _Py_HashPointer(const void *p)
     149  {
     150      Py_hash_t x = _Py_HashPointerRaw(p);
     151      if (x == -1) {
     152          x = -2;
     153      }
     154      return x;
     155  }
     156  
     157  Py_hash_t
     158  _Py_HashBytes(const void *src, Py_ssize_t len)
     159  {
     160      Py_hash_t x;
     161      /*
     162        We make the hash of the empty string be 0, rather than using
     163        (prefix ^ suffix), since this slightly obfuscates the hash secret
     164      */
     165      if (len == 0) {
     166          return 0;
     167      }
     168  
     169  #ifdef Py_HASH_STATS
     170      hashstats[(len <= Py_HASH_STATS_MAX) ? len : 0]++;
     171  #endif
     172  
     173  #if Py_HASH_CUTOFF > 0
     174      if (len < Py_HASH_CUTOFF) {
     175          /* Optimize hashing of very small strings with inline DJBX33A. */
     176          Py_uhash_t hash;
     177          const unsigned char *p = src;
     178          hash = 5381; /* DJBX33A starts with 5381 */
     179  
     180          switch(len) {
     181              /* ((hash << 5) + hash) + *p == hash * 33 + *p */
     182              case 7: hash = ((hash << 5) + hash) + *p++; /* fallthrough */
     183              case 6: hash = ((hash << 5) + hash) + *p++; /* fallthrough */
     184              case 5: hash = ((hash << 5) + hash) + *p++; /* fallthrough */
     185              case 4: hash = ((hash << 5) + hash) + *p++; /* fallthrough */
     186              case 3: hash = ((hash << 5) + hash) + *p++; /* fallthrough */
     187              case 2: hash = ((hash << 5) + hash) + *p++; /* fallthrough */
     188              case 1: hash = ((hash << 5) + hash) + *p++; break;
     189              default:
     190                  Py_UNREACHABLE();
     191          }
     192          hash ^= len;
     193          hash ^= (Py_uhash_t) _Py_HashSecret.djbx33a.suffix;
     194          x = (Py_hash_t)hash;
     195      }
     196      else
     197  #endif /* Py_HASH_CUTOFF */
     198          x = PyHash_Func.hash(src, len);
     199  
     200      if (x == -1)
     201          return -2;
     202      return x;
     203  }
     204  
     205  void
     206  _PyHash_Fini(void)
     207  {
     208  #ifdef Py_HASH_STATS
     209      fprintf(stderr, "len   calls    total\n");
     210      Py_ssize_t total = 0;
     211      for (int i = 1; i <= Py_HASH_STATS_MAX; i++) {
     212          total += hashstats[i];
     213          fprintf(stderr, "%2i %8zd %8zd\n", i, hashstats[i], total);
     214      }
     215      total += hashstats[0];
     216      fprintf(stderr, ">  %8zd %8zd\n", hashstats[0], total);
     217  #endif
     218  }
     219  
     220  PyHash_FuncDef *
     221  PyHash_GetFuncDef(void)
     222  {
     223      return &PyHash_Func;
     224  }
     225  
     226  /* Optimized memcpy() for Windows */
     227  #ifdef _MSC_VER
     228  #  if SIZEOF_PY_UHASH_T == 4
     229  #    define PY_UHASH_CPY(dst, src) do {                                    \
     230         dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; \
     231         } while(0)
     232  #  elif SIZEOF_PY_UHASH_T == 8
     233  #    define PY_UHASH_CPY(dst, src) do {                                    \
     234         dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; \
     235         dst[4] = src[4]; dst[5] = src[5]; dst[6] = src[6]; dst[7] = src[7]; \
     236         } while(0)
     237  #  else
     238  #    error SIZEOF_PY_UHASH_T must be 4 or 8
     239  #  endif /* SIZEOF_PY_UHASH_T */
     240  #else /* not Windows */
     241  #  define PY_UHASH_CPY(dst, src) memcpy(dst, src, SIZEOF_PY_UHASH_T)
     242  #endif /* _MSC_VER */
     243  
     244  
     245  #if Py_HASH_ALGORITHM == Py_HASH_FNV
     246  /* **************************************************************************
     247   * Modified Fowler-Noll-Vo (FNV) hash function
     248   */
     249  static Py_hash_t
     250  fnv(const void *src, Py_ssize_t len)
     251  {
     252      const unsigned char *p = src;
     253      Py_uhash_t x;
     254      Py_ssize_t remainder, blocks;
     255      union {
     256          Py_uhash_t value;
     257          unsigned char bytes[SIZEOF_PY_UHASH_T];
     258      } block;
     259  
     260  #ifdef Py_DEBUG
     261      assert(_Py_HashSecret_Initialized);
     262  #endif
     263      remainder = len % SIZEOF_PY_UHASH_T;
     264      if (remainder == 0) {
     265          /* Process at least one block byte by byte to reduce hash collisions
     266           * for strings with common prefixes. */
     267          remainder = SIZEOF_PY_UHASH_T;
     268      }
     269      blocks = (len - remainder) / SIZEOF_PY_UHASH_T;
     270  
     271      x = (Py_uhash_t) _Py_HashSecret.fnv.prefix;
     272      x ^= (Py_uhash_t) *p << 7;
     273      while (blocks--) {
     274          PY_UHASH_CPY(block.bytes, p);
     275          x = (_PyHASH_MULTIPLIER * x) ^ block.value;
     276          p += SIZEOF_PY_UHASH_T;
     277      }
     278      /* add remainder */
     279      for (; remainder > 0; remainder--)
     280          x = (_PyHASH_MULTIPLIER * x) ^ (Py_uhash_t) *p++;
     281      x ^= (Py_uhash_t) len;
     282      x ^= (Py_uhash_t) _Py_HashSecret.fnv.suffix;
     283      if (x == (Py_uhash_t) -1) {
     284          x = (Py_uhash_t) -2;
     285      }
     286      return x;
     287  }
     288  
     289  static PyHash_FuncDef PyHash_Func = {fnv, "fnv", 8 * SIZEOF_PY_HASH_T,
     290                                       16 * SIZEOF_PY_HASH_T};
     291  
     292  #endif /* Py_HASH_ALGORITHM == Py_HASH_FNV */
     293  
     294  
     295  /* **************************************************************************
     296   <MIT License>
     297   Copyright (c) 2013  Marek Majkowski <marek@popcount.org>
     298  
     299   Permission is hereby granted, free of charge, to any person obtaining a copy
     300   of this software and associated documentation files (the "Software"), to deal
     301   in the Software without restriction, including without limitation the rights
     302   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
     303   copies of the Software, and to permit persons to whom the Software is
     304   furnished to do so, subject to the following conditions:
     305  
     306   The above copyright notice and this permission notice shall be included in
     307   all copies or substantial portions of the Software.
     308  
     309   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     310   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     311   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
     312   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     313   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
     314   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
     315   THE SOFTWARE.
     316   </MIT License>
     317  
     318   Original location:
     319      https://github.com/majek/csiphash/
     320  
     321   Solution inspired by code from:
     322      Samuel Neves (supercop/crypto_auth/siphash24/little)
     323      djb (supercop/crypto_auth/siphash24/little2)
     324      Jean-Philippe Aumasson (https://131002.net/siphash/siphash24.c)
     325  
     326   Modified for Python by Christian Heimes:
     327      - C89 / MSVC compatibility
     328      - _rotl64() on Windows
     329      - letoh64() fallback
     330  */
     331  
     332  /* byte swap little endian to host endian
     333   * Endian conversion not only ensures that the hash function returns the same
     334   * value on all platforms. It is also required to for a good dispersion of
     335   * the hash values' least significant bits.
     336   */
     337  #if PY_LITTLE_ENDIAN
     338  #  define _le64toh(x) ((uint64_t)(x))
     339  #elif defined(__APPLE__)
     340  #  define _le64toh(x) OSSwapLittleToHostInt64(x)
     341  #elif defined(HAVE_LETOH64)
     342  #  define _le64toh(x) le64toh(x)
     343  #else
     344  #  define _le64toh(x) (((uint64_t)(x) << 56) | \
     345                        (((uint64_t)(x) << 40) & 0xff000000000000ULL) | \
     346                        (((uint64_t)(x) << 24) & 0xff0000000000ULL) | \
     347                        (((uint64_t)(x) << 8)  & 0xff00000000ULL) | \
     348                        (((uint64_t)(x) >> 8)  & 0xff000000ULL) | \
     349                        (((uint64_t)(x) >> 24) & 0xff0000ULL) | \
     350                        (((uint64_t)(x) >> 40) & 0xff00ULL) | \
     351                        ((uint64_t)(x)  >> 56))
     352  #endif
     353  
     354  
     355  #ifdef _MSC_VER
     356  #  define ROTATE(x, b)  _rotl64(x, b)
     357  #else
     358  #  define ROTATE(x, b) (uint64_t)( ((x) << (b)) | ( (x) >> (64 - (b))) )
     359  #endif
     360  
     361  #define HALF_ROUND(a,b,c,d,s,t)     \
     362      a += b; c += d;                 \
     363      b = ROTATE(b, s) ^ a;           \
     364      d = ROTATE(d, t) ^ c;           \
     365      a = ROTATE(a, 32);
     366  
     367  #define SINGLE_ROUND(v0,v1,v2,v3)   \
     368      HALF_ROUND(v0,v1,v2,v3,13,16);  \
     369      HALF_ROUND(v2,v1,v0,v3,17,21);
     370  
     371  #define DOUBLE_ROUND(v0,v1,v2,v3)   \
     372      SINGLE_ROUND(v0,v1,v2,v3);      \
     373      SINGLE_ROUND(v0,v1,v2,v3);
     374  
     375  
     376  static uint64_t
     377  siphash13(uint64_t k0, uint64_t k1, const void *src, Py_ssize_t src_sz) {
     378      uint64_t b = (uint64_t)src_sz << 56;
     379      const uint8_t *in = (const uint8_t*)src;
     380  
     381      uint64_t v0 = k0 ^ 0x736f6d6570736575ULL;
     382      uint64_t v1 = k1 ^ 0x646f72616e646f6dULL;
     383      uint64_t v2 = k0 ^ 0x6c7967656e657261ULL;
     384      uint64_t v3 = k1 ^ 0x7465646279746573ULL;
     385  
     386      uint64_t t;
     387      uint8_t *pt;
     388  
     389      while (src_sz >= 8) {
     390          uint64_t mi;
     391          memcpy(&mi, in, sizeof(mi));
     392          mi = _le64toh(mi);
     393          in += sizeof(mi);
     394          src_sz -= sizeof(mi);
     395          v3 ^= mi;
     396          SINGLE_ROUND(v0,v1,v2,v3);
     397          v0 ^= mi;
     398      }
     399  
     400      t = 0;
     401      pt = (uint8_t *)&t;
     402      switch (src_sz) {
     403          case 7: pt[6] = in[6]; /* fall through */
     404          case 6: pt[5] = in[5]; /* fall through */
     405          case 5: pt[4] = in[4]; /* fall through */
     406          case 4: memcpy(pt, in, sizeof(uint32_t)); break;
     407          case 3: pt[2] = in[2]; /* fall through */
     408          case 2: pt[1] = in[1]; /* fall through */
     409          case 1: pt[0] = in[0]; /* fall through */
     410      }
     411      b |= _le64toh(t);
     412  
     413      v3 ^= b;
     414      SINGLE_ROUND(v0,v1,v2,v3);
     415      v0 ^= b;
     416      v2 ^= 0xff;
     417      SINGLE_ROUND(v0,v1,v2,v3);
     418      SINGLE_ROUND(v0,v1,v2,v3);
     419      SINGLE_ROUND(v0,v1,v2,v3);
     420  
     421      /* modified */
     422      t = (v0 ^ v1) ^ (v2 ^ v3);
     423      return t;
     424  }
     425  
     426  #if Py_HASH_ALGORITHM == Py_HASH_SIPHASH24
     427  static uint64_t
     428  siphash24(uint64_t k0, uint64_t k1, const void *src, Py_ssize_t src_sz) {
     429      uint64_t b = (uint64_t)src_sz << 56;
     430      const uint8_t *in = (const uint8_t*)src;
     431  
     432      uint64_t v0 = k0 ^ 0x736f6d6570736575ULL;
     433      uint64_t v1 = k1 ^ 0x646f72616e646f6dULL;
     434      uint64_t v2 = k0 ^ 0x6c7967656e657261ULL;
     435      uint64_t v3 = k1 ^ 0x7465646279746573ULL;
     436  
     437      uint64_t t;
     438      uint8_t *pt;
     439  
     440      while (src_sz >= 8) {
     441          uint64_t mi;
     442          memcpy(&mi, in, sizeof(mi));
     443          mi = _le64toh(mi);
     444          in += sizeof(mi);
     445          src_sz -= sizeof(mi);
     446          v3 ^= mi;
     447          DOUBLE_ROUND(v0,v1,v2,v3);
     448          v0 ^= mi;
     449      }
     450  
     451      t = 0;
     452      pt = (uint8_t *)&t;
     453      switch (src_sz) {
     454          case 7: pt[6] = in[6]; /* fall through */
     455          case 6: pt[5] = in[5]; /* fall through */
     456          case 5: pt[4] = in[4]; /* fall through */
     457          case 4: memcpy(pt, in, sizeof(uint32_t)); break;
     458          case 3: pt[2] = in[2]; /* fall through */
     459          case 2: pt[1] = in[1]; /* fall through */
     460          case 1: pt[0] = in[0]; /* fall through */
     461      }
     462      b |= _le64toh(t);
     463  
     464      v3 ^= b;
     465      DOUBLE_ROUND(v0,v1,v2,v3);
     466      v0 ^= b;
     467      v2 ^= 0xff;
     468      DOUBLE_ROUND(v0,v1,v2,v3);
     469      DOUBLE_ROUND(v0,v1,v2,v3);
     470  
     471      /* modified */
     472      t = (v0 ^ v1) ^ (v2 ^ v3);
     473      return t;
     474  }
     475  #endif
     476  
     477  uint64_t
     478  _Py_KeyedHash(uint64_t key, const void *src, Py_ssize_t src_sz)
     479  {
     480      return siphash13(key, 0, src, src_sz);
     481  }
     482  
     483  
     484  #if Py_HASH_ALGORITHM == Py_HASH_SIPHASH13
     485  static Py_hash_t
     486  pysiphash(const void *src, Py_ssize_t src_sz) {
     487      return (Py_hash_t)siphash13(
     488          _le64toh(_Py_HashSecret.siphash.k0), _le64toh(_Py_HashSecret.siphash.k1),
     489          src, src_sz);
     490  }
     491  
     492  static PyHash_FuncDef PyHash_Func = {pysiphash, "siphash13", 64, 128};
     493  #endif
     494  
     495  #if Py_HASH_ALGORITHM == Py_HASH_SIPHASH24
     496  static Py_hash_t
     497  pysiphash(const void *src, Py_ssize_t src_sz) {
     498      return (Py_hash_t)siphash24(
     499          _le64toh(_Py_HashSecret.siphash.k0), _le64toh(_Py_HashSecret.siphash.k1),
     500          src, src_sz);
     501  }
     502  
     503  static PyHash_FuncDef PyHash_Func = {pysiphash, "siphash24", 64, 128};
     504  #endif
     505  
     506  #ifdef __cplusplus
     507  }
     508  #endif