1  // Copyright (C) 2010-2023 Free Software Foundation, Inc.
       2  //
       3  // This file is part of the GNU ISO C++ Library.  This library is free
       4  // software; you can redistribute it and/or modify it under the
       5  // terms of the GNU General Public License as published by the
       6  // Free Software Foundation; either version 3, or (at your option)
       7  // any later version.
       8  //
       9  // This library is distributed in the hope that it will be useful,
      10  // but WITHOUT ANY WARRANTY; without even the implied warranty of
      11  // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      12  // GNU General Public License for more details.
      13  //
      14  // You should have received a copy of the GNU General Public License
      15  // along with this library; see the file COPYING3.  If not see
      16  // <http://www.gnu.org/licenses/>.
      17  
      18  // This file uses the chi^2 test to measure the quality of a hash
      19  // function, by computing the uniformity with which it distributes a set
      20  // of N strings into k buckets (where k is significantly greater than N).
      21  //
      22  // Each bucket has B[i] strings in it. The expected value of each bucket
      23  // for a uniform distribution is z = N/k, so
      24  //   chi^2 = Sum_i (B[i] - z)^2 / z.
      25  //
      26  // We check whether chi^2 is small enough to be consistent with the
      27  // hypothesis of a uniform distribution. If F(chi^2, k-1) is close to
      28  // 0 (where F is the cumulative probability distribution), we can
      29  // reject that hypothesis. So we don't want F to be too small, which
      30  // for large k, means we want chi^2 to be not too much larger than k.
      31  //
      32  // We use the chi^2 test for several sets of strings. Any non-horrible
      33  // hash function should do well with purely random strings. A really
      34  // good hash function will also do well with more structured sets,
      35  // including ones where the strings differ by only a few bits.
      36  
      37  #include <algorithm>
      38  #include <cstdlib>
      39  #include <cstdio>
      40  #include <fstream>
      41  #include <functional>
      42  #include <iostream>
      43  #include <iterator>
      44  #include <string>
      45  #include <unordered_set>
      46  #include <vector>
      47  #include <testsuite_hooks.h>
      48  
      49  #ifndef SAMPLES
      50  #define SAMPLES 300000
      51  #endif
      52  
      53  template <typename Container>
      54    double
      55    chi2_hash(const Container& c, long buckets)
      56    {
      57      std::vector<int> counts(buckets);
      58      std::hash<std::string> hasher;
      59      double elements = 0;
      60      for (auto i = c.begin(); i != c.end(); ++i)
      61        {
      62          ++counts[hasher(*i) % buckets];
      63          ++elements;
      64        }
      65  
      66      const double z = elements / buckets;
      67      double sum = 0;
      68      for (long i = 0; i < buckets; ++i)
      69        {
      70          double delta = counts[i] - z;
      71          sum += delta*delta;
      72        }
      73      return sum/z;
      74    }