1  /* mpz_bin_uiui - compute n over k.
       2  
       3  Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
       4  
       5  Copyright 2010-2012, 2015-2018, 2020 Free Software Foundation, Inc.
       6  
       7  This file is part of the GNU MP Library.
       8  
       9  The GNU MP Library is free software; you can redistribute it and/or modify
      10  it under the terms of either:
      11  
      12    * the GNU Lesser General Public License as published by the Free
      13      Software Foundation; either version 3 of the License, or (at your
      14      option) any later version.
      15  
      16  or
      17  
      18    * the GNU General Public License as published by the Free Software
      19      Foundation; either version 2 of the License, or (at your option) any
      20      later version.
      21  
      22  or both in parallel, as here.
      23  
      24  The GNU MP Library is distributed in the hope that it will be useful, but
      25  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
      26  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
      27  for more details.
      28  
      29  You should have received copies of the GNU General Public License and the
      30  GNU Lesser General Public License along with the GNU MP Library.  If not,
      31  see https://www.gnu.org/licenses/.  */
      32  
      33  #include "gmp-impl.h"
      34  #include "longlong.h"
      35  
      36  #ifndef BIN_GOETGHELUCK_THRESHOLD
      37  #define BIN_GOETGHELUCK_THRESHOLD  512
      38  #endif
      39  #ifndef BIN_UIUI_ENABLE_SMALLDC
      40  #define BIN_UIUI_ENABLE_SMALLDC    1
      41  #endif
      42  #ifndef BIN_UIUI_RECURSIVE_SMALLDC
      43  #define BIN_UIUI_RECURSIVE_SMALLDC (GMP_NUMB_BITS > 32)
      44  #endif
      45  
      46  /* Algorithm:
      47  
      48     Accumulate chunks of factors first limb-by-limb (using one of mul0-mul8)
      49     which are then accumulated into mpn numbers.  The first inner loop
      50     accumulates divisor factors, the 2nd inner loop accumulates exactly the same
      51     number of dividend factors.  We avoid accumulating more for the divisor,
      52     even with its smaller factors, since we else cannot guarantee divisibility.
      53  
      54     Since we know each division will yield an integer, we compute the quotient
      55     using Hensel norm: If the quotient is limited by 2^t, we compute A / B mod
      56     2^t.
      57  
      58     Improvements:
      59  
      60     (1) An obvious improvement to this code would be to compute mod 2^t
      61     everywhere.  Unfortunately, we cannot determine t beforehand, unless we
      62     invoke some approximation, such as Stirling's formula.  Of course, we don't
      63     need t to be tight.  However, it is not clear that this would help much,
      64     our numbers are kept reasonably small already.
      65  
      66     (2) Compute nmax/kmax semi-accurately, without scalar division or a loop.
      67     Extracting the 3 msb, then doing a table lookup using cnt*8+msb as index,
      68     would make it both reasonably accurate and fast.  (We could use a table
      69     stored into a limb, perhaps.)  The table should take the removed factors of
      70     2 into account (those done on-the-fly in mulN).
      71  
      72     (3) The first time in the loop we compute the odd part of a
      73     factorial in kp, we might use oddfac_1 for this task.
      74   */
      75  
      76  /* This threshold determines how large divisor to accumulate before we call
      77     bdiv.  Perhaps we should never call bdiv, and accumulate all we are told,
      78     since we are just basecase code anyway?  Presumably, this depends on the
      79     relative speed of the asymptotically fast code and this code.  */
      80  #define SOME_THRESHOLD 20
      81  
      82  /* Multiply-into-limb functions.  These remove factors of 2 on-the-fly.  FIXME:
      83     All versions of MAXFACS don't take this 2 removal into account now, meaning
      84     that then, shifting just adds some overhead.  (We remove factors from the
      85     completed limb anyway.)  */
      86  
      87  static mp_limb_t
      88  mul1 (mp_limb_t m)
      89  {
      90    return m;
      91  }
      92  
      93  static mp_limb_t
      94  mul2 (mp_limb_t m)
      95  {
      96    /* We need to shift before multiplying, to avoid an overflow. */
      97    mp_limb_t m01 = (m | 1) * ((m + 1) >> 1);
      98    return m01;
      99  }
     100  
     101  static mp_limb_t
     102  mul3 (mp_limb_t m)
     103  {
     104    mp_limb_t m01 = (m + 0) * (m + 1) >> 1;
     105    mp_limb_t m2 = (m + 2);
     106    return m01 * m2;
     107  }
     108  
     109  static mp_limb_t
     110  mul4 (mp_limb_t m)
     111  {
     112    mp_limb_t m03 = (m + 0) * (m + 3) >> 1;
     113    return m03 * (m03 + 1); /* mul2 (m03) ? */
     114  }
     115  
     116  static mp_limb_t
     117  mul5 (mp_limb_t m)
     118  {
     119    mp_limb_t m03 = (m + 0) * (m + 3) >> 1;
     120    mp_limb_t m034 = m03 * (m + 4);
     121    return (m03 + 1) * m034;
     122  }
     123  
     124  static mp_limb_t
     125  mul6 (mp_limb_t m)
     126  {
     127    mp_limb_t m05 = (m + 0) * (m + 5);
     128    mp_limb_t m1234 = (m05 + 5) * (m05 + 5) >> 3;
     129    return m1234 * (m05 >> 1);
     130  }
     131  
     132  static mp_limb_t
     133  mul7 (mp_limb_t m)
     134  {
     135    mp_limb_t m05 = (m + 0) * (m + 5);
     136    mp_limb_t m1234 = (m05 + 5) * (m05 + 5) >> 3;
     137    mp_limb_t m056 = m05 * (m + 6) >> 1;
     138    return m1234 * m056;
     139  }
     140  
     141  static mp_limb_t
     142  mul8 (mp_limb_t m)
     143  {
     144    mp_limb_t m07 = (m + 0) * (m + 7);
     145    mp_limb_t m0257 = m07 * (m07 + 10) >> 3;
     146    mp_limb_t m1346 = m07 + 9 + m0257;
     147    return m0257 * m1346;
     148  }
     149  
     150  /*
     151  static mp_limb_t
     152  mul9 (mp_limb_t m)
     153  {
     154    return (m + 8) * mul8 (m) ;
     155  }
     156  
     157  static mp_limb_t
     158  mul10 (mp_limb_t m)
     159  {
     160    mp_limb_t m09 = (m + 0) * (m + 9);
     161    mp_limb_t m18 = (m09 >> 1) + 4;
     162    mp_limb_t m0369 = m09 * (m09 + 18) >> 3;
     163    mp_limb_t m2457 = m09 * 2 + 35 + m0369;
     164    return ((m0369 * m2457) >> 1) * m18;
     165  }
     166  */
     167  
     168  typedef mp_limb_t (* mulfunc_t) (mp_limb_t);
     169  
     170  static const mulfunc_t mulfunc[] = {mul1,mul2,mul3,mul4,mul5,mul6,mul7,mul8 /* ,mul9,mul10 */};
     171  #define M (numberof(mulfunc))
     172  
     173  /* Number of factors-of-2 removed by the corresponding mulN function.  */
     174  static const unsigned char tcnttab[] = {0, 1, 1, 2, 2, 4, 4, 6 /*,6,8*/};
     175  
     176  #if 1
     177  /* This variant is inaccurate but share the code with other functions.  */
     178  #define MAXFACS(max,l)							\
     179    do {									\
     180      (max) = log_n_max (l);						\
     181    } while (0)
     182  #else
     183  
     184  /* This variant is exact(?) but uses a loop.  It takes the 2 removal
     185   of mulN into account.  */
     186  static const unsigned long ftab[] =
     187  #if GMP_NUMB_BITS == 64
     188    /* 1 to 8 factors per iteration */
     189    {CNST_LIMB(0xffffffffffffffff),CNST_LIMB(0x16a09e667),0x32cbfc,0x16a08,0x24c0,0xa11,0x345,0x1ab /*,0xe9,0x8e */};
     190  #endif
     191  #if GMP_NUMB_BITS == 32
     192    /* 1 to 7 factors per iteration */
     193    {0xffffffff,0x16a09,0x7ff,0x168,0x6f,0x3d,0x20 /* ,0x17 */};
     194  #endif
     195  
     196  #define MAXFACS(max,l)							\
     197    do {									\
     198      int __i;								\
     199      for (__i = numberof (ftab) - 1; l > ftab[__i]; __i--)		\
     200        ;									\
     201      (max) = __i + 1;							\
     202    } while (0)
     203  #endif
     204  
     205  /* Entry i contains (i!/2^t)^(-1) where t is chosen such that the parenthesis
     206     is an odd integer. */
     207  static const mp_limb_t facinv[] = { ONE_LIMB_ODD_FACTORIAL_INVERSES_TABLE };
     208  
     209  static void
     210  mpz_bdiv_bin_uiui (mpz_ptr r, unsigned long int n, unsigned long int k)
     211  {
     212    unsigned nmax, kmax, nmaxnow, numfac;
     213    mp_ptr np, kp;
     214    mp_size_t nn, kn, alloc;
     215    mp_limb_t i, j, t, iii, jjj, cy, dinv;
     216    int cnt;
     217    mp_size_t maxn;
     218    TMP_DECL;
     219  
     220    ASSERT (k > ODD_FACTORIAL_TABLE_LIMIT);
     221    TMP_MARK;
     222  
     223    maxn = 1 + n / GMP_NUMB_BITS;    /* absolutely largest result size (limbs) */
     224  
     225    /* FIXME: This allocation might be insufficient, but is usually way too
     226       large.  */
     227    alloc = SOME_THRESHOLD - 1 + MAX (3 * maxn / 2, SOME_THRESHOLD);
     228    alloc = MIN (alloc, (mp_size_t) k) + 1;
     229    TMP_ALLOC_LIMBS_2 (np, alloc, kp, SOME_THRESHOLD + 1);
     230  
     231    MAXFACS (nmax, n);
     232    ASSERT (nmax <= M);
     233    MAXFACS (kmax, k);
     234    ASSERT (kmax <= M);
     235    ASSERT (k >= M);
     236  
     237    i = n - k + 1;
     238  
     239    np[0] = 1; nn = 1;
     240  
     241    numfac = 1;
     242    j = ODD_FACTORIAL_TABLE_LIMIT + 1;
     243    jjj = ODD_FACTORIAL_TABLE_MAX;
     244    ASSERT (__gmp_oddfac_table[ODD_FACTORIAL_TABLE_LIMIT] == ODD_FACTORIAL_TABLE_MAX);
     245  
     246    while (1)
     247      {
     248        kp[0] = jjj;				/* store new factors */
     249        kn = 1;
     250        t = k - j + 1;
     251        kmax = MIN (kmax, t);
     252  
     253        while (kmax != 0 && kn < SOME_THRESHOLD)
     254  	{
     255  	  jjj = mulfunc[kmax - 1] (j);
     256  	  j += kmax;				/* number of factors used */
     257  	  count_trailing_zeros (cnt, jjj);	/* count low zeros */
     258  	  jjj >>= cnt;				/* remove remaining low zeros */
     259  	  cy = mpn_mul_1 (kp, kp, kn, jjj);	/* accumulate new factors */
     260  	  kp[kn] = cy;
     261  	  kn += cy != 0;
     262  	  t = k - j + 1;
     263  	  kmax = MIN (kmax, t);
     264  	}
     265        numfac = j - numfac;
     266  
     267        while (numfac != 0)
     268  	{
     269  	  nmaxnow = MIN (nmax, numfac);
     270  	  iii = mulfunc[nmaxnow - 1] (i);
     271  	  i += nmaxnow;				/* number of factors used */
     272  	  count_trailing_zeros (cnt, iii);	/* count low zeros */
     273  	  iii >>= cnt;				/* remove remaining low zeros */
     274  	  cy = mpn_mul_1 (np, np, nn, iii);	/* accumulate new factors */
     275  	  np[nn] = cy;
     276  	  nn += cy != 0;
     277  	  numfac -= nmaxnow;
     278  	}
     279  
     280        ASSERT (nn < alloc);
     281  
     282        binvert_limb (dinv, kp[0]);
     283        nn += (np[nn - 1] >= kp[kn - 1]);
     284        nn -= kn;
     285        mpn_sbpi1_bdiv_q (np, np, nn, kp, MIN(kn,nn), -dinv);
     286        mpn_neg (np, np, nn);
     287  
     288        if (kmax == 0)
     289  	break;
     290        numfac = j;
     291  
     292        jjj = mulfunc[kmax - 1] (j);
     293        j += kmax;				/* number of factors used */
     294        count_trailing_zeros (cnt, jjj);		/* count low zeros */
     295        jjj >>= cnt;				/* remove remaining low zeros */
     296      }
     297  
     298    /* Put back the right number of factors of 2.  */
     299    popc_limb (cnt, n - k);
     300    popc_limb (j, k);
     301    cnt += j;
     302    popc_limb (j, n);
     303    cnt -= j;
     304    if (cnt != 0)
     305      {
     306        ASSERT (cnt < GMP_NUMB_BITS); /* can happen, but not for intended use */
     307        cy = mpn_lshift (np, np, nn, cnt);
     308        np[nn] = cy;
     309        nn += cy != 0;
     310      }
     311  
     312    nn -= np[nn - 1] == 0;	/* normalisation */
     313  
     314    kp = MPZ_NEWALLOC (r, nn);
     315    SIZ(r) = nn;
     316    MPN_COPY (kp, np, nn);
     317    TMP_FREE;
     318  }
     319  
     320  static void
     321  mpz_smallk_bin_uiui (mpz_ptr r, unsigned long int n, unsigned long int k)
     322  {
     323    unsigned nmax, numfac;
     324    mp_ptr rp;
     325    mp_size_t rn, alloc;
     326    mp_limb_t i, iii, cy;
     327    unsigned i2cnt, cnt;
     328  
     329    MAXFACS (nmax, n);
     330    nmax = MIN (nmax, M);
     331  
     332    i = n - k + 1;
     333  
     334    i2cnt = __gmp_fac2cnt_table[k / 2 - 1];		/* low zeros count */
     335    if (nmax >= k)
     336      {
     337        MPZ_NEWALLOC (r, 1) [0] = mulfunc[k - 1] (i) * facinv[k - 2] >>
     338  	(i2cnt - tcnttab[k - 1]);
     339        SIZ(r) = 1;
     340        return;
     341      }
     342  
     343    count_leading_zeros (cnt, (mp_limb_t) n);
     344    cnt = GMP_LIMB_BITS - cnt;
     345    alloc = cnt * k / GMP_NUMB_BITS + 3;	/* FIXME: ensure rounding is enough. */
     346    rp = MPZ_NEWALLOC (r, alloc);
     347  
     348    rp[0] = mulfunc[nmax - 1] (i);
     349    rn = 1;
     350    i += nmax;				/* number of factors used */
     351    i2cnt -= tcnttab[nmax - 1];		/* low zeros count */
     352    numfac = k - nmax;
     353    do
     354      {
     355        nmax = MIN (nmax, numfac);
     356        iii = mulfunc[nmax - 1] (i);
     357        i += nmax;			/* number of factors used */
     358        i2cnt -= tcnttab[nmax - 1];	/* update low zeros count */
     359        cy = mpn_mul_1 (rp, rp, rn, iii);	/* accumulate new factors */
     360        rp[rn] = cy;
     361        rn += cy != 0;
     362        numfac -= nmax;
     363      } while (numfac != 0);
     364  
     365    ASSERT (rn < alloc);
     366  
     367    mpn_pi1_bdiv_q_1 (rp, rp, rn, __gmp_oddfac_table[k], facinv[k - 2], i2cnt);
     368    /* A two-fold, branch-free normalisation is possible :*/
     369    /* rn -= rp[rn - 1] == 0; */
     370    /* rn -= rp[rn - 1] == 0; */
     371    MPN_NORMALIZE_NOT_ZERO (rp, rn);
     372  
     373    SIZ(r) = rn;
     374  }
     375  
     376  /* Algorithm:
     377  
     378     Plain and simply multiply things together.
     379  
     380     We tabulate factorials (k!/2^t)^(-1) mod B (where t is chosen such
     381     that k!/2^t is odd).
     382  
     383  */
     384  
     385  static mp_limb_t
     386  bc_bin_uiui (unsigned int n, unsigned int k)
     387  {
     388    return ((__gmp_oddfac_table[n] * facinv[k - 2] * facinv[n - k - 2])
     389      << (__gmp_fac2cnt_table[n / 2 - 1] - __gmp_fac2cnt_table[k / 2 - 1] - __gmp_fac2cnt_table[(n-k) / 2 - 1]))
     390      & GMP_NUMB_MASK;
     391  }
     392  
     393  /* Algorithm:
     394  
     395     Recursively exploit the relation
     396     bin(n,k) = bin(n,k>>1)*bin(n-k>>1,k-k>>1)/bin(k,k>>1) .
     397  
     398     Values for binomial(k,k>>1) that fit in a limb are precomputed
     399     (with inverses).
     400  */
     401  
     402  /* bin2kk[i - ODD_CENTRAL_BINOMIAL_OFFSET] =
     403     binomial(i*2,i)/2^t (where t is chosen so that it is odd). */
     404  static const mp_limb_t bin2kk[] = { ONE_LIMB_ODD_CENTRAL_BINOMIAL_TABLE };
     405  
     406  /* bin2kkinv[i] = bin2kk[i]^-1 mod B */
     407  static const mp_limb_t bin2kkinv[] = { ONE_LIMB_ODD_CENTRAL_BINOMIAL_INVERSE_TABLE };
     408  
     409  /* bin2kk[i] = binomial((i+MIN_S)*2,i+MIN_S)/2^t. This table contains the t values. */
     410  static const unsigned char fac2bin[] = { CENTRAL_BINOMIAL_2FAC_TABLE };
     411  
     412  static void
     413  mpz_smallkdc_bin_uiui (mpz_ptr r, unsigned long int n, unsigned long int k)
     414  {
     415    mp_ptr rp;
     416    mp_size_t rn;
     417    unsigned long int hk;
     418  
     419    hk = k >> 1;
     420  
     421    if ((! BIN_UIUI_RECURSIVE_SMALLDC) || hk <= ODD_FACTORIAL_TABLE_LIMIT)
     422      mpz_smallk_bin_uiui (r, n, hk);
     423    else
     424      mpz_smallkdc_bin_uiui (r, n, hk);
     425    k -= hk;
     426    n -= hk;
     427    if (n <= ODD_FACTORIAL_EXTTABLE_LIMIT) {
     428      mp_limb_t cy;
     429      rn = SIZ (r);
     430      rp = MPZ_REALLOC (r, rn + 1);
     431      cy = mpn_mul_1 (rp, rp, rn, bc_bin_uiui (n, k));
     432      rp [rn] = cy;
     433      rn += cy != 0;
     434    } else {
     435      mp_limb_t buffer[ODD_CENTRAL_BINOMIAL_TABLE_LIMIT + 3];
     436      mpz_t t;
     437  
     438      ALLOC (t) = ODD_CENTRAL_BINOMIAL_TABLE_LIMIT + 3;
     439      PTR (t) = buffer;
     440      if ((! BIN_UIUI_RECURSIVE_SMALLDC) || k <= ODD_FACTORIAL_TABLE_LIMIT)
     441        mpz_smallk_bin_uiui (t, n, k);
     442      else
     443        mpz_smallkdc_bin_uiui (t, n, k);
     444      mpz_mul (r, r, t);
     445      rp = PTR (r);
     446      rn = SIZ (r);
     447    }
     448  
     449    mpn_pi1_bdiv_q_1 (rp, rp, rn, bin2kk[k - ODD_CENTRAL_BINOMIAL_OFFSET],
     450  		    bin2kkinv[k - ODD_CENTRAL_BINOMIAL_OFFSET],
     451  		    fac2bin[k - ODD_CENTRAL_BINOMIAL_OFFSET] - (k != hk));
     452    /* A two-fold, branch-free normalisation is possible :*/
     453    /* rn -= rp[rn - 1] == 0; */
     454    /* rn -= rp[rn - 1] == 0; */
     455    MPN_NORMALIZE_NOT_ZERO (rp, rn);
     456  
     457    SIZ(r) = rn;
     458  }
     459  
     460  /* mpz_goetgheluck_bin_uiui(RESULT, N, K) -- Set RESULT to binomial(N,K).
     461   *
     462   * Contributed to the GNU project by Marco Bodrato.
     463   *
     464   * Implementation of the algorithm by P. Goetgheluck, "Computing
     465   * Binomial Coefficients", The American Mathematical Monthly, Vol. 94,
     466   * No. 4 (April 1987), pp. 360-365.
     467   *
     468   * Acknowledgment: Peter Luschny did spot the slowness of the previous
     469   * code and suggested the reference.
     470   */
     471  
     472  /* TODO: Remove duplicated constants / macros / static functions...
     473   */
     474  
     475  /*************************************************************/
     476  /* Section macros: common macros, for swing/fac/bin (&sieve) */
     477  /*************************************************************/
     478  
     479  #define FACTOR_LIST_APPEND(PR, MAX_PR, VEC, I)			\
     480    if ((PR) > (MAX_PR)) {					\
     481      (VEC)[(I)++] = (PR);					\
     482      (PR) = 1;							\
     483    }
     484  
     485  #define FACTOR_LIST_STORE(P, PR, MAX_PR, VEC, I)		\
     486    do {								\
     487      if ((PR) > (MAX_PR)) {					\
     488        (VEC)[(I)++] = (PR);					\
     489        (PR) = (P);						\
     490      } else							\
     491        (PR) *= (P);						\
     492    } while (0)
     493  
     494  #define LOOP_ON_SIEVE_CONTINUE(prime,end)			\
     495      __max_i = (end);						\
     496  								\
     497      do {							\
     498        ++__i;							\
     499        if ((*__sieve & __mask) == 0)				\
     500  	{							\
     501  	  mp_limb_t prime;					\
     502  	  prime = id_to_n(__i)
     503  
     504  #define LOOP_ON_SIEVE_BEGIN(prime,start,end,off,sieve)		\
     505    do {								\
     506      mp_limb_t __mask, *__sieve, __max_i, __i;			\
     507  								\
     508      __i = (start)-(off);					\
     509      __sieve = (sieve) + __i / GMP_LIMB_BITS;			\
     510      __mask = CNST_LIMB(1) << (__i % GMP_LIMB_BITS);		\
     511      __i += (off);						\
     512  								\
     513      LOOP_ON_SIEVE_CONTINUE(prime,end)
     514  
     515  #define LOOP_ON_SIEVE_STOP					\
     516  	}							\
     517        __mask = __mask << 1 | __mask >> (GMP_LIMB_BITS-1);	\
     518        __sieve += __mask & 1;					\
     519      }  while (__i <= __max_i)
     520  
     521  #define LOOP_ON_SIEVE_END					\
     522      LOOP_ON_SIEVE_STOP;						\
     523    } while (0)
     524  
     525  /*********************************************************/
     526  /* Section sieve: sieving functions and tools for primes */
     527  /*********************************************************/
     528  
     529  #if WANT_ASSERT
     530  static mp_limb_t
     531  bit_to_n (mp_limb_t bit) { return (bit*3+4)|1; }
     532  #endif
     533  
     534  /* id_to_n (x) = bit_to_n (x-1) = (id*3+1)|1*/
     535  static mp_limb_t
     536  id_to_n  (mp_limb_t id)  { return id*3+1+(id&1); }
     537  
     538  /* n_to_bit (n) = ((n-1)&(-CNST_LIMB(2)))/3U-1 */
     539  static mp_limb_t
     540  n_to_bit (mp_limb_t n) { return ((n-5)|1)/3U; }
     541  
     542  static mp_size_t
     543  primesieve_size (mp_limb_t n) { return n_to_bit(n) / GMP_LIMB_BITS + 1; }
     544  
     545  /*********************************************************/
     546  /* Section binomial: fast binomial implementation        */
     547  /*********************************************************/
     548  
     549  #define COUNT_A_PRIME(P, N, K, PR, MAX_PR, VEC, I)	\
     550    do {							\
     551      mp_limb_t __a, __b, __prime, __ma,__mb;		\
     552      __prime = (P);					\
     553      __a = (N); __b = (K); __mb = 0;			\
     554      FACTOR_LIST_APPEND(PR, MAX_PR, VEC, I);		\
     555      do {						\
     556        __mb += __b % __prime; __b /= __prime;		\
     557        __ma = __a % __prime; __a /= __prime;		\
     558        if (__ma < __mb) {				\
     559          __mb = 1; (PR) *= __prime;			\
     560        } else  __mb = 0;					\
     561      } while (__a >= __prime);				\
     562    } while (0)
     563  
     564  #define SH_COUNT_A_PRIME(P, N, K, PR, MAX_PR, VEC, I)	\
     565    do {							\
     566      mp_limb_t __prime;					\
     567      __prime = (P);					\
     568      if (((N) % __prime) < ((K) % __prime)) {		\
     569        FACTOR_LIST_STORE (__prime, PR, MAX_PR, VEC, I);	\
     570      }							\
     571    } while (0)
     572  
     573  /* Returns an approximation of the sqare root of x.
     574   * It gives:
     575   *   limb_apprsqrt (x) ^ 2 <= x < (limb_apprsqrt (x)+1) ^ 2
     576   * or
     577   *   x <= limb_apprsqrt (x) ^ 2 <= x * 9/8
     578   */
     579  static mp_limb_t
     580  limb_apprsqrt (mp_limb_t x)
     581  {
     582    int s;
     583  
     584    ASSERT (x > 2);
     585    count_leading_zeros (s, x);
     586    s = (GMP_LIMB_BITS - s) >> 1;
     587    return ((CNST_LIMB(1) << (s - 1)) + (x >> 1 >> s));
     588  }
     589  
     590  static void
     591  mpz_goetgheluck_bin_uiui (mpz_ptr r, unsigned long int n, unsigned long int k)
     592  {
     593    mp_limb_t *sieve, *factors, count;
     594    mp_limb_t prod, max_prod;
     595    mp_size_t j;
     596    TMP_DECL;
     597  
     598    ASSERT (BIN_GOETGHELUCK_THRESHOLD >= 13);
     599    ASSERT (n >= 25);
     600  
     601    TMP_MARK;
     602    sieve = TMP_ALLOC_LIMBS (primesieve_size (n));
     603  
     604    count = gmp_primesieve (sieve, n) + 1;
     605    factors = TMP_ALLOC_LIMBS (count / log_n_max (n) + 1);
     606  
     607    max_prod = GMP_NUMB_MAX / n;
     608  
     609    /* Handle primes = 2, 3 separately. */
     610    popc_limb (count, n - k);
     611    popc_limb (j, k);
     612    count += j;
     613    popc_limb (j, n);
     614    count -= j;
     615    prod = CNST_LIMB(1) << count;
     616  
     617    j = 0;
     618    COUNT_A_PRIME (3, n, k, prod, max_prod, factors, j);
     619  
     620    /* Accumulate prime factors from 5 to n/2 */
     621      {
     622        mp_limb_t s;
     623  
     624        s = limb_apprsqrt(n);
     625        s = n_to_bit (s);
     626        ASSERT (bit_to_n (s+1) * bit_to_n (s+1) > n);
     627        ASSERT (s <= n_to_bit (n >> 1));
     628        LOOP_ON_SIEVE_BEGIN (prime, n_to_bit (5), s, 0,sieve);
     629        COUNT_A_PRIME (prime, n, k, prod, max_prod, factors, j);
     630        LOOP_ON_SIEVE_STOP;
     631  
     632        ASSERT (max_prod <= GMP_NUMB_MAX / 2);
     633        max_prod <<= 1;
     634  
     635        LOOP_ON_SIEVE_CONTINUE (prime, n_to_bit (n >> 1));
     636        SH_COUNT_A_PRIME (prime, n, k, prod, max_prod, factors, j);
     637        LOOP_ON_SIEVE_END;
     638  
     639        max_prod >>= 1;
     640      }
     641  
     642    /* Store primes from (n-k)+1 to n */
     643    ASSERT (n_to_bit (n - k) < n_to_bit (n));
     644  
     645    LOOP_ON_SIEVE_BEGIN (prime, n_to_bit (n - k) + 1, n_to_bit (n), 0,sieve);
     646    FACTOR_LIST_STORE (prime, prod, max_prod, factors, j);
     647    LOOP_ON_SIEVE_END;
     648  
     649    if (LIKELY (j != 0))
     650      {
     651        factors[j++] = prod;
     652        mpz_prodlimbs (r, factors, j);
     653      }
     654    else
     655      {
     656        MPZ_NEWALLOC (r, 1)[0] = prod;
     657        SIZ (r) = 1;
     658      }
     659    TMP_FREE;
     660  }
     661  
     662  #undef COUNT_A_PRIME
     663  #undef SH_COUNT_A_PRIME
     664  #undef LOOP_ON_SIEVE_END
     665  #undef LOOP_ON_SIEVE_STOP
     666  #undef LOOP_ON_SIEVE_BEGIN
     667  #undef LOOP_ON_SIEVE_CONTINUE
     668  
     669  /*********************************************************/
     670  /* End of implementation of Goetgheluck's algorithm      */
     671  /*********************************************************/
     672  
     673  void
     674  mpz_bin_uiui (mpz_ptr r, unsigned long int n, unsigned long int k)
     675  {
     676    if (UNLIKELY (n < k)) {
     677      SIZ (r) = 0;
     678  #if BITS_PER_ULONG > GMP_NUMB_BITS
     679    } else if (UNLIKELY (n > GMP_NUMB_MAX)) {
     680      mpz_t tmp;
     681  
     682      mpz_init_set_ui (tmp, n);
     683      mpz_bin_ui (r, tmp, k);
     684      mpz_clear (tmp);
     685  #endif
     686    } else {
     687      ASSERT (n <= GMP_NUMB_MAX);
     688      /* Rewrite bin(n,k) as bin(n,n-k) if that is smaller. */
     689      k = MIN (k, n - k);
     690      if (k < 2) {
     691        MPZ_NEWALLOC (r, 1)[0] = k ? n : 1; /* 1 + ((-k) & (n-1)); */
     692        SIZ(r) = 1;
     693      } else if (n <= ODD_FACTORIAL_EXTTABLE_LIMIT) { /* k >= 2, n >= 4 */
     694        MPZ_NEWALLOC (r, 1)[0] = bc_bin_uiui (n, k);
     695        SIZ(r) = 1;
     696      } else if (k <= ODD_FACTORIAL_TABLE_LIMIT)
     697        mpz_smallk_bin_uiui (r, n, k);
     698      else if (BIN_UIUI_ENABLE_SMALLDC &&
     699  	     k <= (BIN_UIUI_RECURSIVE_SMALLDC ? ODD_CENTRAL_BINOMIAL_TABLE_LIMIT : ODD_FACTORIAL_TABLE_LIMIT)* 2)
     700        mpz_smallkdc_bin_uiui (r, n, k);
     701      else if (ABOVE_THRESHOLD (k, BIN_GOETGHELUCK_THRESHOLD) &&
     702  	     k > (n >> 4)) /* k > ODD_FACTORIAL_TABLE_LIMIT */
     703        mpz_goetgheluck_bin_uiui (r, n, k);
     704      else
     705        mpz_bdiv_bin_uiui (r, n, k);
     706    }
     707  }