1  /* mpn_mullo_n -- multiply two n-limb numbers and return the low n limbs
       2     of their products.
       3  
       4     Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
       5  
       6     THIS IS (FOR NOW) AN INTERNAL FUNCTION.  IT IS ONLY SAFE TO REACH THIS
       7     FUNCTION THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST GUARANTEED
       8     THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
       9  
      10  Copyright 2004, 2005, 2009, 2010, 2012 Free Software Foundation, Inc.
      11  
      12  This file is part of the GNU MP Library.
      13  
      14  The GNU MP Library is free software; you can redistribute it and/or modify
      15  it under the terms of either:
      16  
      17    * the GNU Lesser General Public License as published by the Free
      18      Software Foundation; either version 3 of the License, or (at your
      19      option) any later version.
      20  
      21  or
      22  
      23    * the GNU General Public License as published by the Free Software
      24      Foundation; either version 2 of the License, or (at your option) any
      25      later version.
      26  
      27  or both in parallel, as here.
      28  
      29  The GNU MP Library is distributed in the hope that it will be useful, but
      30  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
      31  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
      32  for more details.
      33  
      34  You should have received copies of the GNU General Public License and the
      35  GNU Lesser General Public License along with the GNU MP Library.  If not,
      36  see https://www.gnu.org/licenses/.  */
      37  
      38  #include "gmp-impl.h"
      39  
      40  
      41  #if TUNE_PROGRAM_BUILD || WANT_FAT_BINARY
      42  #define MAYBE_range_basecase 1
      43  #define MAYBE_range_toom22   1
      44  #else
      45  #define MAYBE_range_basecase                                           \
      46    ((MULLO_DC_THRESHOLD == 0 ? MULLO_BASECASE_THRESHOLD : MULLO_DC_THRESHOLD) < MUL_TOOM22_THRESHOLD*36/(36-11))
      47  #define MAYBE_range_toom22                                             \
      48    ((MULLO_DC_THRESHOLD == 0 ? MULLO_BASECASE_THRESHOLD : MULLO_DC_THRESHOLD) < MUL_TOOM33_THRESHOLD*36/(36-11) )
      49  #endif
      50  
      51  /*  THINK: The DC strategy uses different constants in different Toom's
      52  	 ranges. Something smoother?
      53  */
      54  
      55  /*
      56    Compute the least significant half of the product {xy,n}*{yp,n}, or
      57    formally {rp,n} = {xy,n}*{yp,n} Mod (B^n).
      58  
      59    Above the given threshold, the Divide and Conquer strategy is used.
      60    The operands are split in two, and a full product plus two mullo
      61    are used to obtain the final result. The more natural strategy is to
      62    split in two halves, but this is far from optimal when a
      63    sub-quadratic multiplication is used.
      64  
      65    Mulders suggests an unbalanced split in favour of the full product,
      66    split n = n1 + n2, where an = n1 <= n2 = (1-a)n; i.e. 0 < a <= 1/2.
      67  
      68    To compute the value of a, we assume that the cost of mullo for a
      69    given size ML(n) is a fraction of the cost of a full product with
      70    same size M(n), and the cost M(n)=n^e for some exponent 1 < e <= 2;
      71    then we can write:
      72  
      73    ML(n) = 2*ML(an) + M((1-a)n) => k*M(n) = 2*k*M(n)*a^e + M(n)*(1-a)^e
      74  
      75    Given a value for e, want to minimise the value of k, i.e. the
      76    function k=(1-a)^e/(1-2*a^e).
      77  
      78    With e=2, the exponent for schoolbook multiplication, the minimum is
      79    given by the values a=1-a=1/2.
      80  
      81    With e=log(3)/log(2), the exponent for Karatsuba (aka toom22),
      82    Mulders compute (1-a) = 0.694... and we approximate a with 11/36.
      83  
      84    Other possible approximations follow:
      85    e=log(5)/log(3) [Toom-3] -> a ~= 9/40
      86    e=log(7)/log(4) [Toom-4] -> a ~= 7/39
      87    e=log(11)/log(6) [Toom-6] -> a ~= 1/8
      88    e=log(15)/log(8) [Toom-8] -> a ~= 1/10
      89  
      90    The values above where obtained with the following trivial commands
      91    in the gp-pari shell:
      92  
      93  fun(e,a)=(1-a)^e/(1-2*a^e)
      94  mul(a,b,c)={local(m,x,p);if(b-c<1/10000,(b+c)/2,m=1;x=b;forstep(p=c,b,(b-c)/8,if(fun(a,p)<m,m=fun(a,p);x=p));mul(a,(b+x)/2,(c+x)/2))}
      95  contfracpnqn(contfrac(mul(log(2*2-1)/log(2),1/2,0),5))
      96  contfracpnqn(contfrac(mul(log(3*2-1)/log(3),1/2,0),5))
      97  contfracpnqn(contfrac(mul(log(4*2-1)/log(4),1/2,0),5))
      98  contfracpnqn(contfrac(mul(log(6*2-1)/log(6),1/2,0),3))
      99  contfracpnqn(contfrac(mul(log(8*2-1)/log(8),1/2,0),3))
     100  
     101    ,
     102    |\
     103    | \
     104    +----,
     105    |    |
     106    |    |
     107    |    |\
     108    |    | \
     109    +----+--`
     110    ^ n2 ^n1^
     111  
     112    For an actual implementation, the assumption that M(n)=n^e is
     113    incorrect, as a consequence also the assumption that ML(n)=k*M(n)
     114    with a constant k is wrong.
     115  
     116    But theory suggest us two things:
     117    - the best the multiplication product is (lower e), the more k
     118      approaches 1, and a approaches 0.
     119  
     120    - A value for a smaller than optimal is probably less bad than a
     121      bigger one: e.g. let e=log(3)/log(2), a=0.3058_ the optimal
     122      value, and k(a)=0.808_ the mul/mullo speed ratio. We get
     123      k(a+1/6)=0.929_ but k(a-1/6)=0.865_.
     124  */
     125  
     126  static mp_size_t
     127  mpn_mullo_n_itch (mp_size_t n)
     128  {
     129    return 2*n;
     130  }
     131  
     132  /*
     133      mpn_dc_mullo_n requires a scratch space of 2*n limbs at tp.
     134      It accepts tp == rp.
     135  */
     136  static void
     137  mpn_dc_mullo_n (mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n, mp_ptr tp)
     138  {
     139    mp_size_t n2, n1;
     140    ASSERT (n >= 2);
     141    ASSERT (! MPN_OVERLAP_P (rp, n, xp, n));
     142    ASSERT (! MPN_OVERLAP_P (rp, n, yp, n));
     143    ASSERT (MPN_SAME_OR_SEPARATE2_P(rp, n, tp, 2*n));
     144  
     145    /* Divide-and-conquer */
     146  
     147    /* We need fractional approximation of the value 0 < a <= 1/2
     148       giving the minimum in the function k=(1-a)^e/(1-2*a^e).
     149    */
     150    if (MAYBE_range_basecase && BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD*36/(36-11)))
     151      n1 = n >> 1;
     152    else if (MAYBE_range_toom22 && BELOW_THRESHOLD (n, MUL_TOOM33_THRESHOLD*36/(36-11)))
     153      n1 = n * 11 / (size_t) 36;	/* n1 ~= n*(1-.694...) */
     154    else if (BELOW_THRESHOLD (n, MUL_TOOM44_THRESHOLD*40/(40-9)))
     155      n1 = n * 9 / (size_t) 40;	/* n1 ~= n*(1-.775...) */
     156    else if (BELOW_THRESHOLD (n, MUL_TOOM8H_THRESHOLD*10/9))
     157      n1 = n * 7 / (size_t) 39;	/* n1 ~= n*(1-.821...) */
     158    /* n1 = n * 4 / (size_t) 31;	// n1 ~= n*(1-.871...) [TOOM66] */
     159    else
     160      n1 = n / (size_t) 10;		/* n1 ~= n*(1-.899...) [TOOM88] */
     161  
     162    n2 = n - n1;
     163  
     164    /* Split as x = x1 2^(n2 GMP_NUMB_BITS) + x0,
     165  	      y = y1 2^(n2 GMP_NUMB_BITS) + y0 */
     166  
     167    /* x0 * y0 */
     168    mpn_mul_n (tp, xp, yp, n2);
     169    MPN_COPY (rp, tp, n2);
     170  
     171    /* x1 * y0 * 2^(n2 GMP_NUMB_BITS) */
     172    if (BELOW_THRESHOLD (n1, MULLO_BASECASE_THRESHOLD))
     173      mpn_mul_basecase (tp + n, xp + n2, n1, yp, n1);
     174    else if (BELOW_THRESHOLD (n1, MULLO_DC_THRESHOLD))
     175      mpn_mullo_basecase (tp + n, xp + n2, yp, n1);
     176    else
     177      mpn_dc_mullo_n (tp + n, xp + n2, yp, n1, tp + n);
     178    mpn_add_n (rp + n2, tp + n2, tp + n, n1);
     179  
     180    /* x0 * y1 * 2^(n2 GMP_NUMB_BITS) */
     181    if (BELOW_THRESHOLD (n1, MULLO_BASECASE_THRESHOLD))
     182      mpn_mul_basecase (tp + n, xp, n1, yp + n2, n1);
     183    else if (BELOW_THRESHOLD (n1, MULLO_DC_THRESHOLD))
     184      mpn_mullo_basecase (tp + n, xp, yp + n2, n1);
     185    else
     186      mpn_dc_mullo_n (tp + n, xp, yp + n2, n1, tp + n);
     187    mpn_add_n (rp + n2, rp + n2, tp + n, n1);
     188  }
     189  
     190  /* Avoid zero allocations when MULLO_BASECASE_THRESHOLD is 0.  */
     191  #define MUL_BASECASE_ALLOC \
     192   (MULLO_BASECASE_THRESHOLD_LIMIT == 0 ? 1 : 2*MULLO_BASECASE_THRESHOLD_LIMIT)
     193  
     194  /* FIXME: This function should accept a temporary area; dc_mullow_n
     195     accepts a pointer tp, and handle the case tp == rp, do the same here.
     196     Maybe recombine the two functions.
     197     THINK: If mpn_mul_basecase is always faster than mpn_mullo_basecase
     198  	  (typically thanks to mpn_addmul_2) should we unconditionally use
     199  	  mpn_mul_n?
     200  */
     201  
     202  void
     203  mpn_mullo_n (mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n)
     204  {
     205    ASSERT (n >= 1);
     206    ASSERT (! MPN_OVERLAP_P (rp, n, xp, n));
     207    ASSERT (! MPN_OVERLAP_P (rp, n, yp, n));
     208  
     209    if (BELOW_THRESHOLD (n, MULLO_BASECASE_THRESHOLD))
     210      {
     211        /* Allocate workspace of fixed size on stack: fast! */
     212        mp_limb_t tp[MUL_BASECASE_ALLOC];
     213        mpn_mul_basecase (tp, xp, n, yp, n);
     214        MPN_COPY (rp, tp, n);
     215      }
     216    else if (BELOW_THRESHOLD (n, MULLO_DC_THRESHOLD))
     217      {
     218        mpn_mullo_basecase (rp, xp, yp, n);
     219      }
     220    else
     221      {
     222        mp_ptr tp;
     223        TMP_DECL;
     224        TMP_MARK;
     225        tp = TMP_ALLOC_LIMBS (mpn_mullo_n_itch (n));
     226        if (BELOW_THRESHOLD (n, MULLO_MUL_N_THRESHOLD))
     227  	{
     228  	  mpn_dc_mullo_n (rp, xp, yp, n, tp);
     229  	}
     230        else
     231  	{
     232  	  /* For really large operands, use plain mpn_mul_n but throw away upper n
     233  	     limbs of result.  */
     234  #if !TUNE_PROGRAM_BUILD && (MULLO_MUL_N_THRESHOLD > MUL_FFT_THRESHOLD)
     235  	  mpn_fft_mul (tp, xp, n, yp, n);
     236  #else
     237  	  mpn_mul_n (tp, xp, yp, n);
     238  #endif
     239  	  MPN_COPY (rp, tp, n);
     240  	}
     241        TMP_FREE;
     242      }
     243  }