1  /* mpn_mul -- Multiply two natural numbers.
       2  
       3     Contributed to the GNU project by Torbjorn Granlund.
       4  
       5  Copyright 1991, 1993, 1994, 1996, 1997, 1999-2003, 2005-2007, 2009, 2010, 2012,
       6  2014, 2019 Free Software Foundation, Inc.
       7  
       8  This file is part of the GNU MP Library.
       9  
      10  The GNU MP Library is free software; you can redistribute it and/or modify
      11  it under the terms of either:
      12  
      13    * the GNU Lesser General Public License as published by the Free
      14      Software Foundation; either version 3 of the License, or (at your
      15      option) any later version.
      16  
      17  or
      18  
      19    * the GNU General Public License as published by the Free Software
      20      Foundation; either version 2 of the License, or (at your option) any
      21      later version.
      22  
      23  or both in parallel, as here.
      24  
      25  The GNU MP Library is distributed in the hope that it will be useful, but
      26  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
      27  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
      28  for more details.
      29  
      30  You should have received copies of the GNU General Public License and the
      31  GNU Lesser General Public License along with the GNU MP Library.  If not,
      32  see https://www.gnu.org/licenses/.  */
      33  
      34  #include "gmp-impl.h"
      35  
      36  
      37  #ifndef MUL_BASECASE_MAX_UN
      38  #define MUL_BASECASE_MAX_UN 500
      39  #endif
      40  
      41  /* Areas where the different toom algorithms can be called (extracted
      42     from the t-toom*.c files, and ignoring small constant offsets):
      43  
      44     1/6  1/5 1/4 4/13 1/3 3/8 2/5 5/11 1/2 3/5 2/3 3/4 4/5   1 vn/un
      45                                          4/7              6/7
      46  				       6/11
      47                                         |--------------------| toom22 (small)
      48                                                             || toom22 (large)
      49                                                         |xxxx| toom22 called
      50                        |-------------------------------------| toom32
      51                                           |xxxxxxxxxxxxxxxx| | toom32 called
      52                                                 |------------| toom33
      53                                                            |x| toom33 called
      54               |---------------------------------|            | toom42
      55  	              |xxxxxxxxxxxxxxxxxxxxxxxx|            | toom42 called
      56                                         |--------------------| toom43
      57                                                 |xxxxxxxxxx|   toom43 called
      58           |-----------------------------|                      toom52 (unused)
      59                                                     |--------| toom44
      60  						   |xxxxxxxx| toom44 called
      61                                |--------------------|        | toom53
      62                                          |xxxxxx|              toom53 called
      63      |-------------------------|                               toom62 (unused)
      64                                             |----------------| toom54 (unused)
      65                        |--------------------|                  toom63
      66  	                      |xxxxxxxxx|                   | toom63 called
      67                            |---------------------------------| toom6h
      68  						   |xxxxxxxx| toom6h called
      69                                    |-------------------------| toom8h (32 bit)
      70                   |------------------------------------------| toom8h (64 bit)
      71  						   |xxxxxxxx| toom8h called
      72  */
      73  
      74  #define TOOM33_OK(an,bn) (6 + 2 * an < 3 * bn)
      75  #define TOOM44_OK(an,bn) (12 + 3 * an < 4 * bn)
      76  
      77  /* Multiply the natural numbers u (pointed to by UP, with UN limbs) and v
      78     (pointed to by VP, with VN limbs), and store the result at PRODP.  The
      79     result is UN + VN limbs.  Return the most significant limb of the result.
      80  
      81     NOTE: The space pointed to by PRODP is overwritten before finished with U
      82     and V, so overlap is an error.
      83  
      84     Argument constraints:
      85     1. UN >= VN.
      86     2. PRODP != UP and PRODP != VP, i.e. the destination must be distinct from
      87        the multiplier and the multiplicand.  */
      88  
      89  /*
      90    * The cutoff lines in the toomX2 and toomX3 code are now exactly between the
      91      ideal lines of the surrounding algorithms.  Is that optimal?
      92  
      93    * The toomX3 code now uses a structure similar to the one of toomX2, except
      94      that it loops longer in the unbalanced case.  The result is that the
      95      remaining area might have un < vn.  Should we fix the toomX2 code in a
      96      similar way?
      97  
      98    * The toomX3 code is used for the largest non-FFT unbalanced operands.  It
      99      therefore calls mpn_mul recursively for certain cases.
     100  
     101    * Allocate static temp space using THRESHOLD variables (except for toom44
     102      when !WANT_FFT).  That way, we can typically have no TMP_ALLOC at all.
     103  
     104    * We sort ToomX2 algorithms together, assuming the toom22, toom32, toom42
     105      have the same vn threshold.  This is not true, we should actually use
     106      mul_basecase for slightly larger operands for toom32 than for toom22, and
     107      even larger for toom42.
     108  
     109    * That problem is even more prevalent for toomX3.  We therefore use special
     110      THRESHOLD variables there.
     111  */
     112  
     113  mp_limb_t
     114  mpn_mul (mp_ptr prodp,
     115  	 mp_srcptr up, mp_size_t un,
     116  	 mp_srcptr vp, mp_size_t vn)
     117  {
     118    ASSERT (un >= vn);
     119    ASSERT (vn >= 1);
     120    ASSERT (! MPN_OVERLAP_P (prodp, un+vn, up, un));
     121    ASSERT (! MPN_OVERLAP_P (prodp, un+vn, vp, vn));
     122  
     123    if (BELOW_THRESHOLD (un, MUL_TOOM22_THRESHOLD))
     124      {
     125        /* When un (and thus vn) is below the toom22 range, do mul_basecase.
     126  	 Test un and not vn here not to thwart the un >> vn code below.
     127  	 This special case is not necessary, but cuts the overhead for the
     128  	 smallest operands. */
     129        mpn_mul_basecase (prodp, up, un, vp, vn);
     130      }
     131    else if (un == vn)
     132      {
     133        mpn_mul_n (prodp, up, vp, un);
     134      }
     135    else if (vn < MUL_TOOM22_THRESHOLD)
     136      { /* plain schoolbook multiplication */
     137  
     138        /* Unless un is very large, or else if have an applicable mpn_mul_N,
     139  	 perform basecase multiply directly.  */
     140        if (un <= MUL_BASECASE_MAX_UN
     141  #if HAVE_NATIVE_mpn_mul_2
     142  	  || vn <= 2
     143  #else
     144  	  || vn == 1
     145  #endif
     146  	  )
     147  	mpn_mul_basecase (prodp, up, un, vp, vn);
     148        else
     149  	{
     150  	  /* We have un >> MUL_BASECASE_MAX_UN > vn.  For better memory
     151  	     locality, split up[] into MUL_BASECASE_MAX_UN pieces and multiply
     152  	     these pieces with the vp[] operand.  After each such partial
     153  	     multiplication (but the last) we copy the most significant vn
     154  	     limbs into a temporary buffer since that part would otherwise be
     155  	     overwritten by the next multiplication.  After the next
     156  	     multiplication, we add it back.  This illustrates the situation:
     157  
     158                                                      -->vn<--
     159                                                        |  |<------- un ------->|
     160                                                           _____________________|
     161                                                          X                    /|
     162                                                        /XX__________________/  |
     163                                      _____________________                     |
     164                                     X                    /                     |
     165                                   /XX__________________/                       |
     166                 _____________________                                          |
     167                /                    /                                          |
     168              /____________________/                                            |
     169  	    ==================================================================
     170  
     171  	    The parts marked with X are the parts whose sums are copied into
     172  	    the temporary buffer.  */
     173  
     174  	  mp_limb_t tp[MUL_TOOM22_THRESHOLD_LIMIT];
     175  	  mp_limb_t cy;
     176  	  ASSERT (MUL_TOOM22_THRESHOLD <= MUL_TOOM22_THRESHOLD_LIMIT);
     177  
     178  	  mpn_mul_basecase (prodp, up, MUL_BASECASE_MAX_UN, vp, vn);
     179  	  prodp += MUL_BASECASE_MAX_UN;
     180  	  MPN_COPY (tp, prodp, vn);		/* preserve high triangle */
     181  	  up += MUL_BASECASE_MAX_UN;
     182  	  un -= MUL_BASECASE_MAX_UN;
     183  	  while (un > MUL_BASECASE_MAX_UN)
     184  	    {
     185  	      mpn_mul_basecase (prodp, up, MUL_BASECASE_MAX_UN, vp, vn);
     186  	      cy = mpn_add_n (prodp, prodp, tp, vn); /* add back preserved triangle */
     187  	      mpn_incr_u (prodp + vn, cy);
     188  	      prodp += MUL_BASECASE_MAX_UN;
     189  	      MPN_COPY (tp, prodp, vn);		/* preserve high triangle */
     190  	      up += MUL_BASECASE_MAX_UN;
     191  	      un -= MUL_BASECASE_MAX_UN;
     192  	    }
     193  	  if (un > vn)
     194  	    {
     195  	      mpn_mul_basecase (prodp, up, un, vp, vn);
     196  	    }
     197  	  else
     198  	    {
     199  	      ASSERT (un > 0);
     200  	      mpn_mul_basecase (prodp, vp, vn, up, un);
     201  	    }
     202  	  cy = mpn_add_n (prodp, prodp, tp, vn); /* add back preserved triangle */
     203  	  mpn_incr_u (prodp + vn, cy);
     204  	}
     205      }
     206    else if (BELOW_THRESHOLD (vn, MUL_TOOM33_THRESHOLD))
     207      {
     208        /* Use ToomX2 variants */
     209        mp_ptr scratch;
     210        TMP_SDECL; TMP_SMARK;
     211  
     212  #define ITCH_TOOMX2 (9 * vn / 2 + GMP_NUMB_BITS * 2)
     213        scratch = TMP_SALLOC_LIMBS (ITCH_TOOMX2);
     214        ASSERT (mpn_toom22_mul_itch ((5*vn-1)/4, vn) <= ITCH_TOOMX2); /* 5vn/2+ */
     215        ASSERT (mpn_toom32_mul_itch ((7*vn-1)/4, vn) <= ITCH_TOOMX2); /* 7vn/6+ */
     216        ASSERT (mpn_toom42_mul_itch (3 * vn - 1, vn) <= ITCH_TOOMX2); /* 9vn/2+ */
     217  #undef ITCH_TOOMX2
     218  
     219        /* FIXME: This condition (repeated in the loop below) leaves from a vn*vn
     220  	 square to a (3vn-1)*vn rectangle.  Leaving such a rectangle is hardly
     221  	 wise; we would get better balance by slightly moving the bound.  We
     222  	 will sometimes end up with un < vn, like in the X3 arm below.  */
     223        if (un >= 3 * vn)
     224  	{
     225  	  mp_limb_t cy;
     226  	  mp_ptr ws;
     227  
     228  	  /* The maximum ws usage is for the mpn_mul result.  */
     229  	  ws = TMP_SALLOC_LIMBS (4 * vn);
     230  
     231  	  mpn_toom42_mul (prodp, up, 2 * vn, vp, vn, scratch);
     232  	  un -= 2 * vn;
     233  	  up += 2 * vn;
     234  	  prodp += 2 * vn;
     235  
     236  	  while (un >= 3 * vn)
     237  	    {
     238  	      mpn_toom42_mul (ws, up, 2 * vn, vp, vn, scratch);
     239  	      un -= 2 * vn;
     240  	      up += 2 * vn;
     241  	      cy = mpn_add_n (prodp, prodp, ws, vn);
     242  	      MPN_COPY (prodp + vn, ws + vn, 2 * vn);
     243  	      mpn_incr_u (prodp + vn, cy);
     244  	      prodp += 2 * vn;
     245  	    }
     246  
     247  	  /* vn <= un < 3vn */
     248  
     249  	  if (4 * un < 5 * vn)
     250  	    mpn_toom22_mul (ws, up, un, vp, vn, scratch);
     251  	  else if (4 * un < 7 * vn)
     252  	    mpn_toom32_mul (ws, up, un, vp, vn, scratch);
     253  	  else
     254  	    mpn_toom42_mul (ws, up, un, vp, vn, scratch);
     255  
     256  	  cy = mpn_add_n (prodp, prodp, ws, vn);
     257  	  MPN_COPY (prodp + vn, ws + vn, un);
     258  	  mpn_incr_u (prodp + vn, cy);
     259  	}
     260        else
     261  	{
     262  	  if (4 * un < 5 * vn)
     263  	    mpn_toom22_mul (prodp, up, un, vp, vn, scratch);
     264  	  else if (4 * un < 7 * vn)
     265  	    mpn_toom32_mul (prodp, up, un, vp, vn, scratch);
     266  	  else
     267  	    mpn_toom42_mul (prodp, up, un, vp, vn, scratch);
     268  	}
     269        TMP_SFREE;
     270      }
     271    else if (BELOW_THRESHOLD ((un + vn) >> 1, MUL_FFT_THRESHOLD) ||
     272  	   BELOW_THRESHOLD (3 * vn, MUL_FFT_THRESHOLD))
     273      {
     274        /* Handle the largest operands that are not in the FFT range.  The 2nd
     275  	 condition makes very unbalanced operands avoid the FFT code (except
     276  	 perhaps as coefficient products of the Toom code.  */
     277  
     278        if (BELOW_THRESHOLD (vn, MUL_TOOM44_THRESHOLD) || !TOOM44_OK (un, vn))
     279  	{
     280  	  /* Use ToomX3 variants */
     281  	  mp_ptr scratch;
     282  	  TMP_DECL; TMP_MARK;
     283  
     284  #define ITCH_TOOMX3 (4 * vn + GMP_NUMB_BITS)
     285  	  scratch = TMP_ALLOC_LIMBS (ITCH_TOOMX3);
     286  	  ASSERT (mpn_toom33_mul_itch ((7*vn-1)/6, vn) <= ITCH_TOOMX3); /* 7vn/2+ */
     287  	  ASSERT (mpn_toom43_mul_itch ((3*vn-1)/2, vn) <= ITCH_TOOMX3); /* 9vn/4+ */
     288  	  ASSERT (mpn_toom32_mul_itch ((7*vn-1)/4, vn) <= ITCH_TOOMX3); /* 7vn/6+ */
     289  	  ASSERT (mpn_toom53_mul_itch ((11*vn-1)/6, vn) <= ITCH_TOOMX3); /* 11vn/3+ */
     290  	  ASSERT (mpn_toom42_mul_itch ((5*vn-1)/2, vn) <= ITCH_TOOMX3); /* 15vn/4+ */
     291  	  ASSERT (mpn_toom63_mul_itch ((5*vn-1)/2, vn) <= ITCH_TOOMX3); /* 15vn/4+ */
     292  #undef ITCH_TOOMX3
     293  
     294  	  if (2 * un >= 5 * vn)
     295  	    {
     296  	      mp_limb_t cy;
     297  	      mp_ptr ws;
     298  
     299  	      /* The maximum ws usage is for the mpn_mul result.  */
     300  	      ws = TMP_ALLOC_LIMBS (7 * vn >> 1);
     301  
     302  	      if (BELOW_THRESHOLD (vn, MUL_TOOM42_TO_TOOM63_THRESHOLD))
     303  		mpn_toom42_mul (prodp, up, 2 * vn, vp, vn, scratch);
     304  	      else
     305  		mpn_toom63_mul (prodp, up, 2 * vn, vp, vn, scratch);
     306  	      un -= 2 * vn;
     307  	      up += 2 * vn;
     308  	      prodp += 2 * vn;
     309  
     310  	      while (2 * un >= 5 * vn)	/* un >= 2.5vn */
     311  		{
     312  		  if (BELOW_THRESHOLD (vn, MUL_TOOM42_TO_TOOM63_THRESHOLD))
     313  		    mpn_toom42_mul (ws, up, 2 * vn, vp, vn, scratch);
     314  		  else
     315  		    mpn_toom63_mul (ws, up, 2 * vn, vp, vn, scratch);
     316  		  un -= 2 * vn;
     317  		  up += 2 * vn;
     318  		  cy = mpn_add_n (prodp, prodp, ws, vn);
     319  		  MPN_COPY (prodp + vn, ws + vn, 2 * vn);
     320  		  mpn_incr_u (prodp + vn, cy);
     321  		  prodp += 2 * vn;
     322  		}
     323  
     324  	      /* vn / 2 <= un < 2.5vn */
     325  
     326  	      if (un < vn)
     327  		mpn_mul (ws, vp, vn, up, un);
     328  	      else
     329  		mpn_mul (ws, up, un, vp, vn);
     330  
     331  	      cy = mpn_add_n (prodp, prodp, ws, vn);
     332  	      MPN_COPY (prodp + vn, ws + vn, un);
     333  	      mpn_incr_u (prodp + vn, cy);
     334  	    }
     335  	  else
     336  	    {
     337  	      if (6 * un < 7 * vn)
     338  		mpn_toom33_mul (prodp, up, un, vp, vn, scratch);
     339  	      else if (2 * un < 3 * vn)
     340  		{
     341  		  if (BELOW_THRESHOLD (vn, MUL_TOOM32_TO_TOOM43_THRESHOLD))
     342  		    mpn_toom32_mul (prodp, up, un, vp, vn, scratch);
     343  		  else
     344  		    mpn_toom43_mul (prodp, up, un, vp, vn, scratch);
     345  		}
     346  	      else if (6 * un < 11 * vn)
     347  		{
     348  		  if (4 * un < 7 * vn)
     349  		    {
     350  		      if (BELOW_THRESHOLD (vn, MUL_TOOM32_TO_TOOM53_THRESHOLD))
     351  			mpn_toom32_mul (prodp, up, un, vp, vn, scratch);
     352  		      else
     353  			mpn_toom53_mul (prodp, up, un, vp, vn, scratch);
     354  		    }
     355  		  else
     356  		    {
     357  		      if (BELOW_THRESHOLD (vn, MUL_TOOM42_TO_TOOM53_THRESHOLD))
     358  			mpn_toom42_mul (prodp, up, un, vp, vn, scratch);
     359  		      else
     360  			mpn_toom53_mul (prodp, up, un, vp, vn, scratch);
     361  		    }
     362  		}
     363  	      else
     364  		{
     365  		  if (BELOW_THRESHOLD (vn, MUL_TOOM42_TO_TOOM63_THRESHOLD))
     366  		    mpn_toom42_mul (prodp, up, un, vp, vn, scratch);
     367  		  else
     368  		    mpn_toom63_mul (prodp, up, un, vp, vn, scratch);
     369  		}
     370  	    }
     371  	  TMP_FREE;
     372  	}
     373        else
     374  	{
     375  	  mp_ptr scratch;
     376  	  TMP_DECL; TMP_MARK;
     377  
     378  	  if (BELOW_THRESHOLD (vn, MUL_TOOM6H_THRESHOLD))
     379  	    {
     380  	      scratch = TMP_SALLOC_LIMBS (mpn_toom44_mul_itch (un, vn));
     381  	      mpn_toom44_mul (prodp, up, un, vp, vn, scratch);
     382  	    }
     383  	  else if (BELOW_THRESHOLD (vn, MUL_TOOM8H_THRESHOLD))
     384  	    {
     385  	      scratch = TMP_SALLOC_LIMBS (mpn_toom6h_mul_itch (un, vn));
     386  	      mpn_toom6h_mul (prodp, up, un, vp, vn, scratch);
     387  	    }
     388  	  else
     389  	    {
     390  	      scratch = TMP_ALLOC_LIMBS (mpn_toom8h_mul_itch (un, vn));
     391  	      mpn_toom8h_mul (prodp, up, un, vp, vn, scratch);
     392  	    }
     393  	  TMP_FREE;
     394  	}
     395      }
     396    else
     397      {
     398        if (un >= 8 * vn)
     399  	{
     400  	  mp_limb_t cy;
     401  	  mp_ptr ws;
     402  	  TMP_DECL; TMP_MARK;
     403  
     404  	  /* The maximum ws usage is for the mpn_mul result.  */
     405  	  ws = TMP_BALLOC_LIMBS (9 * vn >> 1);
     406  
     407  	  mpn_fft_mul (prodp, up, 3 * vn, vp, vn);
     408  	  un -= 3 * vn;
     409  	  up += 3 * vn;
     410  	  prodp += 3 * vn;
     411  
     412  	  while (2 * un >= 7 * vn)	/* un >= 3.5vn  */
     413  	    {
     414  	      mpn_fft_mul (ws, up, 3 * vn, vp, vn);
     415  	      un -= 3 * vn;
     416  	      up += 3 * vn;
     417  	      cy = mpn_add_n (prodp, prodp, ws, vn);
     418  	      MPN_COPY (prodp + vn, ws + vn, 3 * vn);
     419  	      mpn_incr_u (prodp + vn, cy);
     420  	      prodp += 3 * vn;
     421  	    }
     422  
     423  	  /* vn / 2 <= un < 3.5vn */
     424  
     425  	  if (un < vn)
     426  	    mpn_mul (ws, vp, vn, up, un);
     427  	  else
     428  	    mpn_mul (ws, up, un, vp, vn);
     429  
     430  	  cy = mpn_add_n (prodp, prodp, ws, vn);
     431  	  MPN_COPY (prodp + vn, ws + vn, un);
     432  	  mpn_incr_u (prodp + vn, cy);
     433  
     434  	  TMP_FREE;
     435  	}
     436        else
     437  	mpn_fft_mul (prodp, up, un, vp, vn);
     438      }
     439  
     440    return prodp[un + vn - 1];	/* historic */
     441  }