/* mpn_toom33_mul -- Multiply {ap,an} and {bp,bn} where an and bn are close in
   size.  Or more accurately, bn <= an < (3/2)bn.

   Contributed to the GNU project by Torbjorn Granlund.
   Additional improvements by Marco Bodrato.

   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.

Copyright 2006-2008, 2010, 2012, 2015, 2021 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */


#include "gmp-impl.h"
/* Evaluate in: -1, 0, +1, +2, +inf

  <-s--><--n--><--n-->
   ____ ______ ______
  |_a2_|___a1_|___a0_|
   |b2_|___b1_|___b0_|
   <-t-><--n--><--n-->

  v0  =  a0         * b0          #   A(0)*B(0)
  v1  = (a0+ a1+ a2)*(b0+ b1+ b2) #   A(1)*B(1)      ah  <= 2  bh <= 2
  vm1 = (a0- a1+ a2)*(b0- b1+ b2) #  A(-1)*B(-1)    |ah| <= 1 |bh| <= 1
  v2  = (a0+2a1+4a2)*(b0+2b1+4b2) #   A(2)*B(2)      ah  <= 6  bh <= 6
  vinf=          a2 *         b2  # A(inf)*B(inf)
*/
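
/* Interpolation sketch (informal; the actual operation sequence lives in
   mpn_toom_interpolate_5pts): with x = 2^(GMP_NUMB_BITS*n) the product
   C(x) = A(x)*B(x) = c4*x^4 + c3*x^3 + c2*x^2 + c1*x + c0 has degree 4,
   so the five values above determine it:

     c0 = v0
     c4 = vinf
     c2 = (v1 + vm1)/2 - c0 - c4
     c1 + c3 = (v1 - vm1)/2
     c3 = ((v2 - c0 - 4*c2 - 16*c4)/2 - (c1 + c3)) / 3
     c1 = (c1 + c3) - c3

   All divisions are exact; vm1 enters with the sign vm1_neg.  */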

#if TUNE_PROGRAM_BUILD || WANT_FAT_BINARY
#define MAYBE_mul_basecase 1
#define MAYBE_mul_toom33 1
#else
#define MAYBE_mul_basecase                                      \
  (MUL_TOOM33_THRESHOLD < 3 * MUL_TOOM22_THRESHOLD)
#define MAYBE_mul_toom33                                        \
  (MUL_TOOM44_THRESHOLD >= 3 * MUL_TOOM33_THRESHOLD)
#endif

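/* Informally: the recursive products below act on n = ceil(an/3) limbs (n+1
   at the +2 point), so when an is at least MUL_TOOM33_THRESHOLD, n can only
   drop below MUL_TOOM22_THRESHOLD if MUL_TOOM33_THRESHOLD <
   3 * MUL_TOOM22_THRESHOLD; otherwise the basecase branch is dead code.
   Likewise, n stays below MUL_TOOM33_THRESHOLD unless
   MUL_TOOM44_THRESHOLD >= 3 * MUL_TOOM33_THRESHOLD.  */
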
/* FIXME: TOOM33_MUL_N_REC is not quite right for a balanced
   multiplication at the infinity point.  We may have
   MAYBE_mul_basecase == 0, and still get s just below
   MUL_TOOM22_THRESHOLD.  If MUL_TOOM33_THRESHOLD == 7, we can even get
   s == 1 and mpn_toom22_mul will crash.
*/

#define TOOM33_MUL_N_REC(p, a, b, n, ws)                        \
  do {                                                          \
    if (MAYBE_mul_basecase                                      \
        && BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD))           \
      mpn_mul_basecase (p, a, n, b, n);                         \
    else if (! MAYBE_mul_toom33                                 \
             || BELOW_THRESHOLD (n, MUL_TOOM33_THRESHOLD))      \
      mpn_toom22_mul (p, a, n, b, n, ws);                       \
    else                                                        \
      mpn_toom33_mul (p, a, n, b, n, ws);                       \
  } while (0)

void
mpn_toom33_mul (mp_ptr pp,
                mp_srcptr ap, mp_size_t an,
                mp_srcptr bp, mp_size_t bn,
                mp_ptr scratch)
{
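  /* In fat builds, this shadows the global __gmpn_cpuvec_initialized flag,
     presumably letting the compiler drop the dispatch-initialization checks
     in the recursive mpn calls below.  */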
  const int __gmpn_cpuvec_initialized = 1;
  mp_size_t n, s, t;
  int vm1_neg;
  mp_limb_t cy, vinf0;
  mp_ptr gp;
  mp_ptr as1, asm1, as2;
  mp_ptr bs1, bsm1, bs2;

#define a0 ap
#define a1 (ap + n)
#define a2 (ap + 2*n)
#define b0 bp
#define b1 (bp + n)
#define b2 (bp + 2*n)

  n = (an + 2) / (size_t) 3;

  s = an - 2 * n;
  t = bn - 2 * n;

  ASSERT (an >= bn);

  ASSERT (0 < s && s <= n);
  ASSERT (0 < t && t <= n);
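
  /* Example: an = 10, bn = 9 gives n = 4, s = 2, t = 1; the operands split
     as 4+4+2 and 4+4+1 limbs.  */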

  as1 = scratch + 4 * n + 4;
  asm1 = scratch + 2 * n + 2;
  as2 = pp + n + 1;

  bs1 = pp;
  bsm1 = scratch + 3 * n + 3; /* we need 4n+4 <= 4n+s+t */
  bs2 = pp + 2 * n + 2;

  gp = scratch;
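
  /* Evaluation buffers, each n+1 limbs: gp, asm1, bsm1 and as1 live in
     scratch, while bs1, as2 and bs2 borrow space in pp.  The order of the
     evaluations and recursive products ensures each buffer is consumed
     before its space is overwritten.  */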

  vm1_neg = 0;

  /* Compute as1 = A(1) = a0 + a1 + a2 and asm1 = |A(-1)| = |a0 - a1 + a2|;
     the sign of A(-1) is recorded in vm1_neg.  */
  cy = mpn_add (gp, a0, n, a2, s);
#if HAVE_NATIVE_mpn_add_n_sub_n
  if (cy == 0 && mpn_cmp (gp, a1, n) < 0)
    {
      cy = mpn_add_n_sub_n (as1, asm1, a1, gp, n);
      as1[n] = cy >> 1;
      asm1[n] = 0;
      vm1_neg = 1;
    }
  else
    {
      mp_limb_t cy2;
      cy2 = mpn_add_n_sub_n (as1, asm1, gp, a1, n);
      as1[n] = cy + (cy2 >> 1);
      asm1[n] = cy - (cy2 & 1);
    }
#else
  as1[n] = cy + mpn_add_n (as1, gp, a1, n);
  if (cy == 0 && mpn_cmp (gp, a1, n) < 0)
    {
      mpn_sub_n (asm1, a1, gp, n);
      asm1[n] = 0;
      vm1_neg = 1;
    }
  else
    {
      cy -= mpn_sub_n (asm1, gp, a1, n);
      asm1[n] = cy;
    }
#endif

  /* Compute as2 = A(2) = a0 + 2*a1 + 4*a2, either as 2*(A(1) + a2) - a0 or
     by Horner's rule as 2*(2*a2 + a1) + a0, depending on the available
     primitives.  */
#if HAVE_NATIVE_mpn_rsblsh1_n
  cy = mpn_add_n (as2, a2, as1, s);
  if (s != n)
    cy = mpn_add_1 (as2 + s, as1 + s, n - s, cy);
  cy += as1[n];
  cy = 2 * cy + mpn_rsblsh1_n (as2, a0, as2, n);
#else
#if HAVE_NATIVE_mpn_addlsh1_n
  cy = mpn_addlsh1_n (as2, a1, a2, s);
  if (s != n)
    cy = mpn_add_1 (as2 + s, a1 + s, n - s, cy);
  cy = 2 * cy + mpn_addlsh1_n (as2, a0, as2, n);
#else
  cy = mpn_add_n (as2, a2, as1, s);
  if (s != n)
    cy = mpn_add_1 (as2 + s, as1 + s, n - s, cy);
  cy += as1[n];
  cy = 2 * cy + mpn_lshift (as2, as2, n, 1);
  cy -= mpn_sub_n (as2, as2, a0, n);
#endif
#endif
  as2[n] = cy;

  /* Compute bs1 = B(1) and bsm1 = |B(-1)| in the same way, folding the sign
     of B(-1) into vm1_neg.  */
  cy = mpn_add (gp, b0, n, b2, t);
#if HAVE_NATIVE_mpn_add_n_sub_n
  if (cy == 0 && mpn_cmp (gp, b1, n) < 0)
    {
      cy = mpn_add_n_sub_n (bs1, bsm1, b1, gp, n);
      bs1[n] = cy >> 1;
      bsm1[n] = 0;
      vm1_neg ^= 1;
    }
  else
    {
      mp_limb_t cy2;
      cy2 = mpn_add_n_sub_n (bs1, bsm1, gp, b1, n);
      bs1[n] = cy + (cy2 >> 1);
      bsm1[n] = cy - (cy2 & 1);
    }
#else
  bs1[n] = cy + mpn_add_n (bs1, gp, b1, n);
  if (cy == 0 && mpn_cmp (gp, b1, n) < 0)
    {
      mpn_sub_n (bsm1, b1, gp, n);
      bsm1[n] = 0;
      vm1_neg ^= 1;
    }
  else
    {
      cy -= mpn_sub_n (bsm1, gp, b1, n);
      bsm1[n] = cy;
    }
#endif

  /* Compute bs2 = B(2) = b0 + 2*b1 + 4*b2, mirroring the as2 computation.  */
#if HAVE_NATIVE_mpn_rsblsh1_n
  cy = mpn_add_n (bs2, b2, bs1, t);
  if (t != n)
    cy = mpn_add_1 (bs2 + t, bs1 + t, n - t, cy);
  cy += bs1[n];
  cy = 2 * cy + mpn_rsblsh1_n (bs2, b0, bs2, n);
#else
#if HAVE_NATIVE_mpn_addlsh1_n
  cy = mpn_addlsh1_n (bs2, b1, b2, t);
  if (t != n)
    cy = mpn_add_1 (bs2 + t, b1 + t, n - t, cy);
  cy = 2 * cy + mpn_addlsh1_n (bs2, b0, bs2, n);
#else
  cy = mpn_add_n (bs2, bs1, b2, t);
  if (t != n)
    cy = mpn_add_1 (bs2 + t, bs1 + t, n - t, cy);
  cy += bs1[n];
  cy = 2 * cy + mpn_lshift (bs2, bs2, n, 1);
  cy -= mpn_sub_n (bs2, bs2, b0, n);
#endif
#endif
  bs2[n] = cy;

  ASSERT (as1[n] <= 2);
  ASSERT (bs1[n] <= 2);
  ASSERT (asm1[n] <= 1);
  ASSERT (bsm1[n] <= 1);
  ASSERT (as2[n] <= 6);
  ASSERT (bs2[n] <= 6);

#define v0    pp                      /* 2n */
#define v1    (pp + 2 * n)            /* 2n+1 */
#define vinf  (pp + 4 * n)            /* s+t */
#define vm1   scratch                 /* 2n+1 */
#define v2    (scratch + 2 * n + 1)   /* 2n+2 */
#define scratch_out  (scratch + 5 * n + 5)

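  /* Two strategies for the pointwise products: with SMALLER_RECURSION the
     recursive multiplications cover only n limbs and the contributions of
     the top limbs (at most 1 for asm1/bsm1, at most 2 for as1/bs1) are
     added back by hand; otherwise the recursion simply runs on n+1 limbs.
     For vm1, n + (bsm1[n] | asm1[n]) picks n or n+1, since those top limbs
     are 0 or 1.  */
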
  /* vm1, 2n+1 limbs */
#ifdef SMALLER_RECURSION
  TOOM33_MUL_N_REC (vm1, asm1, bsm1, n, scratch_out);
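  /* Fix up the top-limb cross terms: writing asm1 = u + uh*x and
     bsm1 = v + vh*x with x = 2^(GMP_NUMB_BITS*n) and uh, vh in {0,1},
     asm1 * bsm1 = u*v + (uh*v + vh*u)*x + uh*vh*x^2.  The v1 fix-up
     below applies the same identity with top limbs up to 2.  */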
  cy = 0;
  if (asm1[n] != 0)
    cy = bsm1[n] + mpn_add_n (vm1 + n, vm1 + n, bsm1, n);
  if (bsm1[n] != 0)
    cy += mpn_add_n (vm1 + n, vm1 + n, asm1, n);
  vm1[2 * n] = cy;
#else
  vm1[2 * n] = 0;
  TOOM33_MUL_N_REC (vm1, asm1, bsm1, n + (bsm1[n] | asm1[n]), scratch_out);
#endif

  TOOM33_MUL_N_REC (v2, as2, bs2, n + 1, scratch_out);  /* v2, 2n+1 limbs */

  /* vinf, s+t limbs */
  if (s > t)  mpn_mul (vinf, a2, s, b2, t);
  else        TOOM33_MUL_N_REC (vinf, a2, b2, s, scratch_out);

  vinf0 = vinf[0];                      /* v1 (2n+1 limbs at pp + 2n) overlaps with this */

#ifdef SMALLER_RECURSION
  /* v1, 2n+1 limbs */
  TOOM33_MUL_N_REC (v1, as1, bs1, n, scratch_out);
  if (as1[n] == 1)
    {
      cy = bs1[n] + mpn_add_n (v1 + n, v1 + n, bs1, n);
    }
  else if (as1[n] != 0)
    {
#if HAVE_NATIVE_mpn_addlsh1_n_ip1
      cy = 2 * bs1[n] + mpn_addlsh1_n_ip1 (v1 + n, bs1, n);
#else
      cy = 2 * bs1[n] + mpn_addmul_1 (v1 + n, bs1, n, CNST_LIMB(2));
#endif
    }
  else
    cy = 0;
  if (bs1[n] == 1)
    {
      cy += mpn_add_n (v1 + n, v1 + n, as1, n);
    }
  else if (bs1[n] != 0)
    {
#if HAVE_NATIVE_mpn_addlsh1_n_ip1
      cy += mpn_addlsh1_n_ip1 (v1 + n, as1, n);
#else
      cy += mpn_addmul_1 (v1 + n, as1, n, CNST_LIMB(2));
#endif
    }
  v1[2 * n] = cy;
#else
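  /* The (n+1)-limb recursion writes 2n+2 limbs starting at pp + 2n, which
     would clobber vinf[1]; save it and restore it afterwards.  */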
  cy = vinf[1];
  TOOM33_MUL_N_REC (v1, as1, bs1, n + 1, scratch_out);
  vinf[1] = cy;
#endif

  TOOM33_MUL_N_REC (v0, ap, bp, n, scratch_out);        /* v0, 2n limbs */

  mpn_toom_interpolate_5pts (pp, v2, vm1, n, s + t, vm1_neg, vinf0);
}