1  /* mpn_toom_interpolate_7pts -- Interpolate for toom44, 53, 62.
       2  
       3     Contributed to the GNU project by Niels Möller.
       4     Improvements by Marco Bodrato.
       5  
       6     THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
       7     SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
       8     GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
       9  
      10  Copyright 2006, 2007, 2009, 2014, 2015 Free Software Foundation, Inc.
      11  
      12  This file is part of the GNU MP Library.
      13  
      14  The GNU MP Library is free software; you can redistribute it and/or modify
      15  it under the terms of either:
      16  
      17    * the GNU Lesser General Public License as published by the Free
      18      Software Foundation; either version 3 of the License, or (at your
      19      option) any later version.
      20  
      21  or
      22  
      23    * the GNU General Public License as published by the Free Software
      24      Foundation; either version 2 of the License, or (at your option) any
      25      later version.
      26  
      27  or both in parallel, as here.
      28  
      29  The GNU MP Library is distributed in the hope that it will be useful, but
      30  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
      31  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
      32  for more details.
      33  
      34  You should have received copies of the GNU General Public License and the
      35  GNU Lesser General Public License along with the GNU MP Library.  If not,
      36  see https://www.gnu.org/licenses/.  */
      37  
      38  #include "gmp-impl.h"
      39  
      40  #define BINVERT_3 MODLIMB_INVERSE_3
      41  
      42  #define BINVERT_9 \
      43    ((((GMP_NUMB_MAX / 9) << (6 - GMP_NUMB_BITS % 6)) * 8 & GMP_NUMB_MAX) | 0x39)
      44  
      45  #define BINVERT_15 \
      46    ((((GMP_NUMB_MAX >> (GMP_NUMB_BITS % 4)) / 15) * 14 * 16 & GMP_NUMB_MAX) + 15)
      47  
      48  /* For the various mpn_divexact_byN here, fall back to using either
      49     mpn_pi1_bdiv_q_1 or mpn_divexact_1.  The former has less overhead and is
      50     many faster if it is native.  For now, since mpn_divexact_1 is native on
      51     several platforms where mpn_pi1_bdiv_q_1 does not yet exist, do not use
      52     mpn_pi1_bdiv_q_1 unconditionally.  FIXME.  */
      53  
      54  /* For odd divisors, mpn_divexact_1 works fine with two's complement. */
      55  #ifndef mpn_divexact_by3
      56  #if HAVE_NATIVE_mpn_pi1_bdiv_q_1
      57  #define mpn_divexact_by3(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,3,BINVERT_3,0)
      58  #else
      59  #define mpn_divexact_by3(dst,src,size) mpn_divexact_1(dst,src,size,3)
      60  #endif
      61  #endif
      62  
      63  #ifndef mpn_divexact_by9
      64  #if HAVE_NATIVE_mpn_pi1_bdiv_q_1
      65  #define mpn_divexact_by9(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,9,BINVERT_9,0)
      66  #else
      67  #define mpn_divexact_by9(dst,src,size) mpn_divexact_1(dst,src,size,9)
      68  #endif
      69  #endif
      70  
      71  #ifndef mpn_divexact_by15
      72  #if HAVE_NATIVE_mpn_pi1_bdiv_q_1
      73  #define mpn_divexact_by15(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,15,BINVERT_15,0)
      74  #else
      75  #define mpn_divexact_by15(dst,src,size) mpn_divexact_1(dst,src,size,15)
      76  #endif
      77  #endif
      78  
      79  /* Interpolation for toom4, using the evaluation points 0, infinity,
      80     1, -1, 2, -2, 1/2. More precisely, we want to compute
      81     f(2^(GMP_NUMB_BITS * n)) for a polynomial f of degree 6, given the
      82     seven values
      83  
      84       w0 = f(0),
      85       w1 = f(-2),
      86       w2 = f(1),
      87       w3 = f(-1),
      88       w4 = f(2)
      89       w5 = 64 * f(1/2)
      90       w6 = limit at infinity of f(x) / x^6,
      91  
      92     The result is 6*n + w6n limbs. At entry, w0 is stored at {rp, 2n },
      93     w2 is stored at { rp + 2n, 2n+1 }, and w6 is stored at { rp + 6n,
      94     w6n }. The other values are 2n + 1 limbs each (with most
      95     significant limbs small). f(-1) and f(-1/2) may be negative, signs
      96     determined by the flag bits. Inputs are destroyed.
      97  
      98     Needs (2*n + 1) limbs of temporary storage.
      99  */
     100  
     101  void
     102  mpn_toom_interpolate_7pts (mp_ptr rp, mp_size_t n, enum toom7_flags flags,
     103  			   mp_ptr w1, mp_ptr w3, mp_ptr w4, mp_ptr w5,
     104  			   mp_size_t w6n, mp_ptr tp)
     105  {
     106    mp_size_t m;
     107    mp_limb_t cy;
     108  
     109    m = 2*n + 1;
     110  #define w0 rp
     111  #define w2 (rp + 2*n)
     112  #define w6 (rp + 6*n)
     113  
     114    ASSERT (w6n > 0);
     115    ASSERT (w6n <= 2*n);
     116  
     117    /* Using formulas similar to Marco Bodrato's
     118  
     119       W5 = W5 + W4
     120       W1 =(W4 - W1)/2
     121       W4 = W4 - W0
     122       W4 =(W4 - W1)/4 - W6*16
     123       W3 =(W2 - W3)/2
     124       W2 = W2 - W3
     125  
     126       W5 = W5 - W2*65      May be negative.
     127       W2 = W2 - W6 - W0
     128       W5 =(W5 + W2*45)/2   Now >= 0 again.
     129       W4 =(W4 - W2)/3
     130       W2 = W2 - W4
     131  
     132       W1 = W5 - W1         May be negative.
     133       W5 =(W5 - W3*8)/9
     134       W3 = W3 - W5
     135       W1 =(W1/15 + W5)/2   Now >= 0 again.
     136       W5 = W5 - W1
     137  
     138       where W0 = f(0), W1 = f(-2), W2 = f(1), W3 = f(-1),
     139  	   W4 = f(2), W5 = f(1/2), W6 = f(oo),
     140  
     141       Note that most intermediate results are positive; the ones that
     142       may be negative are represented in two's complement. We must
     143       never shift right a value that may be negative, since that would
     144       invalidate the sign bit. On the other hand, divexact by odd
     145       numbers work fine with two's complement.
     146    */
     147  
     148    mpn_add_n (w5, w5, w4, m);
     149    if (flags & toom7_w1_neg)
     150      {
     151  #ifdef HAVE_NATIVE_mpn_rsh1add_n
     152        mpn_rsh1add_n (w1, w1, w4, m);
     153  #else
     154        mpn_add_n (w1, w1, w4, m);  ASSERT (!(w1[0] & 1));
     155        mpn_rshift (w1, w1, m, 1);
     156  #endif
     157      }
     158    else
     159      {
     160  #ifdef HAVE_NATIVE_mpn_rsh1sub_n
     161        mpn_rsh1sub_n (w1, w4, w1, m);
     162  #else
     163        mpn_sub_n (w1, w4, w1, m);  ASSERT (!(w1[0] & 1));
     164        mpn_rshift (w1, w1, m, 1);
     165  #endif
     166      }
     167    mpn_sub (w4, w4, m, w0, 2*n);
     168    mpn_sub_n (w4, w4, w1, m);  ASSERT (!(w4[0] & 3));
     169    mpn_rshift (w4, w4, m, 2); /* w4>=0 */
     170  
     171    tp[w6n] = mpn_lshift (tp, w6, w6n, 4);
     172    mpn_sub (w4, w4, m, tp, w6n+1);
     173  
     174    if (flags & toom7_w3_neg)
     175      {
     176  #ifdef HAVE_NATIVE_mpn_rsh1add_n
     177        mpn_rsh1add_n (w3, w3, w2, m);
     178  #else
     179        mpn_add_n (w3, w3, w2, m);  ASSERT (!(w3[0] & 1));
     180        mpn_rshift (w3, w3, m, 1);
     181  #endif
     182      }
     183    else
     184      {
     185  #ifdef HAVE_NATIVE_mpn_rsh1sub_n
     186        mpn_rsh1sub_n (w3, w2, w3, m);
     187  #else
     188        mpn_sub_n (w3, w2, w3, m);  ASSERT (!(w3[0] & 1));
     189        mpn_rshift (w3, w3, m, 1);
     190  #endif
     191      }
     192  
     193    mpn_sub_n (w2, w2, w3, m);
     194  
     195    mpn_submul_1 (w5, w2, m, 65);
     196    mpn_sub (w2, w2, m, w6, w6n);
     197    mpn_sub (w2, w2, m, w0, 2*n);
     198  
     199    mpn_addmul_1 (w5, w2, m, 45);  ASSERT (!(w5[0] & 1));
     200    mpn_rshift (w5, w5, m, 1);
     201    mpn_sub_n (w4, w4, w2, m);
     202  
     203    mpn_divexact_by3 (w4, w4, m);
     204    mpn_sub_n (w2, w2, w4, m);
     205  
     206    mpn_sub_n (w1, w5, w1, m);
     207    mpn_lshift (tp, w3, m, 3);
     208    mpn_sub_n (w5, w5, tp, m);
     209    mpn_divexact_by9 (w5, w5, m);
     210    mpn_sub_n (w3, w3, w5, m);
     211  
     212    mpn_divexact_by15 (w1, w1, m);
     213  #ifdef HAVE_NATIVE_mpn_rsh1add_n
     214    mpn_rsh1add_n (w1, w1, w5, m);
     215    w1[m - 1] &= GMP_NUMB_MASK >> 1;
     216  #else
     217    mpn_add_n (w1, w1, w5, m);  ASSERT (!(w1[0] & 1));
     218    mpn_rshift (w1, w1, m, 1); /* w1>=0 now */
     219  #endif
     220  
     221    mpn_sub_n (w5, w5, w1, m);
     222  
     223    /* These bounds are valid for the 4x4 polynomial product of toom44,
     224     * and they are conservative for toom53 and toom62. */
     225    ASSERT (w1[2*n] < 2);
     226    ASSERT (w2[2*n] < 3);
     227    ASSERT (w3[2*n] < 4);
     228    ASSERT (w4[2*n] < 3);
     229    ASSERT (w5[2*n] < 2);
     230  
     231    /* Addition chain. Note carries and the 2n'th limbs that need to be
     232     * added in.
     233     *
     234     * Special care is needed for w2[2n] and the corresponding carry,
     235     * since the "simple" way of adding it all together would overwrite
     236     * the limb at wp[2*n] and rp[4*n] (same location) with the sum of
     237     * the high half of w3 and the low half of w4.
     238     *
     239     *         7    6    5    4    3    2    1    0
     240     *    |    |    |    |    |    |    |    |    |
     241     *                  ||w3 (2n+1)|
     242     *             ||w4 (2n+1)|
     243     *        ||w5 (2n+1)|        ||w1 (2n+1)|
     244     *  + | w6 (w6n)|        ||w2 (2n+1)| w0 (2n) |  (share storage with r)
     245     *  -----------------------------------------------
     246     *  r |    |    |    |    |    |    |    |    |
     247     *        c7   c6   c5   c4   c3                 Carries to propagate
     248     */
     249  
     250    cy = mpn_add_n (rp + n, rp + n, w1, m);
     251    MPN_INCR_U (w2 + n + 1, n , cy);
     252    cy = mpn_add_n (rp + 3*n, rp + 3*n, w3, n);
     253    MPN_INCR_U (w3 + n, n + 1, w2[2*n] + cy);
     254    cy = mpn_add_n (rp + 4*n, w3 + n, w4, n);
     255    MPN_INCR_U (w4 + n, n + 1, w3[2*n] + cy);
     256    cy = mpn_add_n (rp + 5*n, w4 + n, w5, n);
     257    MPN_INCR_U (w5 + n, n + 1, w4[2*n] + cy);
     258    if (w6n > n + 1)
     259      {
     260        cy = mpn_add_n (rp + 6*n, rp + 6*n, w5 + n, n + 1);
     261        MPN_INCR_U (rp + 7*n + 1, w6n - n - 1, cy);
     262      }
     263    else
     264      {
     265        ASSERT_NOCARRY (mpn_add_n (rp + 6*n, rp + 6*n, w5 + n, w6n));
     266  #if WANT_ASSERT
     267        {
     268  	mp_size_t i;
     269  	for (i = w6n; i <= n; i++)
     270  	  ASSERT (w5[n + i] == 0);
     271        }
     272  #endif
     273      }
     274  }