1  /* mpn_toom_interpolate_6pts -- Interpolate for toom43, 52
       2  
       3     Contributed to the GNU project by Marco Bodrato.
       4  
       5     THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
       6     SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
       7     GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
       8  
       9  Copyright 2009, 2010, 2012 Free Software Foundation, Inc.
      10  
      11  This file is part of the GNU MP Library.
      12  
      13  The GNU MP Library is free software; you can redistribute it and/or modify
      14  it under the terms of either:
      15  
      16    * the GNU Lesser General Public License as published by the Free
      17      Software Foundation; either version 3 of the License, or (at your
      18      option) any later version.
      19  
      20  or
      21  
      22    * the GNU General Public License as published by the Free Software
      23      Foundation; either version 2 of the License, or (at your option) any
      24      later version.
      25  
      26  or both in parallel, as here.
      27  
      28  The GNU MP Library is distributed in the hope that it will be useful, but
      29  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
      30  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
      31  for more details.
      32  
      33  You should have received copies of the GNU General Public License and the
      34  GNU Lesser General Public License along with the GNU MP Library.  If not,
      35  see https://www.gnu.org/licenses/.  */
      36  
      37  #include "gmp-impl.h"
      38  
      39  #define BINVERT_3 MODLIMB_INVERSE_3
      40  
      41  /* For odd divisors, mpn_divexact_1 works fine with two's complement. */
      42  #ifndef mpn_divexact_by3
      43  #if HAVE_NATIVE_mpn_pi1_bdiv_q_1
      44  #define mpn_divexact_by3(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,3,BINVERT_3,0)
      45  #else
      46  #define mpn_divexact_by3(dst,src,size) mpn_divexact_1(dst,src,size,3)
      47  #endif
      48  #endif
      49  
      50  /* Interpolation for Toom-3.5, using the evaluation points: infinity,
      51     1, -1, 2, -2. More precisely, we want to compute
      52     f(2^(GMP_NUMB_BITS * n)) for a polynomial f of degree 5, given the
      53     six values
      54  
      55       w5 = f(0),
      56       w4 = f(-1),
      57       w3 = f(1)
      58       w2 = f(-2),
      59       w1 = f(2),
      60       w0 = limit at infinity of f(x) / x^5,
      61  
      62     The result is stored in {pp, 5*n + w0n}. At entry, w5 is stored at
      63     {pp, 2n}, w3 is stored at {pp + 2n, 2n+1}, and w0 is stored at
      64     {pp + 5n, w0n}. The other values are 2n + 1 limbs each (with most
      65     significant limbs small). f(-1) and f(-2) may be negative, signs
      66     determined by the flag bits. All intermediate results are positive.
      67     Inputs are destroyed.
      68  
      69     Interpolation sequence was taken from the paper: "Integer and
      70     Polynomial Multiplication: Towards Optimal Toom-Cook Matrices".
      71     Some slight variations were introduced: adaptation to "gmp
      72     instruction set", and a final saving of an operation by interlacing
      73     interpolation and recomposition phases.
      74  */
      75  
      76  void
      77  mpn_toom_interpolate_6pts (mp_ptr pp, mp_size_t n, enum toom6_flags flags,
      78  			   mp_ptr w4, mp_ptr w2, mp_ptr w1,
      79  			   mp_size_t w0n)
      80  {
      81    mp_limb_t cy;
      82    /* cy6 can be stored in w1[2*n], cy4 in w4[0], embankment in w2[0] */
      83    mp_limb_t cy4, cy6, embankment;
      84  
      85    ASSERT( n > 0 );
      86    ASSERT( 2*n >= w0n && w0n > 0 );
      87  
      88  #define w5  pp					/* 2n   */
      89  #define w3  (pp + 2 * n)			/* 2n+1 */
      90  #define w0  (pp + 5 * n)			/* w0n  */
      91  
      92    /* Interpolate with sequence:
      93       W2 =(W1 - W2)>>2
      94       W1 =(W1 - W5)>>1
      95       W1 =(W1 - W2)>>1
      96       W4 =(W3 - W4)>>1
      97       W2 =(W2 - W4)/3
      98       W3 = W3 - W4 - W5
      99       W1 =(W1 - W3)/3
     100       // Last steps are mixed with recomposition...
     101       W2 = W2 - W0<<2
     102       W4 = W4 - W2
     103       W3 = W3 - W1
     104       W2 = W2 - W0
     105    */
     106  
     107    /* W2 =(W1 - W2)>>2 */
     108    if (flags & toom6_vm2_neg)
     109      mpn_add_n (w2, w1, w2, 2 * n + 1);
     110    else
     111      mpn_sub_n (w2, w1, w2, 2 * n + 1);
     112    mpn_rshift (w2, w2, 2 * n + 1, 2);
     113  
     114    /* W1 =(W1 - W5)>>1 */
     115    w1[2*n] -= mpn_sub_n (w1, w1, w5, 2*n);
     116    mpn_rshift (w1, w1, 2 * n + 1, 1);
     117  
     118    /* W1 =(W1 - W2)>>1 */
     119  #if HAVE_NATIVE_mpn_rsh1sub_n
     120    mpn_rsh1sub_n (w1, w1, w2, 2 * n + 1);
     121  #else
     122    mpn_sub_n (w1, w1, w2, 2 * n + 1);
     123    mpn_rshift (w1, w1, 2 * n + 1, 1);
     124  #endif
     125  
     126    /* W4 =(W3 - W4)>>1 */
     127    if (flags & toom6_vm1_neg)
     128      {
     129  #if HAVE_NATIVE_mpn_rsh1add_n
     130        mpn_rsh1add_n (w4, w3, w4, 2 * n + 1);
     131  #else
     132        mpn_add_n (w4, w3, w4, 2 * n + 1);
     133        mpn_rshift (w4, w4, 2 * n + 1, 1);
     134  #endif
     135      }
     136    else
     137      {
     138  #if HAVE_NATIVE_mpn_rsh1sub_n
     139        mpn_rsh1sub_n (w4, w3, w4, 2 * n + 1);
     140  #else
     141        mpn_sub_n (w4, w3, w4, 2 * n + 1);
     142        mpn_rshift (w4, w4, 2 * n + 1, 1);
     143  #endif
     144      }
     145  
     146    /* W2 =(W2 - W4)/3 */
     147    mpn_sub_n (w2, w2, w4, 2 * n + 1);
     148    mpn_divexact_by3 (w2, w2, 2 * n + 1);
     149  
     150    /* W3 = W3 - W4 - W5 */
     151    mpn_sub_n (w3, w3, w4, 2 * n + 1);
     152    w3[2 * n] -= mpn_sub_n (w3, w3, w5, 2 * n);
     153  
     154    /* W1 =(W1 - W3)/3 */
     155    mpn_sub_n (w1, w1, w3, 2 * n + 1);
     156    mpn_divexact_by3 (w1, w1, 2 * n + 1);
     157  
     158    /*
     159      [1 0 0 0 0 0;
     160       0 1 0 0 0 0;
     161       1 0 1 0 0 0;
     162       0 1 0 1 0 0;
     163       1 0 1 0 1 0;
     164       0 0 0 0 0 1]
     165  
     166      pp[] prior to operations:
     167       |_H w0__|_L w0__|______||_H w3__|_L w3__|_H w5__|_L w5__|
     168  
     169      summation scheme for remaining operations:
     170       |______________5|n_____4|n_____3|n_____2|n______|n______|pp
     171       |_H w0__|_L w0__|______||_H w3__|_L w3__|_H w5__|_L w5__|
     172  				    || H w4  | L w4  |
     173  		    || H w2  | L w2  |
     174  	    || H w1  | L w1  |
     175  			    ||-H w1  |-L w1  |
     176  		     |-H w0  |-L w0 ||-H w2  |-L w2  |
     177    */
     178    cy = mpn_add_n (pp + n, pp + n, w4, 2 * n + 1);
     179    MPN_INCR_U (pp + 3 * n + 1, n, cy);
     180  
     181    /* W2 -= W0<<2 */
     182  #if HAVE_NATIVE_mpn_sublsh_n || HAVE_NATIVE_mpn_sublsh2_n_ip1
     183  #if HAVE_NATIVE_mpn_sublsh2_n_ip1
     184    cy = mpn_sublsh2_n_ip1 (w2, w0, w0n);
     185  #else
     186    cy = mpn_sublsh_n (w2, w2, w0, w0n, 2);
     187  #endif
     188  #else
     189    /* {W4,2*n+1} is now free and can be overwritten. */
     190    cy = mpn_lshift(w4, w0, w0n, 2);
     191    cy+= mpn_sub_n(w2, w2, w4, w0n);
     192  #endif
     193    MPN_DECR_U (w2 + w0n, 2 * n + 1 - w0n, cy);
     194  
     195    /* W4L = W4L - W2L */
     196    cy = mpn_sub_n (pp + n, pp + n, w2, n);
     197    MPN_DECR_U (w3, 2 * n + 1, cy);
     198  
     199    /* W3H = W3H + W2L */
     200    cy4 = w3[2 * n] + mpn_add_n (pp + 3 * n, pp + 3 * n, w2, n);
     201    /* W1L + W2H */
     202    cy = w2[2 * n] + mpn_add_n (pp + 4 * n, w1, w2 + n, n);
     203    MPN_INCR_U (w1 + n, n + 1, cy);
     204  
     205    /* W0 = W0 + W1H */
     206    if (LIKELY (w0n > n))
     207      cy6 = w1[2 * n] + mpn_add_n (w0, w0, w1 + n, n);
     208    else
     209      cy6 = mpn_add_n (w0, w0, w1 + n, w0n);
     210  
     211    /*
     212      summation scheme for the next operation:
     213       |...____5|n_____4|n_____3|n_____2|n______|n______|pp
     214       |...w0___|_w1_w2_|_H w3__|_L w3__|_H w5__|_L w5__|
     215  		     ...-w0___|-w1_w2 |
     216    */
     217    /* if(LIKELY(w0n>n)) the two operands below DO overlap! */
     218    cy = mpn_sub_n (pp + 2 * n, pp + 2 * n, pp + 4 * n, n + w0n);
     219  
     220    /* embankment is a "dirty trick" to avoid carry/borrow propagation
     221       beyond allocated memory */
     222    embankment = w0[w0n - 1] - 1;
     223    w0[w0n - 1] = 1;
     224    if (LIKELY (w0n > n)) {
     225      if (cy4 > cy6)
     226        MPN_INCR_U (pp + 4 * n, w0n + n, cy4 - cy6);
     227      else
     228        MPN_DECR_U (pp + 4 * n, w0n + n, cy6 - cy4);
     229      MPN_DECR_U (pp + 3 * n + w0n, 2 * n, cy);
     230      MPN_INCR_U (w0 + n, w0n - n, cy6);
     231    } else {
     232      MPN_INCR_U (pp + 4 * n, w0n + n, cy4);
     233      MPN_DECR_U (pp + 3 * n + w0n, 2 * n, cy + cy6);
     234    }
     235    w0[w0n - 1] += embankment;
     236  
     237  #undef w5
     238  #undef w3
     239  #undef w0
     240  
     241  }