1  /* Interpolation for the algorithm Toom-Cook 6.5-way.
       2  
       3     Contributed to the GNU project by Marco Bodrato.
       4  
       5     THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
       6     SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
       7     GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
       8  
       9  Copyright 2009, 2010, 2012, 2015, 2020 Free Software Foundation, Inc.
      10  
      11  This file is part of the GNU MP Library.
      12  
      13  The GNU MP Library is free software; you can redistribute it and/or modify
      14  it under the terms of either:
      15  
      16    * the GNU Lesser General Public License as published by the Free
      17      Software Foundation; either version 3 of the License, or (at your
      18      option) any later version.
      19  
      20  or
      21  
      22    * the GNU General Public License as published by the Free Software
      23      Foundation; either version 2 of the License, or (at your option) any
      24      later version.
      25  
      26  or both in parallel, as here.
      27  
      28  The GNU MP Library is distributed in the hope that it will be useful, but
      29  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
      30  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
      31  for more details.
      32  
      33  You should have received copies of the GNU General Public License and the
      34  GNU Lesser General Public License along with the GNU MP Library.  If not,
      35  see https://www.gnu.org/licenses/.  */
      36  
      37  
      38  #include "gmp-impl.h"
      39  
      40  
      41  #if GMP_NUMB_BITS < 21
      42  #error Not implemented: Both sublsh_n(,,,20) should be corrected.
      43  #endif
      44  
      45  #if GMP_NUMB_BITS < 16
      46  #error Not implemented: divexact_by42525 needs splitting.
      47  #endif
      48  
      49  #if GMP_NUMB_BITS < 12
      50  #error Not implemented: Hard to adapt...
      51  #endif
      52  
      53  
      54  /* FIXME: tuneup should decide the best variant */
      55  #ifndef AORSMUL_FASTER_AORS_AORSLSH
      56  #define AORSMUL_FASTER_AORS_AORSLSH 1
      57  #endif
      58  #ifndef AORSMUL_FASTER_AORS_2AORSLSH
      59  #define AORSMUL_FASTER_AORS_2AORSLSH 1
      60  #endif
      61  #ifndef AORSMUL_FASTER_2AORSLSH
      62  #define AORSMUL_FASTER_2AORSLSH 1
      63  #endif
      64  #ifndef AORSMUL_FASTER_3AORSLSH
      65  #define AORSMUL_FASTER_3AORSLSH 1
      66  #endif
      67  
      68  
      69  #if HAVE_NATIVE_mpn_sublsh_n
      70  #define DO_mpn_sublsh_n(dst,src,n,s,ws) mpn_sublsh_n(dst,dst,src,n,s)
      71  #else
      72  static mp_limb_t
      73  DO_mpn_sublsh_n(mp_ptr dst, mp_srcptr src, mp_size_t n, unsigned int s, mp_ptr ws)
      74  {
      75  #if USE_MUL_1 && 0
      76    return mpn_submul_1(dst,src,n,CNST_LIMB(1) <<(s));
      77  #else
      78    mp_limb_t __cy;
      79    __cy = mpn_lshift(ws,src,n,s);
      80    return    __cy + mpn_sub_n(dst,dst,ws,n);
      81  #endif
      82  }
      83  #endif
      84  
      85  #if HAVE_NATIVE_mpn_addlsh_n
      86  #define DO_mpn_addlsh_n(dst,src,n,s,ws) mpn_addlsh_n(dst,dst,src,n,s)
      87  #else
      88  #if !defined (AORSMUL_FASTER_2AORSLSH) && !defined (AORSMUL_FASTER_AORS_2AORSLSH)
      89  static mp_limb_t
      90  DO_mpn_addlsh_n(mp_ptr dst, mp_srcptr src, mp_size_t n, unsigned int s, mp_ptr ws)
      91  {
      92  #if USE_MUL_1 && 0
      93    return mpn_addmul_1(dst,src,n,CNST_LIMB(1) <<(s));
      94  #else
      95    mp_limb_t __cy;
      96    __cy = mpn_lshift(ws,src,n,s);
      97    return    __cy + mpn_add_n(dst,dst,ws,n);
      98  #endif
      99  }
     100  #endif
     101  #endif
     102  
     103  #if HAVE_NATIVE_mpn_subrsh
     104  #define DO_mpn_subrsh(dst,nd,src,ns,s,ws) mpn_subrsh(dst,nd,src,ns,s)
     105  #else
     106  /* FIXME: This is not a correct definition, it assumes no carry */
     107  #define DO_mpn_subrsh(dst,nd,src,ns,s,ws)				\
     108  do {									\
     109    mp_limb_t __cy;							\
     110    MPN_DECR_U (dst, nd, src[0] >> s);					\
     111    __cy = DO_mpn_sublsh_n (dst, src + 1, ns - 1, GMP_NUMB_BITS - s, ws);	\
     112    MPN_DECR_U (dst + ns - 1, nd - ns + 1, __cy);				\
     113  } while (0)
     114  #endif
     115  
     116  
     117  #define BINVERT_9 \
     118    ((((GMP_NUMB_MAX / 9) << (6 - GMP_NUMB_BITS % 6)) * 8 & GMP_NUMB_MAX) | 0x39)
     119  
     120  #define BINVERT_255 \
     121    (GMP_NUMB_MAX - ((GMP_NUMB_MAX / 255) << (8 - GMP_NUMB_BITS % 8)))
     122  
     123    /* FIXME: find some more general expressions for 2835^-1, 42525^-1 */
     124  #if GMP_LIMB_BITS == 32
     125  #define BINVERT_2835  (GMP_NUMB_MASK &		CNST_LIMB(0x53E3771B))
     126  #define BINVERT_42525 (GMP_NUMB_MASK &		CNST_LIMB(0x9F314C35))
     127  #else
     128  #if GMP_LIMB_BITS == 64
     129  #define BINVERT_2835  (GMP_NUMB_MASK &	CNST_LIMB(0x938CC70553E3771B))
     130  #define BINVERT_42525 (GMP_NUMB_MASK &	CNST_LIMB(0xE7B40D449F314C35))
     131  #endif
     132  #endif
     133  
     134  #ifndef mpn_divexact_by255
     135  #if GMP_NUMB_BITS % 8 == 0
     136  #define mpn_divexact_by255(dst,src,size) \
     137    (255 & 1 * mpn_bdiv_dbm1 (dst, src, size, __GMP_CAST (mp_limb_t, GMP_NUMB_MASK / 255)))
     138  #else
     139  #if HAVE_NATIVE_mpn_pi1_bdiv_q_1
     140  #define mpn_divexact_by255(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(255),BINVERT_255,0)
     141  #else
     142  #define mpn_divexact_by255(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(255))
     143  #endif
     144  #endif
     145  #endif
     146  
     147  #ifndef mpn_divexact_by9x4
     148  #if HAVE_NATIVE_mpn_pi1_bdiv_q_1
     149  #define mpn_divexact_by9x4(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(9),BINVERT_9,2)
     150  #else
     151  #define mpn_divexact_by9x4(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(9)<<2)
     152  #endif
     153  #endif
     154  
     155  #ifndef mpn_divexact_by42525
     156  #if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_42525)
     157  #define mpn_divexact_by42525(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(42525),BINVERT_42525,0)
     158  #else
     159  #define mpn_divexact_by42525(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(42525))
     160  #endif
     161  #endif
     162  
     163  #ifndef mpn_divexact_by2835x4
     164  #if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_2835)
     165  #define mpn_divexact_by2835x4(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(2835),BINVERT_2835,2)
     166  #else
     167  #define mpn_divexact_by2835x4(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(2835)<<2)
     168  #endif
     169  #endif
     170  
     171  /* Interpolation for Toom-6.5 (or Toom-6), using the evaluation
     172     points: infinity(6.5 only), +-4, +-2, +-1, +-1/4, +-1/2, 0. More precisely,
     173     we want to compute f(2^(GMP_NUMB_BITS * n)) for a polynomial f of
     174     degree 11 (or 10), given the 12 (rsp. 11) values:
     175  
     176       r0 = limit at infinity of f(x) / x^11,
     177       r1 = f(4),f(-4),
     178       r2 = f(2),f(-2),
     179       r3 = f(1),f(-1),
     180       r4 = f(1/4),f(-1/4),
     181       r5 = f(1/2),f(-1/2),
     182       r6 = f(0).
     183  
     184     All couples of the form f(n),f(-n) must be already mixed with
     185     toom_couple_handling(f(n),...,f(-n),...)
     186  
     187     The result is stored in {pp, spt + 7*n (or 6*n)}.
     188     At entry, r6 is stored at {pp, 2n},
     189     r4 is stored at {pp + 3n, 3n + 1}.
     190     r2 is stored at {pp + 7n, 3n + 1}.
     191     r0 is stored at {pp +11n, spt}.
     192  
     193     The other values are 3n+1 limbs each (with most significant limbs small).
     194  
     195     Negative intermediate results are stored two-complemented.
     196     Inputs are destroyed.
     197  */
     198  
     199  void
     200  mpn_toom_interpolate_12pts (mp_ptr pp, mp_ptr r1, mp_ptr r3, mp_ptr r5,
     201  			mp_size_t n, mp_size_t spt, int half, mp_ptr wsi)
     202  {
     203    mp_limb_t cy;
     204    mp_size_t n3;
     205    mp_size_t n3p1;
     206    n3 = 3 * n;
     207    n3p1 = n3 + 1;
     208  
     209  #define   r4    (pp + n3)			/* 3n+1 */
     210  #define   r2    (pp + 7 * n)			/* 3n+1 */
     211  #define   r0    (pp +11 * n)			/* s+t <= 2*n */
     212  
     213    /******************************* interpolation *****************************/
     214    if (half != 0) {
     215      cy = mpn_sub_n (r3, r3, r0, spt);
     216      MPN_DECR_U (r3 + spt, n3p1 - spt, cy);
     217  
     218      cy = DO_mpn_sublsh_n (r2, r0, spt, 10, wsi);
     219      MPN_DECR_U (r2 + spt, n3p1 - spt, cy);
     220      DO_mpn_subrsh(r5, n3p1, r0, spt, 2, wsi);
     221  
     222      cy = DO_mpn_sublsh_n (r1, r0, spt, 20, wsi);
     223      MPN_DECR_U (r1 + spt, n3p1 - spt, cy);
     224      DO_mpn_subrsh(r4, n3p1, r0, spt, 4, wsi);
     225    };
     226  
     227    r4[n3] -= DO_mpn_sublsh_n (r4 + n, pp, 2 * n, 20, wsi);
     228    DO_mpn_subrsh(r1 + n, 2 * n + 1, pp, 2 * n, 4, wsi);
     229  
     230  #if HAVE_NATIVE_mpn_add_n_sub_n
     231    mpn_add_n_sub_n (r1, r4, r4, r1, n3p1);
     232  #else
     233    ASSERT_NOCARRY(mpn_add_n (wsi, r1, r4, n3p1));
     234    mpn_sub_n (r4, r4, r1, n3p1); /* can be negative */
     235    MP_PTR_SWAP(r1, wsi);
     236  #endif
     237  
     238    r5[n3] -= DO_mpn_sublsh_n (r5 + n, pp, 2 * n, 10, wsi);
     239    DO_mpn_subrsh(r2 + n, 2 * n + 1, pp, 2 * n, 2, wsi);
     240  
     241  #if HAVE_NATIVE_mpn_add_n_sub_n
     242    mpn_add_n_sub_n (r2, r5, r5, r2, n3p1);
     243  #else
     244    mpn_sub_n (wsi, r5, r2, n3p1); /* can be negative */
     245    ASSERT_NOCARRY(mpn_add_n (r2, r2, r5, n3p1));
     246    MP_PTR_SWAP(r5, wsi);
     247  #endif
     248  
     249    r3[n3] -= mpn_sub_n (r3+n, r3+n, pp, 2 * n);
     250  
     251  #if AORSMUL_FASTER_AORS_AORSLSH
     252    mpn_submul_1 (r4, r5, n3p1, 257); /* can be negative */
     253  #else
     254    mpn_sub_n (r4, r4, r5, n3p1); /* can be negative */
     255    DO_mpn_sublsh_n (r4, r5, n3p1, 8, wsi); /* can be negative */
     256  #endif
     257    /* A division by 2835x4 follows. Warning: the operand can be negative! */
     258    mpn_divexact_by2835x4(r4, r4, n3p1);
     259    if ((r4[n3] & (GMP_NUMB_MAX << (GMP_NUMB_BITS-3))) != 0)
     260      r4[n3] |= (GMP_NUMB_MAX << (GMP_NUMB_BITS-2));
     261  
     262  #if AORSMUL_FASTER_2AORSLSH
     263    mpn_addmul_1 (r5, r4, n3p1, 60); /* can be negative */
     264  #else
     265    DO_mpn_sublsh_n (r5, r4, n3p1, 2, wsi); /* can be negative */
     266    DO_mpn_addlsh_n (r5, r4, n3p1, 6, wsi); /* can give a carry */
     267  #endif
     268    mpn_divexact_by255(r5, r5, n3p1);
     269  
     270    ASSERT_NOCARRY(DO_mpn_sublsh_n (r2, r3, n3p1, 5, wsi));
     271  
     272  #if AORSMUL_FASTER_3AORSLSH
     273    ASSERT_NOCARRY(mpn_submul_1 (r1, r2, n3p1, 100));
     274  #else
     275    ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r2, n3p1, 6, wsi));
     276    ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r2, n3p1, 5, wsi));
     277    ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r2, n3p1, 2, wsi));
     278  #endif
     279    ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r3, n3p1, 9, wsi));
     280    mpn_divexact_by42525(r1, r1, n3p1);
     281  
     282  #if AORSMUL_FASTER_AORS_2AORSLSH
     283    ASSERT_NOCARRY(mpn_submul_1 (r2, r1, n3p1, 225));
     284  #else
     285    ASSERT_NOCARRY(mpn_sub_n (r2, r2, r1, n3p1));
     286    ASSERT_NOCARRY(DO_mpn_addlsh_n (r2, r1, n3p1, 5, wsi));
     287    ASSERT_NOCARRY(DO_mpn_sublsh_n (r2, r1, n3p1, 8, wsi));
     288  #endif
     289    mpn_divexact_by9x4(r2, r2, n3p1);
     290  
     291    ASSERT_NOCARRY(mpn_sub_n (r3, r3, r2, n3p1));
     292  
     293  #ifdef HAVE_NATIVE_mpn_rsh1sub_n
     294    mpn_rsh1sub_n (r4, r2, r4, n3p1);
     295    r4 [n3p1 - 1] &= GMP_NUMB_MASK >> 1;
     296  #else
     297    mpn_sub_n (r4, r2, r4, n3p1);
     298    ASSERT_NOCARRY(mpn_rshift(r4, r4, n3p1, 1));
     299  #endif
     300    ASSERT_NOCARRY(mpn_sub_n (r2, r2, r4, n3p1));
     301  
     302  #ifdef HAVE_NATIVE_mpn_rsh1add_n
     303    mpn_rsh1add_n (r5, r5, r1, n3p1);
     304    r5 [n3p1 - 1] &= GMP_NUMB_MASK >> 1;
     305  #else
     306    mpn_add_n (r5, r5, r1, n3p1);
     307    ASSERT_NOCARRY(mpn_rshift(r5, r5, n3p1, 1));
     308  #endif
     309  
     310    /* last interpolation steps... */
     311    ASSERT_NOCARRY(mpn_sub_n (r3, r3, r1, n3p1));
     312    ASSERT_NOCARRY(mpn_sub_n (r1, r1, r5, n3p1));
     313    /* ... could be mixed with recomposition
     314  	||H-r5|M-r5|L-r5|   ||H-r1|M-r1|L-r1|
     315    */
     316  
     317    /***************************** recomposition *******************************/
     318    /*
     319      pp[] prior to operations:
     320      |M r0|L r0|___||H r2|M r2|L r2|___||H r4|M r4|L r4|____|H_r6|L r6|pp
     321  
     322      summation scheme for remaining operations:
     323      |__12|n_11|n_10|n__9|n__8|n__7|n__6|n__5|n__4|n__3|n__2|n___|n___|pp
     324      |M r0|L r0|___||H r2|M r2|L r2|___||H r4|M r4|L r4|____|H_r6|L r6|pp
     325  	||H r1|M r1|L r1|   ||H r3|M r3|L r3|   ||H_r5|M_r5|L_r5|
     326    */
     327  
     328    cy = mpn_add_n (pp + n, pp + n, r5, n);
     329    cy = mpn_add_1 (pp + 2 * n, r5 + n, n, cy);
     330  #if HAVE_NATIVE_mpn_add_nc
     331    cy = r5[n3] + mpn_add_nc(pp + n3, pp + n3, r5 + 2 * n, n, cy);
     332  #else
     333    MPN_INCR_U (r5 + 2 * n, n + 1, cy);
     334    cy = r5[n3] + mpn_add_n (pp + n3, pp + n3, r5 + 2 * n, n);
     335  #endif
     336    MPN_INCR_U (pp + n3 + n, 2 * n + 1, cy);
     337  
     338    pp[2 * n3]+= mpn_add_n (pp + 5 * n, pp + 5 * n, r3, n);
     339    cy = mpn_add_1 (pp + 2 * n3, r3 + n, n, pp[2 * n3]);
     340  #if HAVE_NATIVE_mpn_add_nc
     341    cy = r3[n3] + mpn_add_nc(pp + 7 * n, pp + 7 * n, r3 + 2 * n, n, cy);
     342  #else
     343    MPN_INCR_U (r3 + 2 * n, n + 1, cy);
     344    cy = r3[n3] + mpn_add_n (pp + 7 * n, pp + 7 * n, r3 + 2 * n, n);
     345  #endif
     346    MPN_INCR_U (pp + 8 * n, 2 * n + 1, cy);
     347  
     348    pp[10*n]+=mpn_add_n (pp + 9 * n, pp + 9 * n, r1, n);
     349    if (half) {
     350      cy = mpn_add_1 (pp + 10 * n, r1 + n, n, pp[10 * n]);
     351  #if HAVE_NATIVE_mpn_add_nc
     352      if (LIKELY (spt > n)) {
     353        cy = r1[n3] + mpn_add_nc(pp + 11 * n, pp + 11 * n, r1 + 2 * n, n, cy);
     354        MPN_INCR_U (pp + 4 * n3, spt - n, cy);
     355      } else {
     356        ASSERT_NOCARRY(mpn_add_nc(pp + 11 * n, pp + 11 * n, r1 + 2 * n, spt, cy));
     357      }
     358  #else
     359      MPN_INCR_U (r1 + 2 * n, n + 1, cy);
     360      if (LIKELY (spt > n)) {
     361        cy = r1[n3] + mpn_add_n (pp + 11 * n, pp + 11 * n, r1 + 2 * n, n);
     362        MPN_INCR_U (pp + 4 * n3, spt - n, cy);
     363      } else {
     364        ASSERT_NOCARRY(mpn_add_n (pp + 11 * n, pp + 11 * n, r1 + 2 * n, spt));
     365      }
     366  #endif
     367    } else {
     368      ASSERT_NOCARRY(mpn_add_1 (pp + 10 * n, r1 + n, spt, pp[10 * n]));
     369    }
     370  
     371  #undef   r0
     372  #undef   r2
     373  #undef   r4
     374  }