(root)/
gmp-6.3.0/
mpn/
generic/
toom_interpolate_16pts.c
       1  /* Interpolation for the algorithm Toom-Cook 8.5-way.
       2  
       3     Contributed to the GNU project by Marco Bodrato.
       4  
       5     THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
       6     SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
       7     GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
       8  
       9  Copyright 2009, 2010, 2012, 2015, 2020 Free Software Foundation, Inc.
      10  
      11  This file is part of the GNU MP Library.
      12  
      13  The GNU MP Library is free software; you can redistribute it and/or modify
      14  it under the terms of either:
      15  
      16    * the GNU Lesser General Public License as published by the Free
      17      Software Foundation; either version 3 of the License, or (at your
      18      option) any later version.
      19  
      20  or
      21  
      22    * the GNU General Public License as published by the Free Software
      23      Foundation; either version 2 of the License, or (at your option) any
      24      later version.
      25  
      26  or both in parallel, as here.
      27  
      28  The GNU MP Library is distributed in the hope that it will be useful, but
      29  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
      30  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
      31  for more details.
      32  
      33  You should have received copies of the GNU General Public License and the
      34  GNU Lesser General Public License along with the GNU MP Library.  If not,
      35  see https://www.gnu.org/licenses/.  */
      36  
      37  
      38  #include "gmp-impl.h"
      39  
      40  
/* Sanity checks on the numb width: the interpolation below shifts by up
   to 28 bits inside a limb, and divides exactly by constants that need
   28 bits (188513325 and 182712915); refuse configurations where that
   cannot work.  */
#if GMP_NUMB_BITS < 29
#error Not implemented: Both sublsh_n(,,,28) should be corrected; r2 and r5 need one more LIMB.
#endif

#if GMP_NUMB_BITS < 28
#error Not implemented: divexact_by188513325 and _by182712915 will not work.
#endif
      48  
      49  
/* Tuning switches: each AORSMUL_FASTER_* flag, when nonzero, makes the
   interpolation below use a single addmul_1/submul_1 by a small constant
   instead of the equivalent chain of additions/subtractions with shifted
   operands (aorslsh).  They may be predefined (e.g. to 0) from outside.
   FIXME: tuneup should decide the best variant */
#ifndef AORSMUL_FASTER_AORS_AORSLSH
#define AORSMUL_FASTER_AORS_AORSLSH 1
#endif
#ifndef AORSMUL_FASTER_AORS_2AORSLSH
#define AORSMUL_FASTER_AORS_2AORSLSH 1
#endif
#ifndef AORSMUL_FASTER_2AORSLSH
#define AORSMUL_FASTER_2AORSLSH 1
#endif
#ifndef AORSMUL_FASTER_3AORSLSH
#define AORSMUL_FASTER_3AORSLSH 1
#endif
      63  
      64  
      65  #if HAVE_NATIVE_mpn_sublsh_n
      66  #define DO_mpn_sublsh_n(dst,src,n,s,ws) mpn_sublsh_n(dst,dst,src,n,s)
      67  #else
      68  static mp_limb_t
      69  DO_mpn_sublsh_n(mp_ptr dst, mp_srcptr src, mp_size_t n, unsigned int s, mp_ptr ws)
      70  {
      71  #if USE_MUL_1 && 0
      72    return mpn_submul_1(dst,src,n,CNST_LIMB(1) <<(s));
      73  #else
      74    mp_limb_t __cy;
      75    __cy = mpn_lshift(ws,src,n,s);
      76    return    __cy + mpn_sub_n(dst,dst,ws,n);
      77  #endif
      78  }
      79  #endif
      80  
      81  #if HAVE_NATIVE_mpn_addlsh_n
      82  #define DO_mpn_addlsh_n(dst,src,n,s,ws) mpn_addlsh_n(dst,dst,src,n,s)
      83  #else
      84  #if !defined (AORSMUL_FASTER_2AORSLSH) && !defined (AORSMUL_FASTER_AORS_2AORSLSH)
      85  static mp_limb_t
      86  DO_mpn_addlsh_n(mp_ptr dst, mp_srcptr src, mp_size_t n, unsigned int s, mp_ptr ws)
      87  {
      88  #if USE_MUL_1 && 0
      89    return mpn_addmul_1(dst,src,n,CNST_LIMB(1) <<(s));
      90  #else
      91    mp_limb_t __cy;
      92    __cy = mpn_lshift(ws,src,n,s);
      93    return    __cy + mpn_add_n(dst,dst,ws,n);
      94  #endif
      95  }
      96  #endif
      97  #endif
      98  
#if HAVE_NATIVE_mpn_subrsh
#define DO_mpn_subrsh(dst,nd,src,ns,s,ws) mpn_subrsh(dst,nd,src,ns,s)
#else
/* DO_mpn_subrsh (dst, nd, src, ns, s, ws): {dst,nd} -= {src,ns} >> s.
   The fractional right shift is assembled from two pieces: the low limb's
   surviving bits are subtracted directly, and the remaining limbs are
   applied as a left shift by (GMP_NUMB_BITS - s) one position lower.
   FIXME: This is not a correct definition, it assumes no carry */
#define DO_mpn_subrsh(dst,nd,src,ns,s,ws)				\
do {									\
  mp_limb_t __cy;							\
  MPN_DECR_U (dst, nd, src[0] >> s);					\
  __cy = DO_mpn_sublsh_n (dst, src + 1, ns - 1, GMP_NUMB_BITS - s, ws);	\
  MPN_DECR_U (dst + ns - 1, nd - ns + 1, __cy);				\
} while (0)
#endif
     111  
     112  
/* The interpolation uses shifts by 42 bits (see the `42 - CORRECTION_BITS'
   calls below).  When a numb cannot hold such a shift (GMP_NUMB_BITS < 43)
   it is split into a one-limb pointer offset (BIT_CORRECTION) plus a
   shift count reduced by CORRECTION_BITS (= GMP_NUMB_BITS).  */
#if GMP_NUMB_BITS < 43
#define BIT_CORRECTION 1
#define CORRECTION_BITS GMP_NUMB_BITS
#else
#define BIT_CORRECTION 0
#define CORRECTION_BITS 0
#endif
     120  
/* BINVERT_x: binary (2-adic) inverse of x, i.e. x * BINVERT_x == 1 modulo
   2^GMP_NUMB_BITS; passed as the precomputed-inverse argument of the
   mpn_pi1_bdiv_q_1 exact-division calls below.  BINVERT_9 and BINVERT_255
   are computed from GMP_NUMB_MAX; the others are hard-coded per limb
   size.  */
#define BINVERT_9 \
  ((((GMP_NUMB_MAX / 9) << (6 - GMP_NUMB_BITS % 6)) * 8 & GMP_NUMB_MAX) | 0x39)

#define BINVERT_255 \
  (GMP_NUMB_MAX - ((GMP_NUMB_MAX / 255) << (8 - GMP_NUMB_BITS % 8)))

  /* FIXME: find some more general expressions for inverses */
#if GMP_LIMB_BITS == 32
#define BINVERT_2835  (GMP_NUMB_MASK &		CNST_LIMB(0x53E3771B))
#define BINVERT_42525 (GMP_NUMB_MASK &		CNST_LIMB(0x9F314C35))
#define BINVERT_182712915 (GMP_NUMB_MASK &	CNST_LIMB(0x550659DB))
#define BINVERT_188513325 (GMP_NUMB_MASK &	CNST_LIMB(0xFBC333A5))
#define BINVERT_255x182712915L (GMP_NUMB_MASK &	CNST_LIMB(0x6FC4CB25))
#define BINVERT_255x188513325L (GMP_NUMB_MASK &	CNST_LIMB(0x6864275B))
#if GMP_NAIL_BITS == 0
#define BINVERT_255x182712915H CNST_LIMB(0x1B649A07)
#define BINVERT_255x188513325H CNST_LIMB(0x06DB993A)
#else /* GMP_NAIL_BITS != 0 */
#define BINVERT_255x182712915H \
  (GMP_NUMB_MASK & CNST_LIMB((0x1B649A07<<GMP_NAIL_BITS) | (0x6FC4CB25>>GMP_NUMB_BITS)))
#define BINVERT_255x188513325H \
  (GMP_NUMB_MASK & CNST_LIMB((0x06DB993A<<GMP_NAIL_BITS) | (0x6864275B>>GMP_NUMB_BITS)))
#endif
#else
#if GMP_LIMB_BITS == 64
#define BINVERT_2835  (GMP_NUMB_MASK &	CNST_LIMB(0x938CC70553E3771B))
#define BINVERT_42525 (GMP_NUMB_MASK &	CNST_LIMB(0xE7B40D449F314C35))
#define BINVERT_255x182712915  (GMP_NUMB_MASK &	CNST_LIMB(0x1B649A076FC4CB25))
#define BINVERT_255x188513325  (GMP_NUMB_MASK &	CNST_LIMB(0x06DB993A6864275B))
#endif
#endif
     152  
/* Exact-division helpers.  Each divides {src,size} exactly by a fixed
   constant (an odd value times a power of two), choosing the fastest
   available primitive: mpn_bdiv_dbm1 when the divisor divides
   2^GMP_NUMB_BITS - 1, mpn_pi1_bdiv_q_1 with a precomputed BINVERT_*
   inverse, or the generic mpn_divexact_1.  Any of them may be overridden
   by an external definition.  */
#ifndef mpn_divexact_by255
#if GMP_NUMB_BITS % 8 == 0
#define mpn_divexact_by255(dst,src,size) \
  (255 & 1 * mpn_bdiv_dbm1 (dst, src, size, __GMP_CAST (mp_limb_t, GMP_NUMB_MASK / 255)))
#else
#if HAVE_NATIVE_mpn_pi1_bdiv_q_1
#define mpn_divexact_by255(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(255),BINVERT_255,0)
#else
#define mpn_divexact_by255(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(255))
#endif
#endif
#endif

#ifndef mpn_divexact_by255x4
#if HAVE_NATIVE_mpn_pi1_bdiv_q_1
#define mpn_divexact_by255x4(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(255),BINVERT_255,2)
#else
#define mpn_divexact_by255x4(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(255)<<2)
#endif
#endif

#ifndef mpn_divexact_by9x16
#if HAVE_NATIVE_mpn_pi1_bdiv_q_1
#define mpn_divexact_by9x16(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(9),BINVERT_9,4)
#else
#define mpn_divexact_by9x16(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(9)<<4)
#endif
#endif

#ifndef mpn_divexact_by42525x16
#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_42525)
#define mpn_divexact_by42525x16(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(42525),BINVERT_42525,4)
#else
#define mpn_divexact_by42525x16(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(42525)<<4)
#endif
#endif

#ifndef mpn_divexact_by2835x64
#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_2835)
#define mpn_divexact_by2835x64(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(2835),BINVERT_2835,6)
#else
#define mpn_divexact_by2835x64(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(2835)<<6)
#endif
#endif

/* The divisors 255*182712915 and 255*188513325 need 36 bits; when a numb
   cannot hold them (GMP_NUMB_BITS < 36) the division is performed in two
   exact steps, dividing by each factor in turn.  */
#ifndef  mpn_divexact_by255x182712915
#if GMP_NUMB_BITS < 36
#if HAVE_NATIVE_mpn_bdiv_q_2_pi2 && defined(BINVERT_255x182712915H)
/* FIXME: use mpn_bdiv_q_2_pi2 */
#endif
#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_182712915)
#define mpn_divexact_by255x182712915(dst,src,size)				\
  do {										\
    mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(182712915),BINVERT_182712915,0);	\
    mpn_divexact_by255(dst,dst,size);						\
  } while(0)
#else
#define mpn_divexact_by255x182712915(dst,src,size)	\
  do {							\
    mpn_divexact_1(dst,src,size,CNST_LIMB(182712915));	\
    mpn_divexact_by255(dst,dst,size);			\
  } while(0)
#endif
#else /* GMP_NUMB_BITS > 35 */
#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_255x182712915)
#define mpn_divexact_by255x182712915(dst,src,size) \
  mpn_pi1_bdiv_q_1(dst,src,size,255*CNST_LIMB(182712915),BINVERT_255x182712915,0)
#else
#define mpn_divexact_by255x182712915(dst,src,size) mpn_divexact_1(dst,src,size,255*CNST_LIMB(182712915))
#endif
#endif /* GMP_NUMB_BITS >?< 36 */
#endif

#ifndef  mpn_divexact_by255x188513325
#if GMP_NUMB_BITS < 36
#if HAVE_NATIVE_mpn_bdiv_q_1_pi2 && defined(BINVERT_255x188513325H)
/* FIXME: use mpn_bdiv_q_1_pi2 */
#endif
#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_188513325)
#define mpn_divexact_by255x188513325(dst,src,size)			\
  do {									\
    mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(188513325),BINVERT_188513325,0);	\
    mpn_divexact_by255(dst,dst,size);					\
  } while(0)
#else
#define mpn_divexact_by255x188513325(dst,src,size)	\
  do {							\
    mpn_divexact_1(dst,src,size,CNST_LIMB(188513325));	\
    mpn_divexact_by255(dst,dst,size);			\
  } while(0)
#endif
#else /* GMP_NUMB_BITS > 35 */
#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_255x188513325)
#define mpn_divexact_by255x188513325(dst,src,size) \
  mpn_pi1_bdiv_q_1(dst,src,size,255*CNST_LIMB(188513325),BINVERT_255x188513325,0)
#else
#define mpn_divexact_by255x188513325(dst,src,size) mpn_divexact_1(dst,src,size,255*CNST_LIMB(188513325))
#endif
#endif /* GMP_NUMB_BITS >?< 36 */
#endif
     253  
     254  /* Interpolation for Toom-8.5 (or Toom-8), using the evaluation
     255     points: infinity(8.5 only), +-8, +-4, +-2, +-1, +-1/4, +-1/2,
     256     +-1/8, 0. More precisely, we want to compute
     257     f(2^(GMP_NUMB_BITS * n)) for a polynomial f of degree 15 (or
     258     14), given the 16 (rsp. 15) values:
     259  
     260       r0 = limit at infinity of f(x) / x^15,
     261       r1 = f(8),f(-8),
     262       r2 = f(4),f(-4),
     263       r3 = f(2),f(-2),
     264       r4 = f(1),f(-1),
     265       r5 = f(1/4),f(-1/4),
     266       r6 = f(1/2),f(-1/2),
     267       r7 = f(1/8),f(-1/8),
     268       r8 = f(0).
     269  
     270     All couples of the form f(n),f(-n) must be already mixed with
     271     toom_couple_handling(f(n),...,f(-n),...)
     272  
     273     The result is stored in {pp, spt + 7*n (or 8*n)}.
     274     At entry, r8 is stored at {pp, 2n},
     275     r6 is stored at {pp + 3n, 3n + 1}.
     276     r4 is stored at {pp + 7n, 3n + 1}.
     277     r2 is stored at {pp +11n, 3n + 1}.
     278     r0 is stored at {pp +15n, spt}.
     279  
     280     The other values are 3n+1 limbs each (with most significant limbs small).
     281  
   Negative intermediate results are stored in two's complement.
     283     Inputs are destroyed.
     284  */
     285  
void
mpn_toom_interpolate_16pts (mp_ptr pp, mp_ptr r1, mp_ptr r3, mp_ptr r5, mp_ptr r7,
			mp_size_t n, mp_size_t spt, int half, mp_ptr wsi)
{
  mp_limb_t cy;
  mp_size_t n3;		/* 3 * n */
  mp_size_t n3p1;	/* 3 * n + 1: full length of each interpolation value */
  n3 = 3 * n;
  n3p1 = n3 + 1;

  /* The even-indexed values already sit inside pp at the offsets described
     in the comment above the function; alias them for readability.  */
#define   r6    (pp + n3)			/* 3n+1 */
#define   r4    (pp + 7 * n)			/* 3n+1 */
#define   r2    (pp +11 * n)			/* 3n+1 */
#define   r0    (pp +15 * n)			/* s+t <= 2*n */

  ASSERT( spt <= 2 * n );
  /******************************* interpolation *****************************/
  if( half != 0) {
    /* Toom-8.5 only: subtract the contribution of the leading coefficient
       r0 (= {r0, spt}) from the other samples.  DO_mpn_sublsh_n(d,s,k,b,w)
       computes {d,k} -= {s,k} << b; DO_mpn_subrsh subtracts a right-shifted
       operand.  */
    cy = mpn_sub_n (r4, r4, r0, spt);
    MPN_DECR_U (r4 + spt, n3p1 - spt, cy);

    cy = DO_mpn_sublsh_n (r3, r0, spt, 14, wsi);
    MPN_DECR_U (r3 + spt, n3p1 - spt, cy);
    DO_mpn_subrsh(r6, n3p1, r0, spt, 2, wsi);

    cy = DO_mpn_sublsh_n (r2, r0, spt, 28, wsi);
    MPN_DECR_U (r2 + spt, n3p1 - spt, cy);
    DO_mpn_subrsh(r5, n3p1, r0, spt, 4, wsi);

    /* The 42-bit shift may exceed the numb width; BIT_CORRECTION splits it
       into a one-limb offset plus a reduced shift (see BIT_CORRECTION).  */
    cy = DO_mpn_sublsh_n (r1 + BIT_CORRECTION, r0, spt, 42 - CORRECTION_BITS, wsi);
#if BIT_CORRECTION
    cy = mpn_sub_1 (r1 + spt + BIT_CORRECTION, r1 + spt + BIT_CORRECTION,
		    n3p1 - spt - BIT_CORRECTION, cy);
    ASSERT (BIT_CORRECTION > 0 || cy == 0);
    /* Save the limb just above r7 and plant a nonzero sentinel there, so
       the borrow propagation of the following DO_mpn_subrsh (whose length
       is n3p1 + BIT_CORRECTION) stops within it.
       FIXME: assumes r7[n3p1] is writable (it is if r5 follows). */
    cy = r7[n3p1];
    r7[n3p1] = 0x80;
#else
    MPN_DECR_U (r1 + spt + BIT_CORRECTION, n3p1 - spt - BIT_CORRECTION, cy);
#endif
    DO_mpn_subrsh(r7, n3p1 + BIT_CORRECTION, r0, spt, 6, wsi);
#if BIT_CORRECTION
    /* The sentinel must not have been consumed; restore the saved limb.
       FIXME: assumes r7[n3p1] is writable. */
    ASSERT ( BIT_CORRECTION > 0 || r7[n3p1] == 0x80 );
    r7[n3p1] = cy;
#endif
  };

  /* Subtract the contribution of r8 (= {pp, 2n}, the value at 0) from the
     mixed samples, then split each (f(v)+f(-v), f(v)-f(-v)) pair into its
     sum and difference components.  */
  r5[n3] -= DO_mpn_sublsh_n (r5 + n, pp, 2 * n, 28, wsi);
  DO_mpn_subrsh(r2 + n, 2 * n + 1, pp, 2 * n, 4, wsi);

#if HAVE_NATIVE_mpn_add_n_sub_n
  mpn_add_n_sub_n (r2, r5, r5, r2, n3p1);
#else
  mpn_sub_n (wsi, r5, r2, n3p1); /* can be negative */
  ASSERT_NOCARRY(mpn_add_n (r2, r2, r5, n3p1));
  MP_PTR_SWAP(r5, wsi);
#endif

  r6[n3] -= DO_mpn_sublsh_n (r6 + n, pp, 2 * n, 14, wsi);
  DO_mpn_subrsh(r3 + n, 2 * n + 1, pp, 2 * n, 2, wsi);

#if HAVE_NATIVE_mpn_add_n_sub_n
  mpn_add_n_sub_n (r3, r6, r6, r3, n3p1);
#else
  ASSERT_NOCARRY(mpn_add_n (wsi, r3, r6, n3p1));
  mpn_sub_n (r6, r6, r3, n3p1); /* can be negative */
  MP_PTR_SWAP(r3, wsi);
#endif

  cy = DO_mpn_sublsh_n (r7 + n + BIT_CORRECTION, pp, 2 * n, 42 - CORRECTION_BITS, wsi);
#if BIT_CORRECTION
  /* Narrow limbs: perform the 6-bit right shift of r8 by hand — subtract
     the low limb's surviving bits, then the remaining limbs left-shifted
     by the complementary amount one position lower.  */
  MPN_DECR_U (r1 + n, 2 * n + 1, pp[0] >> 6);
  cy = DO_mpn_sublsh_n (r1 + n, pp + 1, 2 * n - 1, GMP_NUMB_BITS - 6, wsi);
  cy = mpn_sub_1(r1 + 3 * n - 1, r1 + 3 * n - 1, 2, cy);
  ASSERT ( BIT_CORRECTION > 0 || cy != 0 );
#else
  r7[n3] -= cy;
  DO_mpn_subrsh(r1 + n, 2 * n + 1, pp, 2 * n, 6, wsi);
#endif

#if HAVE_NATIVE_mpn_add_n_sub_n
  mpn_add_n_sub_n (r1, r7, r7, r1, n3p1);
#else
  mpn_sub_n (wsi, r7, r1, n3p1); /* can be negative */
  mpn_add_n (r1, r1, r7, n3p1);  /* if BIT_CORRECTION != 0, can give a carry. */
  MP_PTR_SWAP(r7, wsi);
#endif

  r4[n3] -= mpn_sub_n (r4+n, r4+n, pp, 2 * n);

  /* Linear combinations and exact divisions recovering the coefficients;
     the small multipliers are presumably entries of the inverted
     evaluation matrix (TODO confirm against Bodrato's derivation).
     Intermediate values may be negative (two's complement).  */
#if AORSMUL_FASTER_2AORSLSH
  mpn_submul_1 (r5, r6, n3p1, 1028); /* can be negative */
#else
  DO_mpn_sublsh_n (r5, r6, n3p1, 2, wsi); /* can be negative */
  DO_mpn_sublsh_n (r5, r6, n3p1,10, wsi); /* can be negative */
#endif

  mpn_submul_1 (r7, r5, n3p1, 1300); /* can be negative */
#if AORSMUL_FASTER_3AORSLSH
  mpn_submul_1 (r7, r6, n3p1, 1052688); /* can be negative */
#else
  DO_mpn_sublsh_n (r7, r6, n3p1, 4, wsi); /* can be negative */
  DO_mpn_sublsh_n (r7, r6, n3p1,12, wsi); /* can be negative */
  DO_mpn_sublsh_n (r7, r6, n3p1,20, wsi); /* can be negative */
#endif
  mpn_divexact_by255x188513325(r7, r7, n3p1);

  mpn_submul_1 (r5, r7, n3p1, 12567555); /* can be negative */
  /* A division by 2835x64 follows. Warning: the operand can be negative! */
  mpn_divexact_by2835x64(r5, r5, n3p1);
  /* Manually sign-extend the quotient of the possibly negative operand.  */
  if ((r5[n3] & (GMP_NUMB_MAX << (GMP_NUMB_BITS-7))) != 0)
    r5[n3] |= (GMP_NUMB_MAX << (GMP_NUMB_BITS-6));

#if AORSMUL_FASTER_AORS_AORSLSH
  mpn_submul_1 (r6, r7, n3p1, 4095); /* can be negative */
#else
  mpn_add_n (r6, r6, r7, n3p1); /* can give a carry */
  DO_mpn_sublsh_n (r6, r7, n3p1, 12, wsi); /* can be negative */
#endif
#if AORSMUL_FASTER_2AORSLSH
  mpn_addmul_1 (r6, r5, n3p1, 240); /* can be negative */
#else
  DO_mpn_addlsh_n (r6, r5, n3p1, 8, wsi); /* can give a carry */
  DO_mpn_sublsh_n (r6, r5, n3p1, 4, wsi); /* can be negative */
#endif
  /* A division by 255x4 follows. Warning: the operand can be negative! */
  mpn_divexact_by255x4(r6, r6, n3p1);
  /* Manually sign-extend the quotient of the possibly negative operand.  */
  if ((r6[n3] & (GMP_NUMB_MAX << (GMP_NUMB_BITS-3))) != 0)
    r6[n3] |= (GMP_NUMB_MAX << (GMP_NUMB_BITS-2));

  ASSERT_NOCARRY(DO_mpn_sublsh_n (r3, r4, n3p1, 7, wsi));

  ASSERT_NOCARRY(DO_mpn_sublsh_n (r2, r4, n3p1, 13, wsi));
  ASSERT_NOCARRY(mpn_submul_1 (r2, r3, n3p1, 400));

  /* If GMP_NUMB_BITS < 42, the next operations on r1 can give a carry!  */
  DO_mpn_sublsh_n (r1, r4, n3p1, 19, wsi);
  mpn_submul_1 (r1, r2, n3p1, 1428);
  mpn_submul_1 (r1, r3, n3p1, 112896);
  mpn_divexact_by255x182712915(r1, r1, n3p1);

  ASSERT_NOCARRY(mpn_submul_1 (r2, r1, n3p1, 15181425));
  mpn_divexact_by42525x16(r2, r2, n3p1);

#if AORSMUL_FASTER_AORS_2AORSLSH
  ASSERT_NOCARRY(mpn_submul_1 (r3, r1, n3p1, 3969));
#else
  ASSERT_NOCARRY(mpn_sub_n (r3, r3, r1, n3p1));
  ASSERT_NOCARRY(DO_mpn_addlsh_n (r3, r1, n3p1, 7, wsi));
  ASSERT_NOCARRY(DO_mpn_sublsh_n (r3, r1, n3p1, 12, wsi));
#endif
  ASSERT_NOCARRY(mpn_submul_1 (r3, r2, n3p1, 900));
  mpn_divexact_by9x16(r3, r3, n3p1);

  ASSERT_NOCARRY(mpn_sub_n (r4, r4, r1, n3p1));
  ASSERT_NOCARRY(mpn_sub_n (r4, r4, r3, n3p1));
  ASSERT_NOCARRY(mpn_sub_n (r4, r4, r2, n3p1));

  /* Final halving steps: rsh1add/rsh1sub compute (a +- b) >> 1; the
     fallback adds/subtracts, then shifts right by one.  */
#ifdef HAVE_NATIVE_mpn_rsh1add_n
  mpn_rsh1add_n (r6, r2, r6, n3p1);
  r6 [n3p1 - 1] &= GMP_NUMB_MASK >> 1;
#else
  mpn_add_n (r6, r2, r6, n3p1);
  ASSERT_NOCARRY(mpn_rshift(r6, r6, n3p1, 1));
#endif
  ASSERT_NOCARRY(mpn_sub_n (r2, r2, r6, n3p1));

#ifdef HAVE_NATIVE_mpn_rsh1sub_n
  mpn_rsh1sub_n (r5, r3, r5, n3p1);
  r5 [n3p1 - 1] &= GMP_NUMB_MASK >> 1;
#else
  mpn_sub_n (r5, r3, r5, n3p1);
  ASSERT_NOCARRY(mpn_rshift(r5, r5, n3p1, 1));
#endif
  ASSERT_NOCARRY(mpn_sub_n (r3, r3, r5, n3p1));

#ifdef HAVE_NATIVE_mpn_rsh1add_n
  mpn_rsh1add_n (r7, r1, r7, n3p1);
  r7 [n3p1 - 1] &= GMP_NUMB_MASK >> 1;
#else
  mpn_add_n (r7, r1, r7, n3p1);
  ASSERT_NOCARRY(mpn_rshift(r7, r7, n3p1, 1));
#endif
  ASSERT_NOCARRY(mpn_sub_n (r1, r1, r7, n3p1));

  /* last interpolation steps... */
  /* ... could be mixed with recomposition
	||H-r7|M-r7|L-r7|   ||H-r5|M-r5|L-r5|
  */

  /***************************** recomposition *******************************/
  /*
    pp[] prior to operations:
    |M r0|L r0|___||H r2|M r2|L r2|___||H r4|M r4|L r4|___||H r6|M r6|L r6|____|H_r8|L r8|pp

    summation scheme for remaining operations:
    |__16|n_15|n_14|n_13|n_12|n_11|n_10|n__9|n__8|n__7|n__6|n__5|n__4|n__3|n__2|n___|n___|pp
    |M r0|L r0|___||H r2|M r2|L r2|___||H r4|M r4|L r4|___||H r6|M r6|L r6|____|H_r8|L r8|pp
	||H r1|M r1|L r1|   ||H r3|M r3|L r3|   ||H_r5|M_r5|L_r5|   ||H r7|M r7|L r7|
  */

  /* Add r7 at offset n: low n limbs into occupied pp space, middle n into
     the free gap, high part plus carry propagated upwards.  */
  cy = mpn_add_n (pp + n, pp + n, r7, n);
  cy = mpn_add_1 (pp + 2 * n, r7 + n, n, cy);
#if HAVE_NATIVE_mpn_add_nc
  cy = r7[n3] + mpn_add_nc(pp + n3, pp + n3, r7 + 2 * n, n, cy);
#else
  MPN_INCR_U (r7 + 2 * n, n + 1, cy);
  cy = r7[n3] + mpn_add_n (pp + n3, pp + n3, r7 + 2 * n, n);
#endif
  MPN_INCR_U (pp + 4 * n, 2 * n + 1, cy);

  /* Add r5 at offset 5n, same pattern.  */
  pp[2 * n3]+= mpn_add_n (pp + 5 * n, pp + 5 * n, r5, n);
  cy = mpn_add_1 (pp + 2 * n3, r5 + n, n, pp[2 * n3]);
#if HAVE_NATIVE_mpn_add_nc
  cy = r5[n3] + mpn_add_nc(pp + 7 * n, pp + 7 * n, r5 + 2 * n, n, cy);
#else
  MPN_INCR_U (r5 + 2 * n, n + 1, cy);
  cy = r5[n3] + mpn_add_n (pp + 7 * n, pp + 7 * n, r5 + 2 * n, n);
#endif
  MPN_INCR_U (pp + 8 * n, 2 * n + 1, cy);

  /* Add r3 at offset 9n.  */
  pp[10 * n]+= mpn_add_n (pp + 9 * n, pp + 9 * n, r3, n);
  cy = mpn_add_1 (pp + 10 * n, r3 + n, n, pp[10 * n]);
#if HAVE_NATIVE_mpn_add_nc
  cy = r3[n3] + mpn_add_nc(pp +11 * n, pp +11 * n, r3 + 2 * n, n, cy);
#else
  MPN_INCR_U (r3 + 2 * n, n + 1, cy);
  cy = r3[n3] + mpn_add_n (pp +11 * n, pp +11 * n, r3 + 2 * n, n);
#endif
  MPN_INCR_U (pp +12 * n, 2 * n + 1, cy);

  /* Add r1 at offset 13n; in the Toom-8.5 case the top overlaps r0 for
     spt limbs only, and no carry may leave the product.  */
  pp[14 * n]+=mpn_add_n (pp +13 * n, pp +13 * n, r1, n);
  if ( half ) {
    cy = mpn_add_1 (pp + 14 * n, r1 + n, n, pp[14 * n]);
#if HAVE_NATIVE_mpn_add_nc
    if(LIKELY(spt > n)) {
      cy = r1[n3] + mpn_add_nc(pp + 15 * n, pp + 15 * n, r1 + 2 * n, n, cy);
      MPN_INCR_U (pp + 16 * n, spt - n, cy);
    } else {
      ASSERT_NOCARRY(mpn_add_nc(pp + 15 * n, pp + 15 * n, r1 + 2 * n, spt, cy));
    }
#else
    MPN_INCR_U (r1 + 2 * n, n + 1, cy);
    if(LIKELY(spt > n)) {
      cy = r1[n3] + mpn_add_n (pp + 15 * n, pp + 15 * n, r1 + 2 * n, n);
      MPN_INCR_U (pp + 16 * n, spt - n, cy);
    } else {
      ASSERT_NOCARRY(mpn_add_n (pp + 15 * n, pp + 15 * n, r1 + 2 * n, spt));
    }
#endif
  } else {
    ASSERT_NOCARRY(mpn_add_1 (pp + 14 * n, r1 + n, spt, pp[14 * n]));
  }

#undef   r0
#undef   r2
#undef   r4
#undef   r6
}