1  /* mpn_toom42_mulmid -- toom42 middle product
       2  
       3     Contributed by David Harvey.
       4  
       5     THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
       6     SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
       7     GUARANTEED THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
       8  
       9  Copyright 2011 Free Software Foundation, Inc.
      10  
      11  This file is part of the GNU MP Library.
      12  
      13  The GNU MP Library is free software; you can redistribute it and/or modify
      14  it under the terms of either:
      15  
      16    * the GNU Lesser General Public License as published by the Free
      17      Software Foundation; either version 3 of the License, or (at your
      18      option) any later version.
      19  
      20  or
      21  
      22    * the GNU General Public License as published by the Free Software
      23      Foundation; either version 2 of the License, or (at your option) any
      24      later version.
      25  
      26  or both in parallel, as here.
      27  
      28  The GNU MP Library is distributed in the hope that it will be useful, but
      29  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
      30  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
      31  for more details.
      32  
      33  You should have received copies of the GNU General Public License and the
      34  GNU Lesser General Public License along with the GNU MP Library.  If not,
      35  see https://www.gnu.org/licenses/.  */
      36  
      37  
      38  #include "gmp-impl.h"
      39  
      40  
      41  
      42  /*
      43    Middle product of {ap,2n-1} and {bp,n}, output written to {rp,n+2}.
      44  
      45    Neither ap nor bp may overlap rp.
      46  
      47    Must have n >= 4.
      48  
      49    Amount of scratch space required is given by mpn_toom42_mulmid_itch().
      50  
      51    FIXME: this code assumes that n is small compared to GMP_NUMB_MAX. The exact
      52    requirements should be clarified.
      53  */
      54  void
      55  mpn_toom42_mulmid (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t n,
      56                     mp_ptr scratch)
      57  {
      58    mp_limb_t cy, e[12], zh, zl;
      59    mp_size_t m;
      60    int neg;
      61  
      62    ASSERT (n >= 4);
      63    ASSERT (! MPN_OVERLAP_P (rp, n + 2, ap, 2*n - 1));
      64    ASSERT (! MPN_OVERLAP_P (rp, n + 2, bp, n));
      65  
      66    ap += n & 1;   /* handle odd row and diagonal later */
      67    m = n / 2;
      68  
      69    /* (e0h:e0l) etc are correction terms, in 2's complement */
      70  #define e0l (e[0])
      71  #define e0h (e[1])
      72  #define e1l (e[2])
      73  #define e1h (e[3])
      74  #define e2l (e[4])
      75  #define e2h (e[5])
      76  #define e3l (e[6])
      77  #define e3h (e[7])
      78  #define e4l (e[8])
      79  #define e4h (e[9])
      80  #define e5l (e[10])
      81  #define e5h (e[11])
      82  
      83  #define s (scratch + 2)
      84  #define t (rp + m + 2)
      85  #define p0 rp
      86  #define p1 scratch
      87  #define p2 (rp + m)
      88  #define next_scratch (scratch + 3*m + 1)
      89  
      90    /*
      91              rp                            scratch
      92    |---------|-----------|    |---------|---------|----------|
      93    0         m         2m+2   0         m         2m        3m+1
      94              <----p2---->       <-------------s------------->
      95    <----p0----><---t---->     <----p1---->
      96    */
      97  
      98    /* compute {s,3m-1} = {a,3m-1} + {a+m,3m-1} and error terms e0, e1, e2, e3 */
      99    cy = mpn_add_err1_n (s, ap, ap + m, &e0l, bp + m, m - 1, 0);
     100    cy = mpn_add_err2_n (s + m - 1, ap + m - 1, ap + 2*m - 1, &e1l,
     101  		       bp + m, bp, m, cy);
     102    mpn_add_err1_n (s + 2*m - 1, ap + 2*m - 1, ap + 3*m - 1, &e3l, bp, m, cy);
     103  
     104    /* compute t = (-1)^neg * ({b,m} - {b+m,m}) and error terms e4, e5 */
     105    if (mpn_cmp (bp + m, bp, m) < 0)
     106      {
     107        ASSERT_NOCARRY (mpn_sub_err2_n (t, bp, bp + m, &e4l,
     108  				      ap + m - 1, ap + 2*m - 1, m, 0));
     109        neg = 1;
     110      }
     111    else
     112      {
     113        ASSERT_NOCARRY (mpn_sub_err2_n (t, bp + m, bp, &e4l,
     114  				      ap + m - 1, ap + 2*m - 1, m, 0));
     115        neg = 0;
     116      }
     117  
     118    /* recursive middle products. The picture is:
     119  
     120        b[2m-1]   A   A   A   B   B   B   -   -   -   -   -
     121        ...       -   A   A   A   B   B   B   -   -   -   -
     122        b[m]      -   -   A   A   A   B   B   B   -   -   -
     123        b[m-1]    -   -   -   C   C   C   D   D   D   -   -
     124        ...       -   -   -   -   C   C   C   D   D   D   -
     125        b[0]      -   -   -   -   -   C   C   C   D   D   D
     126                 a[0]   ...  a[m]  ...  a[2m]    ...    a[4m-2]
     127    */
     128  
     129    if (m < MULMID_TOOM42_THRESHOLD)
     130      {
     131        /* A + B */
     132        mpn_mulmid_basecase (p0, s, 2*m - 1, bp + m, m);
     133        /* accumulate high limbs of p0 into e1 */
     134        ADDC_LIMB (cy, e1l, e1l, p0[m]);
     135        e1h += p0[m + 1] + cy;
     136        /* (-1)^neg * (B - C)   (overwrites first m limbs of s) */
     137        mpn_mulmid_basecase (p1, ap + m, 2*m - 1, t, m);
     138        /* C + D   (overwrites t) */
     139        mpn_mulmid_basecase (p2, s + m, 2*m - 1, bp, m);
     140      }
     141    else
     142      {
     143        /* as above, but use toom42 instead */
     144        mpn_toom42_mulmid (p0, s, bp + m, m, next_scratch);
     145        ADDC_LIMB (cy, e1l, e1l, p0[m]);
     146        e1h += p0[m + 1] + cy;
     147        mpn_toom42_mulmid (p1, ap + m, t, m, next_scratch);
     148        mpn_toom42_mulmid (p2, s + m, bp, m, next_scratch);
     149      }
     150  
     151    /* apply error terms */
     152  
     153    /* -e0 at rp[0] */
     154    SUBC_LIMB (cy, rp[0], rp[0], e0l);
     155    SUBC_LIMB (cy, rp[1], rp[1], e0h + cy);
     156    if (UNLIKELY (cy))
     157      {
     158        cy = (m > 2) ? mpn_sub_1 (rp + 2, rp + 2, m - 2, 1) : 1;
     159        SUBC_LIMB (cy, e1l, e1l, cy);
     160        e1h -= cy;
     161      }
     162  
     163    /* z = e1 - e2 + high(p0) */
     164    SUBC_LIMB (cy, zl, e1l, e2l);
     165    zh = e1h - e2h - cy;
     166  
     167    /* z at rp[m] */
     168    ADDC_LIMB (cy, rp[m], rp[m], zl);
     169    zh = (zh + cy) & GMP_NUMB_MASK;
     170    ADDC_LIMB (cy, rp[m + 1], rp[m + 1], zh);
     171    cy -= (zh >> (GMP_NUMB_BITS - 1));
     172    if (UNLIKELY (cy))
     173      {
     174        if (cy == 1)
     175  	mpn_add_1 (rp + m + 2, rp + m + 2, m, 1);
     176        else /* cy == -1 */
     177  	mpn_sub_1 (rp + m + 2, rp + m + 2, m, 1);
     178      }
     179  
     180    /* e3 at rp[2*m] */
     181    ADDC_LIMB (cy, rp[2*m], rp[2*m], e3l);
     182    rp[2*m + 1] = (rp[2*m + 1] + e3h + cy) & GMP_NUMB_MASK;
     183  
     184    /* e4 at p1[0] */
     185    ADDC_LIMB (cy, p1[0], p1[0], e4l);
     186    ADDC_LIMB (cy, p1[1], p1[1], e4h + cy);
     187    if (UNLIKELY (cy))
     188      mpn_add_1 (p1 + 2, p1 + 2, m, 1);
     189  
     190    /* -e5 at p1[m] */
     191    SUBC_LIMB (cy, p1[m], p1[m], e5l);
     192    p1[m + 1] = (p1[m + 1] - e5h - cy) & GMP_NUMB_MASK;
     193  
     194    /* adjustment if p1 ends up negative */
     195    cy = (p1[m + 1] >> (GMP_NUMB_BITS - 1));
     196  
     197    /* add (-1)^neg * (p1 - B^m * p1) to output */
     198    if (neg)
     199      {
     200        mpn_sub_1 (rp + m + 2, rp + m + 2, m, cy);
     201        mpn_add (rp, rp, 2*m + 2, p1, m + 2);             /* A + C */
     202        mpn_sub_n (rp + m, rp + m, p1, m + 2);            /* B + D */
     203      }
     204    else
     205      {
     206        mpn_add_1 (rp + m + 2, rp + m + 2, m, cy);
     207        mpn_sub (rp, rp, 2*m + 2, p1, m + 2);             /* A + C */
     208        mpn_add_n (rp + m, rp + m, p1, m + 2);            /* B + D */
     209      }
     210  
     211    /* odd row and diagonal */
     212    if (n & 1)
     213      {
     214        /*
     215          Products marked E are already done. We need to do products marked O.
     216  
     217          OOOOO----
     218          -EEEEO---
     219          --EEEEO--
     220          ---EEEEO-
     221          ----EEEEO
     222         */
     223  
     224        /* first row of O's */
     225        cy = mpn_addmul_1 (rp, ap - 1, n, bp[n - 1]);
     226        ADDC_LIMB (rp[n + 1], rp[n], rp[n], cy);
     227  
     228        /* O's on diagonal */
     229        /* FIXME: should probably define an interface "mpn_mulmid_diag_1"
     230           that can handle the sum below. Currently we're relying on
     231           mulmid_basecase being pretty fast for a diagonal sum like this,
     232  	 which is true at least for the K8 asm version, but surely false
     233  	 for the generic version. */
     234        mpn_mulmid_basecase (e, ap + n - 1, n - 1, bp, n - 1);
     235        mpn_add_n (rp + n - 1, rp + n - 1, e, 3);
     236      }
     237  }