1  /* Mulders' short product, square and division.
       2  
       3  Copyright 2005-2023 Free Software Foundation, Inc.
       4  Contributed by the AriC and Caramba projects, INRIA.
       5  
       6  This file is part of the GNU MPFR Library.
       7  
       8  The GNU MPFR Library is free software; you can redistribute it and/or modify
       9  it under the terms of the GNU Lesser General Public License as published by
      10  the Free Software Foundation; either version 3 of the License, or (at your
      11  option) any later version.
      12  
      13  The GNU MPFR Library is distributed in the hope that it will be useful, but
      14  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
      15  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
      16  License for more details.
      17  
      18  You should have received a copy of the GNU Lesser General Public License
      19  along with the GNU MPFR Library; see the file COPYING.LESSER.  If not, see
      20  https://www.gnu.org/licenses/ or write to the Free Software Foundation, Inc.,
      21  51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */
      22  
      23  /* References:
      24     [1] Short Division of Long Integers, David Harvey and Paul Zimmermann,
      25         Proceedings of the 20th Symposium on Computer Arithmetic (ARITH-20),
      26         July 25-27, 2011, pages 7-14.
      27  */
      28  
      29  #define MPFR_NEED_LONGLONG_H
      30  #include "mpfr-impl.h"
      31  
      32  /* Don't use MPFR_MULHIGH_SIZE since it is handled by tuneup */
      33  #ifdef MPFR_MULHIGH_TAB_SIZE
      34  static short mulhigh_ktab[MPFR_MULHIGH_TAB_SIZE];
      35  #else
      36  static short mulhigh_ktab[] = {MPFR_MULHIGH_TAB};
      37  #define MPFR_MULHIGH_TAB_SIZE (numberof_const (mulhigh_ktab))
      38  #endif
      39  
      40  /* Put in  rp[n..2n-1] an approximation of the n high limbs
      41     of {up, n} * {vp, n}. The error is less than n ulps of rp[n] (and the
      42     approximation is always less or equal to the truncated full product).
      43     Assume 2n limbs are allocated at rp.
      44  
      45     Implements Algorithm ShortMulNaive from [1].
      46  */
      47  static void
      48  mpfr_mulhigh_n_basecase (mpfr_limb_ptr rp, mpfr_limb_srcptr up,
      49                           mpfr_limb_srcptr vp, mp_size_t n)
      50  {
      51    mp_size_t i;
      52  
      53    rp += n - 1;
      54    umul_ppmm (rp[1], rp[0], up[n-1], vp[0]); /* we neglect up[0..n-2]*vp[0],
      55                                                 which is less than B^n */
      56    for (i = 1 ; i < n ; i++)
      57      /* here, we neglect up[0..n-i-2] * vp[i], which is less than B^n too */
      58      rp[i + 1] = mpn_addmul_1 (rp, up + (n - i - 1), i + 1, vp[i]);
      59    /* in total, we neglect less than n*B^n, i.e., n ulps of rp[n]. */
      60  }
      61  
      62  /* Put in  rp[n..2n-1] an approximation of the n high limbs
      63     of {np, n} * {mp, n}. The error is less than n ulps of rp[n] (and the
      64     approximation is always less or equal to the truncated full product).
      65  
      66     Implements Algorithm ShortMul from [1].
      67  */
      68  void
      69  mpfr_mulhigh_n (mpfr_limb_ptr rp, mpfr_limb_srcptr np, mpfr_limb_srcptr mp,
      70                  mp_size_t n)
      71  {
      72    mp_size_t k;
      73  
      74    MPFR_STAT_STATIC_ASSERT (MPFR_MULHIGH_TAB_SIZE >= 8); /* so that 3*(n/4) > n/2 */
      75    k = MPFR_LIKELY (n < MPFR_MULHIGH_TAB_SIZE) ? mulhigh_ktab[n] : 3*(n/4);
      76    /* Algorithm ShortMul from [1] requires k >= (n+3)/2, which translates
      77       into k >= (n+4)/2 in the C language. */
      78    MPFR_ASSERTD (k == -1 || k == 0 || (k >= (n+4)/2 && k < n));
      79    if (k < 0)
      80      mpn_mul_basecase (rp, np, n, mp, n); /* result is exact, no error */
      81    else if (k == 0)
      82      mpfr_mulhigh_n_basecase (rp, np, mp, n); /* basecase error < n ulps */
      83    else if (n > MUL_FFT_THRESHOLD)
      84      mpn_mul_n (rp, np, mp, n); /* result is exact, no error */
      85    else
      86      {
      87        mp_size_t l = n - k;
      88        mp_limb_t cy;
      89  
      90        mpn_mul_n (rp + 2 * l, np + l, mp + l, k); /* fills rp[2l..2n-1] */
      91        mpfr_mulhigh_n (rp, np + k, mp, l);        /* fills rp[l-1..2l-1] */
      92        cy = mpn_add_n (rp + n - 1, rp + n - 1, rp + l - 1, l + 1);
      93        mpfr_mulhigh_n (rp, np, mp + k, l);        /* fills rp[l-1..2l-1] */
      94        cy += mpn_add_n (rp + n - 1, rp + n - 1, rp + l - 1, l + 1);
      95        mpn_add_1 (rp + n + l, rp + n + l, k, cy); /* propagate carry */
      96      }
      97  }
      98  
      99  #ifdef MPFR_SQRHIGH_TAB_SIZE
     100  static short sqrhigh_ktab[MPFR_SQRHIGH_TAB_SIZE];
     101  #else
     102  static short sqrhigh_ktab[] = {MPFR_SQRHIGH_TAB};
     103  #define MPFR_SQRHIGH_TAB_SIZE (numberof_const (sqrhigh_ktab))
     104  #endif
     105  
     106  /* Put in  rp[n..2n-1] an approximation of the n high limbs
     107     of {np, n}^2. The error is less than n ulps of rp[n]. */
     108  void
     109  mpfr_sqrhigh_n (mpfr_limb_ptr rp, mpfr_limb_srcptr np, mp_size_t n)
     110  {
     111    mp_size_t k;
     112  
     113    MPFR_STAT_STATIC_ASSERT (MPFR_SQRHIGH_TAB_SIZE > 2); /* ensures k < n */
     114    k = MPFR_LIKELY (n < MPFR_SQRHIGH_TAB_SIZE) ? sqrhigh_ktab[n]
     115      : (n+4)/2; /* ensures that k >= (n+3)/2 */
     116    MPFR_ASSERTD (k == -1 || k == 0 || (k >= (n+4)/2 && k < n));
     117    if (k < 0)
     118      /* we can't use mpn_sqr_basecase here, since it requires
     119         n <= SQR_KARATSUBA_THRESHOLD, where SQR_KARATSUBA_THRESHOLD
     120         is not exported by GMP */
     121      mpn_sqr (rp, np, n);
     122    else if (k == 0)
     123      mpfr_mulhigh_n_basecase (rp, np, np, n);
     124    else
     125      {
     126        mp_size_t l = n - k;
     127        mp_limb_t cy;
     128  
     129        mpn_sqr (rp + 2 * l, np + l, k);            /* fills rp[2l..2n-1] */
     130        mpfr_mulhigh_n (rp, np, np + k, l);         /* fills rp[l-1..2l-1] */
     131        /* {rp+n-1,l+1} += 2 * {rp+l-1,l+1} */
     132        cy = mpn_lshift (rp + l - 1, rp + l - 1, l + 1, 1);
     133        cy += mpn_add_n (rp + n - 1, rp + n - 1, rp + l - 1, l + 1);
     134        mpn_add_1 (rp + n + l, rp + n + l, k, cy); /* propagate carry */
     135      }
     136  }
     137  
     138  #ifdef MPFR_DIVHIGH_TAB_SIZE
     139  static short divhigh_ktab[MPFR_DIVHIGH_TAB_SIZE];
     140  #else
     141  static short divhigh_ktab[] = {MPFR_DIVHIGH_TAB};
     142  #define MPFR_DIVHIGH_TAB_SIZE (numberof_const (divhigh_ktab))
     143  #endif
     144  
     145  /* Put in Q={qp, n} an approximation of N={np, 2*n} divided by D={dp, n},
     146     with the most significant limb of the quotient as return value (0 or 1).
     147     Assumes the most significant bit of D is set. Clobbers N.
     148  
     149     The approximate quotient Q satisfies - 2(n-1) < N/D - Q <= 4.
     150  
     151     Assumes n >= 2.
     152  */
     153  static mp_limb_t
     154  mpfr_divhigh_n_basecase (mpfr_limb_ptr qp, mpfr_limb_ptr np,
     155                           mpfr_limb_srcptr dp, mp_size_t n)
     156  {
     157    mp_limb_t qh, d1, d0, q2, q1, q0;
     158    mpfr_pi1_t dinv2;
     159  
     160    MPFR_ASSERTD(n >= 2);
     161  
     162    np += n;
     163  
     164    if ((qh = (mpn_cmp (np, dp, n) >= 0)))
     165      mpn_sub_n (np, np, dp, n);
     166  
     167    /* now {np, n} is less than D={dp, n}, which implies np[n-1] <= dp[n-1] */
     168  
     169    d1 = dp[n - 1];
     170  
     171    /* we assumed n >= 2 */
     172    d0 = dp[n - 2];
     173    invert_pi1 (dinv2, d1, d0);
     174    /* dinv2.inv32 = floor ((B^3 - 1) / (d0 + d1 B)) - B */
     175    while (n > 1)
     176      {
     177        /* Invariant: it remains to reduce n limbs from N (in addition to the
     178           initial low n limbs).
     179           Since n >= 2 here, necessarily we had n >= 2 initially, which means
     180           that in addition to the limb np[n-1] to reduce, we have at least 2
     181           extra limbs, thus accessing np[n-3] is valid. */
     182  
     183        /* Warning: we can have np[n-1]>d1 or (np[n-1]=d1 and np[n-2]>=d0) here,
     184           since we truncate the divisor at each step, but since {np,n} < D
     185           originally, the largest possible partial quotient is B-1. */
     186        if (MPFR_UNLIKELY(np[n-1] > d1 || (np[n-1] == d1 && np[n-2] >= d0)))
     187          q2 = MPFR_LIMB_MAX;
     188        else
     189          udiv_qr_3by2 (q2, q1, q0, np[n - 1], np[n - 2], np[n - 3],
     190                        d1, d0, dinv2.inv32);
     191        /* since q2 = floor((np[n-1]*B^2+np[n-2]*B+np[n-3])/(d1*B+d0)),
     192           we have q2 <= (np[n-1]*B^2+np[n-2]*B+np[n-3])/(d1*B+d0),
     193           thus np[n-1]*B^2+np[n-2]*B+np[n-3] >= q2*(d1*B+d0)
     194           and {np-1, n} >= q2*D - q2*B^(n-2) >= q2*D - B^(n-1)
     195           thus {np-1, n} - (q2-1)*D >= D - B^(n-1) >= 0
     196           which proves that at most one correction is needed */
     197        q0 = mpn_submul_1 (np - 1, dp, n, q2);
     198        if (MPFR_UNLIKELY(q0 > np[n - 1]))
     199          {
     200            mpn_add_n (np - 1, np - 1, dp, n);
     201            q2 --;
     202          }
     203        qp[--n] = q2;
     204        dp ++;
     205      }
     206  
     207    /* we have B+dinv2 = floor((B^3-1)/(d1*B+d0)) < B^2/d1
     208       q1 = floor(np[0]*(B+dinv2)/B) <= floor(np[0]*B/d1)
     209          <= floor((np[0]*B+np[1])/d1)
     210       thus q1 is not larger than the true quotient.
     211       q1 > np[0]*(B+dinv2)/B - 1 > np[0]*(B^3-1)/(d1*B+d0)/B - 2
     212       For d1*B+d0 <> B^2/2, we have B+dinv2 = floor(B^3/(d1*B+d0))
     213       thus q1 > np[0]*B^2/(d1*B+d0) - 2, i.e.,
     214       (d1*B+d0)*q1 > np[0]*B^2 - 2*(d1*B+d0)
     215       d1*B*q1 > np[0]*B^2 - 2*d1*B - 2*d0 - d0*q1 >= np[0]*B^2 - 2*d1*B - B^2
     216       thus q1 > np[0]*B/d1 - 2 - B/d1 > np[0]*B/d1 - 4.
     217  
     218       For d1*B+d0 = B^2/2, dinv2 = B-1 thus q1 > np[0]*(2B-1)/B - 1 >
     219       np[0]*B/d1 - 2.
     220  
     221       In all cases, if q = floor((np[0]*B+np[1])/d1), we have:
     222       q - 4 <= q1 <= q
     223    */
     224    umul_ppmm (q1, q0, np[0], dinv2.inv32);
     225    qp[0] = np[0] + q1;
     226  
     227    return qh;
     228  }
     229  
     230  /* Put in {qp, n} an approximation of N={np, 2*n} divided by D={dp, n},
     231     with the most significant limb of the quotient as return value (0 or 1).
     232     Assumes the most significant bit of D is set. Clobbers N.
     233  
     234     This implements the ShortDiv algorithm from reference [1].
     235  
     236     Assumes n >= 2 (which should be fulfilled also in the recursive calls).
     237  */
     238  mp_limb_t
     239  mpfr_divhigh_n (mpfr_limb_ptr qp, mpfr_limb_ptr np, mpfr_limb_ptr dp,
     240                  mp_size_t n)
     241  {
     242    mp_size_t k, l;
     243    mp_limb_t qh, cy;
     244    mpfr_limb_ptr tp;
     245    MPFR_TMP_DECL(marker);
     246  
     247    MPFR_STAT_STATIC_ASSERT (MPFR_DIVHIGH_TAB_SIZE >= 15); /* so that 2*(n/3) >= (n+4)/2 */
     248    MPFR_ASSERTD(n >= 2);
     249    k = MPFR_LIKELY (n < MPFR_DIVHIGH_TAB_SIZE) ? divhigh_ktab[n] : 2*(n/3);
     250  
     251    if (k == 0)
     252      {
     253  #if defined(WANT_GMP_INTERNALS) && defined(HAVE___GMPN_SBPI1_DIVAPPR_Q)
     254        mpfr_pi1_t dinv2;
     255        invert_pi1 (dinv2, dp[n - 1], dp[n - 2]);
     256        if (n > 2) /* sbpi1_divappr_q wants n > 2 */
     257          return __gmpn_sbpi1_divappr_q (qp, np, n + n, dp, n, dinv2.inv32);
     258        else
     259          return mpfr_divhigh_n_basecase (qp, np, dp, n);
     260  #else /* use our own code for base-case short division */
     261        return mpfr_divhigh_n_basecase (qp, np, dp, n);
     262  #endif
     263      }
     264  
     265    /* Check the bounds from [1]. In addition, we forbid k=n-1, which would
     266       give l=1 in the recursive call. It follows n >= 5. */
     267    MPFR_ASSERTD ((n+4)/2 <= k && k < n-1);
     268  
     269    MPFR_TMP_MARK (marker);
     270    l = n - k;
     271    /* first divide the most significant 2k limbs from N by the most significant
     272       k limbs of D */
     273    qh = mpn_divrem (qp + l, 0, np + 2 * l, 2 * k, dp + l, k); /* exact */
     274  
     275    /* it remains {np,2l+k} = {np,n+l} as remainder */
     276  
     277    /* now we have to subtract high(Q1)*D0 where Q1=qh*B^k+{qp+l,k} and
     278       D0={dp,l} */
     279    tp = MPFR_TMP_LIMBS_ALLOC (2 * l);
     280    mpfr_mulhigh_n (tp, qp + k, dp, l);
     281    /* we are only interested in the upper l limbs from {tp,2l} */
     282    cy = mpn_sub_n (np + n, np + n, tp + l, l);
     283    if (qh)
     284      cy += mpn_sub_n (np + n, np + n, dp, l);
     285    while (cy > 0) /* Q1 was too large: subtract 1 to Q1 and add D to np+l */
     286      {
     287        qh -= mpn_sub_1 (qp + l, qp + l, k, MPFR_LIMB_ONE);
     288        cy -= mpn_add_n (np + l, np + l, dp, n);
     289      }
     290  
     291    /* now it remains {np,n+l} to divide by D */
     292    cy = mpfr_divhigh_n (qp, np + k, dp + k, l);
     293    qh += mpn_add_1 (qp + l, qp + l, k, cy);
     294    MPFR_TMP_FREE(marker);
     295  
     296    return qh;
     297  }