gmp-6.3.0/mpn/generic/div_qr_2.c
/* mpn_div_qr_2 -- Divide natural numbers, producing both remainder and
   quotient.  The divisor is two limbs.

   Contributed to the GNU project by Torbjorn Granlund and Niels Möller

   THIS FILE CONTAINS INTERNAL FUNCTIONS WITH MUTABLE INTERFACES.  IT IS ONLY
   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.


Copyright 1993-1996, 1999-2002, 2011, 2017 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */

#include "gmp-impl.h"
#include "longlong.h"

#ifndef DIV_QR_2_PI2_THRESHOLD
/* Disabled unless explicitly tuned. */
#define DIV_QR_2_PI2_THRESHOLD MP_LIMB_T_MAX
#endif

#ifndef SANITY_CHECK
#define SANITY_CHECK 0
#endif

/* Define some longlong.h-style macros, but for wider operations.
   * add_sssaaaa is like longlong.h's add_ssaaaa but propagating carry-out into
     an additional sum operand.
   * add_csaac accepts two addends and a carry in, and generates a sum and a
     carry out.  A little like a "full adder".
*/
#if defined (__GNUC__) && ! defined (__INTEL_COMPILER) && ! defined (NO_ASM)

#if HAVE_HOST_CPU_FAMILY_x86 && W_TYPE_SIZE == 32
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)				\
  __asm__ ("add\t%7, %k2\n\tadc\t%5, %k1\n\tadc\t$0, %k0"		\
	   : "=r" (s2), "=&r" (s1), "=&r" (s0)				\
	   : "0"  ((USItype)(s2)),					\
	     "1"  ((USItype)(a1)), "g" ((USItype)(b1)),			\
	     "%2" ((USItype)(a0)), "g" ((USItype)(b0)))
#endif

#if defined (__amd64__) && W_TYPE_SIZE == 64
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)				\
  __asm__ ("add\t%7, %q2\n\tadc\t%5, %q1\n\tadc\t$0, %q0"		\
	   : "=r" (s2), "=&r" (s1), "=&r" (s0)				\
	   : "0"  ((UDItype)(s2)),					\
	     "1"  ((UDItype)(a1)), "rme" ((UDItype)(b1)),		\
	     "%2" ((UDItype)(a0)), "rme" ((UDItype)(b0)))
#endif

#if defined (__aarch64__) && W_TYPE_SIZE == 64
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)				\
  __asm__ ("adds\t%2, %x6, %7\n\tadcs\t%1, %x4, %x5\n\tadc\t%0, %x3, xzr"\
	   : "=r" (s2), "=&r" (s1), "=&r" (s0)				\
	   : "rZ" (s2), "%rZ"  (a1), "rZ" (b1), "%rZ" (a0), "rI" (b0)	\
	     __CLOBBER_CC)
#endif

#if HAVE_HOST_CPU_FAMILY_powerpc && !defined (_LONG_LONG_LIMB)
/* This works fine for 32-bit and 64-bit limbs, except for 64-bit limbs with a
   processor running in 32-bit mode, since the carry flag then gets the 32-bit
   carry.  */
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)				\
  __asm__ ("add%I7c\t%2,%6,%7\n\tadde\t%1,%4,%5\n\taddze\t%0,%3"	\
	   : "=r" (s2), "=&r" (s1), "=&r" (s0)				\
	   : "r"  (s2), "r"  (a1), "r" (b1), "%r" (a0), "rI" (b0)	\
	     __CLOBBER_CC)
#endif

#endif /* __GNUC__ */

#ifndef add_sssaaaa
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)				\
  do {									\
    UWtype __s0, __s1, __c0, __c1;					\
    __s0 = (a0) + (b0);							\
    __s1 = (a1) + (b1);							\
    __c0 = __s0 < (a0);							\
    __c1 = __s1 < (a1);							\
    (s0) = __s0;							\
    __s1 = __s1 + __c0;							\
    (s1) = __s1;							\
    (s2) += __c1 + (__s1 < __c0);					\
  } while (0)
#endif
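
/* Illustration (editor's sketch, not part of GMP and never compiled): with
   B = 2^GMP_LIMB_BITS and no nail bits, add_sssaaaa computes

     <s2, s1, s0>  <-  <s2, a1, a0> + <b1, b0>,

   i.e. a two-limb addition whose carry-out is accumulated into s2.  The
   helper name below is hypothetical.  */
#if 0
static void
example_add_sssaaaa (void)
{
  mp_limb_t hi, lo1, lo0;

  hi = 0;
  /* <B-1, B-1> + <B-1, B-1> = <1, B-1, B-2>.  */
  add_sssaaaa (hi, lo1, lo0, MP_LIMB_T_MAX, MP_LIMB_T_MAX,
	       MP_LIMB_T_MAX, MP_LIMB_T_MAX);
  ASSERT_ALWAYS (hi == 1);
  ASSERT_ALWAYS (lo1 == MP_LIMB_T_MAX);
  ASSERT_ALWAYS (lo0 == MP_LIMB_T_MAX - 1);
}
#endif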

/* Typically used with r1, r0 same as n3, n2. Other types of overlap
   between inputs and outputs are not supported. */
#define udiv_qr_4by2(q1,q0, r1,r0, n3,n2,n1,n0, d1,d0, di1,di0)		\
  do {									\
    mp_limb_t _q3, _q2a, _q2, _q1, _q2c, _q1c, _q1d, _q0;		\
    mp_limb_t _t1, _t0;							\
    mp_limb_t _mask;							\
									\
    /* [q3,q2,q1,q0] = [n3,n2]*[di1,di0] + [n3,n2,n1,n0] + [0,1,0,0] */	\
    umul_ppmm (_q2,_q1, n2, di1);					\
    umul_ppmm (_q3,_q2a, n3, di1);					\
    ++_q2;	/* _q2 cannot overflow */				\
    add_ssaaaa (_q3,_q2, _q3,_q2, n3,_q2a);				\
    umul_ppmm (_q2c,_q1c, n3, di0);					\
    add_sssaaaa (_q3,_q2,_q1, _q2,_q1, n2,_q1c);			\
    umul_ppmm (_q1d,_q0, n2, di0);					\
    add_sssaaaa (_q2c,_q1,_q0, _q1,_q0, n1,n0); /* _q2c cannot overflow */ \
    add_sssaaaa (_q3,_q2,_q1, _q2,_q1, _q2c,_q1d);			\
									\
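    /* <_q3, _q2> is the candidate quotient; <_t1, _t0> becomes the low	\
       two limbs of <_q3, _q2> * <d1, d0>, for computing the remainder.  */ \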
    umul_ppmm (_t1,_t0, _q2, d0);					\
    _t1 += _q2 * d1 + _q3 * d0;						\
									\
    sub_ddmmss (r1, r0, n1, n0, _t1, _t0);				\
									\
    _mask = -(mp_limb_t) ((r1 >= _q1) & ((r1 > _q1) | (r0 >= _q0)));  /* (r1,r0) >= (q1,q0) */  \
    add_ssaaaa (r1, r0, r1, r0, d1 & _mask, d0 & _mask);		\
    sub_ddmmss (_q3, _q2, _q3, _q2, CNST_LIMB(0), -_mask);		\
									\
    if (UNLIKELY (r1 >= d1))						\
      {									\
	if (r1 > d1 || r0 >= d0)					\
	  {								\
	    sub_ddmmss (r1, r0, r1, r0, d1, d0);			\
	    add_ssaaaa (_q3, _q2, _q3, _q2, CNST_LIMB(0), CNST_LIMB(1));\
	  }								\
      }									\
    (q1) = _q3;								\
    (q0) = _q2;								\
  } while (0)
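
/* Editor's note: as used below, udiv_qr_4by2 divides <n3, n2, n1, n0> by the
   normalized divisor <d1, d0> (high bit of d1 set), where <di1, di0> is the
   two-limb inverse produced by invert_4by2 and <n3, n2> < <d1, d0>, so the
   quotient fits in two limbs.  It delivers the quotient <q1, q0> and the
   remainder <r1, r0> < <d1, d0>.  */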

static void
invert_4by2 (mp_ptr di, mp_limb_t d1, mp_limb_t d0)
{
  mp_limb_t v1, v0, p1, t1, t0, p0, mask;
  invert_limb (v1, d1);
  p1 = d1 * v1;
  /* <1, v1> * d1 = <B-1, p1> */
  p1 += d0;
  if (p1 < d0)
    {
      v1--;
      mask = -(mp_limb_t) (p1 >= d1);
      p1 -= d1;
      v1 += mask;
      p1 -= mask & d1;
    }
  /* <1, v1> * d1 + d0 = <B-1, p1> */
  umul_ppmm (t1, p0, d0, v1);
  p1 += t1;
  if (p1 < t1)
    {
      if (UNLIKELY (p1 >= d1))
	{
	  if (p1 > d1 || p0 >= d0)
	    {
	      sub_ddmmss (p1, p0, p1, p0, d1, d0);
	      v1--;
	    }
	}
      sub_ddmmss (p1, p0, p1, p0, d1, d0);
      v1--;
    }
  /* Now v1 is the 3/2 inverse, <1, v1> * <d1, d0> = <B-1, p1, p0>,
   * with <p1, p0> + <d1, d0> >= B^2.
   *
   * The 4/2 inverse is (B^4 - 1) / <d1, d0> = <1, v1, v0>. The
   * partial remainder after <1, v1> is
   *
   * B^4 - 1 - B <1, v1> <d1, d0> = <B-1, B-1, B-1, B-1> - <B-1, p1, p0, 0>
   *                              = <~p1, ~p0, B-1>
   */
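  /* Divide that partial remainder by <d1, d0>, using the 3/2 inverse v1, to
     obtain the low limb v0 of the 4/2 inverse; the remainder <t1, t0> is
     discarded.  */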
  udiv_qr_3by2 (v0, t1, t0, ~p1, ~p0, MP_LIMB_T_MAX, d1, d0, v1);
  di[0] = v0;
  di[1] = v1;

#if SANITY_CHECK
  {
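    /* Verify that <1, di[1], di[0]> == floor ((B^4 - 1) / <d1, d0>):
       <d1, d0> * (B^2 + <di[1], di[0]>) must have B-1 in both high limbs,
       and adding <d1, d0> once more must carry out of the low two limbs.  */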
    mp_limb_t tp[4];
    mp_limb_t dp[2];
    dp[0] = d0;
    dp[1] = d1;
    mpn_mul_n (tp, dp, di, 2);
    ASSERT_ALWAYS (mpn_add_n (tp+2, tp+2, dp, 2) == 0);
    ASSERT_ALWAYS (tp[2] == MP_LIMB_T_MAX);
    ASSERT_ALWAYS (tp[3] == MP_LIMB_T_MAX);
    ASSERT_ALWAYS (mpn_add_n (tp, tp, dp, 2) == 1);
  }
#endif
}

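/* Divide {np, nn} by the normalized divisor <d1, d0>, two limbs per step,
   using the two-limb (4/2) inverse <di1, di0>.  Store the quotient at
   {qp, nn-2}, the remainder at {rp, 2}, and return the high quotient limb.  */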
static mp_limb_t
mpn_div_qr_2n_pi2 (mp_ptr qp, mp_ptr rp, mp_srcptr np, mp_size_t nn,
		   mp_limb_t d1, mp_limb_t d0, mp_limb_t di1, mp_limb_t di0)
{
  mp_limb_t qh;
  mp_size_t i;
  mp_limb_t r1, r0;

  ASSERT (nn >= 2);
  ASSERT (d1 & GMP_NUMB_HIGHBIT);

  r1 = np[nn-1];
  r0 = np[nn-2];

  qh = 0;
  if (r1 >= d1 && (r1 > d1 || r0 >= d0))
    {
#if GMP_NAIL_BITS == 0
      sub_ddmmss (r1, r0, r1, r0, d1, d0);
#else
      r0 = r0 - d0;
      r1 = r1 - d1 - (r0 >> GMP_LIMB_BITS - 1);
      r0 &= GMP_NUMB_MASK;
#endif
      qh = 1;
    }

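  /* Main loop: consume two dividend limbs per iteration, producing two
     quotient limbs with one 4/2 division step; <r1, r0> carries the running
     remainder.  */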
  for (i = nn - 2; i >= 2; i -= 2)
    {
      mp_limb_t n1, n0, q1, q0;
      n1 = np[i-1];
      n0 = np[i-2];
      udiv_qr_4by2 (q1, q0, r1, r0, r1, r0, n1, n0, d1, d0, di1, di0);
      qp[i-1] = q1;
      qp[i-2] = q0;
    }

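  /* For odd nn a single dividend limb remains; one 3/2 division step, using
     only the high inverse limb di1, produces the final quotient limb.  */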
  if (i > 0)
    {
      mp_limb_t q;
      udiv_qr_3by2 (q, r1, r0, r1, r0, np[0], d1, d0, di1);
      qp[0] = q;
    }
  rp[1] = r1;
  rp[0] = r0;

  return qh;
}


/* Divide num {np,nn} by den {dp,2} and write the nn-2 least
   significant quotient limbs at qp and the 2-limb remainder at rp.
   Return the most significant limb of the quotient.

   Preconditions:
   1. qp must either not overlap with the other operands at all, or
      qp >= np + 2 must hold true.  (This means that it's possible to put
      the quotient in the high part of {np,nn}, right above the remainder.)
   2. nn >= 2.  */

mp_limb_t
mpn_div_qr_2 (mp_ptr qp, mp_ptr rp, mp_srcptr np, mp_size_t nn,
	      mp_srcptr dp)
{
  mp_limb_t d1;
  mp_limb_t d0;
  gmp_pi1_t dinv;

  ASSERT (nn >= 2);
  ASSERT (! MPN_OVERLAP_P (qp, nn-2, np, nn) || qp >= np + 2);
  ASSERT_MPN (np, nn);
  ASSERT_MPN (dp, 2);

  d1 = dp[1]; d0 = dp[0];

  ASSERT (d1 > 0);

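  /* Three cases: an already normalized divisor is handled with either the
     one-limb (3/2) inverse or, for nn at or above DIV_QR_2_PI2_THRESHOLD,
     the two-limb (4/2) inverse from invert_4by2; an unnormalized divisor is
     shifted into normalized form and passed to mpn_div_qr_2u_pi1 together
     with the shift count.  */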
  if (UNLIKELY (d1 & GMP_NUMB_HIGHBIT))
    {
      if (BELOW_THRESHOLD (nn, DIV_QR_2_PI2_THRESHOLD))
	{
	  gmp_pi1_t dinv;
	  invert_pi1 (dinv, d1, d0);
	  return mpn_div_qr_2n_pi1 (qp, rp, np, nn, d1, d0, dinv.inv32);
	}
      else
	{
	  mp_limb_t di[2];
	  invert_4by2 (di, d1, d0);
	  return mpn_div_qr_2n_pi2 (qp, rp, np, nn, d1, d0, di[1], di[0]);
	}
    }
  else
    {
      int shift;
      count_leading_zeros (shift, d1);
      d1 = (d1 << shift) | (d0 >> (GMP_LIMB_BITS - shift));
      d0 <<= shift;
      invert_pi1 (dinv, d1, d0);
      return mpn_div_qr_2u_pi1 (qp, rp, np, nn, d1, d0, shift, dinv.inv32);
    }
}
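
/* Editor's illustration (hypothetical helper, never compiled): calling the
   internal function directly with a 4-limb dividend.  Real code should go
   through the documented mpn interfaces, per the warning at the top of this
   file.  */
#if 0
static void
example_mpn_div_qr_2 (void)
{
  mp_limb_t np[4] = { 1, 2, 3, 4 };	/* dividend, least significant limb first */
  mp_limb_t dp[2] = { 5, 6 };		/* divisor; dp[1] must be nonzero */
  mp_limb_t qp[2], rp[2], qh;

  qh = mpn_div_qr_2 (qp, rp, np, 4, dp);

  /* Now <qh, qp[1], qp[0]> * <dp[1], dp[0]> + <rp[1], rp[0]>
     == <np[3], np[2], np[1], np[0]>, with <rp[1], rp[0]> < <dp[1], dp[0]>.  */
}
#endif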