/* mpn_div_qr_1u_pi2.

   THIS FILE CONTAINS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE.  IT IS
   ONLY SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.

Copyright 2013, 2017 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */

/* ISSUES:

   * Can we really use the high pi2 inverse limb for udiv_qrnnd_preinv?

   * Are there any problems with generating n quotient limbs in the q area?
     It surely simplifies things.

   * Not yet adequately tested.
*/

#include "gmp-impl.h"
#include "longlong.h"

/* Define some longlong.h-style macros, but for wider operations.
 * add_sssaaaa is like longlong.h's add_ssaaaa, but propagates the
   carry-out into an additional sum operand.
*/
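
/* Concretely, add_sssaaaa computes, with B = 2^GMP_NUMB_BITS,

     (s1,s0) <- (a1,a0) + (b1,b0)
     s2      <- s2 + carry-out

   i.e. the carry out of the two-limb addition is accumulated into the
   extra sum limb s2 (this is what the generic C fallback below does).  */
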
#if defined (__GNUC__) && ! defined (__INTEL_COMPILER) && ! defined (NO_ASM)

#if HAVE_HOST_CPU_FAMILY_x86 && W_TYPE_SIZE == 32
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)                         \
  __asm__ ("add\t%7, %k2\n\tadc\t%5, %k1\n\tadc\t$0, %k0"               \
           : "=r" (s2), "=&r" (s1), "=&r" (s0)                          \
           : "0"  ((USItype)(s2)),                                      \
             "1"  ((USItype)(a1)), "g" ((USItype)(b1)),                 \
             "%2" ((USItype)(a0)), "g" ((USItype)(b0)))
#endif

#if defined (__amd64__) && W_TYPE_SIZE == 64
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)                         \
  __asm__ ("add\t%7, %q2\n\tadc\t%5, %q1\n\tadc\t$0, %q0"               \
           : "=r" (s2), "=&r" (s1), "=&r" (s0)                          \
           : "0"  ((UDItype)(s2)),                                      \
             "1"  ((UDItype)(a1)), "rme" ((UDItype)(b1)),               \
             "%2" ((UDItype)(a0)), "rme" ((UDItype)(b0)))
#endif

#if defined (__aarch64__) && W_TYPE_SIZE == 64
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)                         \
  __asm__ ("adds\t%2, %x6, %7\n\tadcs\t%1, %x4, %x5\n\tadc\t%0, %x3, xzr"\
           : "=r" (s2), "=&r" (s1), "=&r" (s0)                          \
           : "rZ" (s2), "%rZ" (a1), "rZ" (b1), "%rZ" (a0), "rI" (b0)    \
             __CLOBBER_CC)
#endif

#if HAVE_HOST_CPU_FAMILY_powerpc && !defined (_LONG_LONG_LIMB)
/* This works fine for 32-bit and 64-bit limbs, except for 64-bit limbs with a
   processor running in 32-bit mode, since the carry flag then gets the 32-bit
   carry.  */
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)                         \
  __asm__ ("add%I7c\t%2,%6,%7\n\tadde\t%1,%4,%5\n\taddze\t%0,%3"        \
           : "=r" (s2), "=&r" (s1), "=&r" (s0)                          \
           : "r" (s2), "r" (a1), "r" (b1), "%r" (a0), "rI" (b0)         \
             __CLOBBER_CC)
#endif

#endif /* __GNUC__ */

#ifndef add_sssaaaa
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)                         \
  do {                                                                  \
    UWtype __s0, __s1, __c0, __c1;                                      \
    __s0 = (a0) + (b0);                                                 \
    __s1 = (a1) + (b1);                                                 \
    __c0 = __s0 < (a0);                                                 \
    __c1 = __s1 < (a1);                                                 \
    (s0) = __s0;                                                        \
    __s1 = __s1 + __c0;                                                 \
    (s1) = __s1;                                                        \
    (s2) += __c1 + (__s1 < __c0);                                       \
  } while (0)
#endif
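
/* A small worked example of the fallback (illustrative only): with
   B = 2^GMP_NUMB_BITS, starting from s2 = 0 and {s1,s0} = {B-1,B-1},

     add_sssaaaa (s2, s1, s0, s1, s0, 0, 1);

   leaves {s2,s1,s0} = {1,0,0}, since {B-1,B-1} + {0,1} carries out of
   the two-limb sum and the carry lands in s2.  */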

/* Precomputed constants for dividing by a single unnormalized limb d
   (most significant bit clear): a two-limb inverse of the normalized
   divisor d << norm_cnt, the divisor itself, and its leading-zero
   count.  */
struct precomp_div_1_pi2
{
  mp_limb_t dip[2];	/* two-limb inverse, high limb in dip[1] */
  mp_limb_t d;		/* the divisor; GMP_NUMB_HIGHBIT is clear */
  int norm_cnt;		/* number of leading zero bits in d */
};
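
/* How a caller might fill in the struct, assuming dip[] holds
   floor((B^3-1)/d0) - B^2 for the normalized divisor d0 = d << norm_cnt
   (a plausible reading of "pi2 inverse"; this file does not itself pin
   down the convention, so treat this sketch as an assumption, not as the
   documented interface):

     int cnt;
     mp_limb_t d0, v, p1, p0, r1h, r1, q0, rr;

     count_leading_zeros (cnt, d);
     pd->d = d;
     pd->norm_cnt = cnt;
     d0 = d << cnt;
     invert_limb (v, d0);               // v = floor((B^2-1)/d0) - B
     pd->dip[1] = v;
     // r1 = (B^2-1) - (B+v)*d0, the remainder matching v
     umul_ppmm (p1, p0, v, d0);
     sub_ddmmss (r1h, r1, ~d0, ~CNST_LIMB (0), p1, p0);    // r1h == 0
     // low inverse limb: floor ((r1*B + B-1) / d0)
     udiv_qrnnd_preinv (q0, rr, r1, ~CNST_LIMB (0), d0, v);
     pd->dip[0] = q0;
*/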

mp_limb_t
mpn_div_qr_1u_pi2 (mp_ptr qp,
		   mp_srcptr up, mp_size_t un,
		   struct precomp_div_1_pi2 *pd)
{
  mp_size_t i;
  mp_limb_t r, u2, u1, u0;
  mp_limb_t d0, di1, di0;
  mp_limb_t q3a, q2a, q2b, q1b, q2c, q1c, q1d, q0d;
  mp_limb_t cnd;
  int cnt;

  ASSERT (un >= 2);
  ASSERT ((pd->d & GMP_NUMB_HIGHBIT) == 0);
  ASSERT (! MPN_OVERLAP_P (qp, un-2, up, un) || qp+2 >= up);
  ASSERT_MPN (up, un);
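
  /* Overview: divide {up,un} by pd->d, writing un quotient limbs to
     {qp,un} and returning the remainder.  The dividend is shifted left
     by norm_cnt bits on the fly, and the main loop produces two
     quotient limbs per iteration from the two-limb inverse.  (That a
     full un limbs of quotient are generated is deliberate; see the
     ISSUES note above.)  */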

  /* Names for the limbs of the current quotient chunk; they alias the
     accumulation variables that end up holding those limbs.  */
#define q3 q3a
#define q2 q2b
#define q1 q1b

  up += un - 3;				/* maintain a window up[0..2] */
  cnt = pd->norm_cnt;			/* >= 1, since d's high bit is clear */
  r = up[2] >> (GMP_NUMB_BITS - cnt);	/* high bits of the shifted dividend */
  d0 = pd->d << cnt;			/* normalized divisor */

  qp += un - 2;				/* quotient is emitted high to low */

  di1 = pd->dip[1];			/* high and low limbs of the inverse */
  di0 = pd->dip[0];

  /* Each iteration consumes two source limbs and emits two quotient
     limbs.  */
  for (i = un - 3; i >= 0; i -= 2)
    {
      u2 = r;
      u1 = (up[2] << cnt) | (up[1] >> (GMP_NUMB_BITS - cnt));
      u0 = (up[1] << cnt) | (up[0] >> (GMP_NUMB_BITS - cnt));

      /* Dividend in {r,u1,u0} */
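      /* Multiply the top two dividend limbs by the two-limb inverse and
	 fold in {u1,u0}: the products and sums below accumulate a
	 quotient approximation in {q3,q2} (via r), with q1 acting as a
	 fraction limb that drives the adjustment step further down.  */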

      umul_ppmm (q1d,q0d, u1, di0);
      umul_ppmm (q2b,q1b, u1, di1);
      q2b++;	/* cannot spill: q2b is the high limb of a product,
		   hence at most B-2 here */
      add_sssaaaa (r,q2b,q1b, q2b,q1b, u1,u0);

      umul_ppmm (q2c,q1c, u2, di0);
      add_sssaaaa (r,q2b,q1b, q2b,q1b, q2c,q1c);
      umul_ppmm (q3a,q2a, u2, di1);

      add_sssaaaa (r,q2b,q1b, q2b,q1b, q2a,q1d);

      q3 += r;

      r = u0 - q2 * d0;		/* candidate remainder, computed mod B */

      /* If the candidate remainder wrapped (detected by comparing it
	 against the fraction limb q1), the quotient estimate is one too
	 large: add d0 back and decrement {q3,q2}.  */
      cnd = (r >= q1);
      r += d0 & -cnd;
      sub_ddmmss (q3,q2, q3,q2, 0,cnd);

      /* Rarely, r can still be >= d0; apply a final correction.  */
      if (UNLIKELY (r >= d0))
	{
	  r -= d0;
	  add_ssaaaa (q3,q2, q3,q2, 0,1);
	}

      qp[0] = q2;
      qp[1] = q3;

      up -= 2;
      qp -= 2;
    }

  if ((un & 1) != 0)
    {
      /* Odd un: one normalized limb remains.  Divide {r, up[2] << cnt}
	 by d0 with a single 2/1 division, using the high inverse limb
	 (cf. the first ISSUES note above).  */
      u2 = r;
      u1 = (up[2] << cnt);

      udiv_qrnnd_preinv (q3, r, u2, u1, d0, di1);
      qp[1] = q3;
    }
  else
    {
      /* Even un: two normalized limbs remain; repeat the loop body's
	 computation once more for the final quotient limbs.  */
      u2 = r;
      u1 = (up[2] << cnt) | (up[1] >> (GMP_NUMB_BITS - cnt));
      u0 = (up[1] << cnt);

      /* Dividend in {r,u1,u0} */

      umul_ppmm (q1d,q0d, u1, di0);
      umul_ppmm (q2b,q1b, u1, di1);
      q2b++;	/* cannot spill, as in the main loop */
      add_sssaaaa (r,q2b,q1b, q2b,q1b, u1,u0);

      umul_ppmm (q2c,q1c, u2, di0);
      add_sssaaaa (r,q2b,q1b, q2b,q1b, q2c,q1c);
      umul_ppmm (q3a,q2a, u2, di1);

      add_sssaaaa (r,q2b,q1b, q2b,q1b, q2a,q1d);

      q3 += r;

      r = u0 - q2 * d0;		/* candidate remainder, computed mod B */

      /* Same adjustment steps as in the main loop.  */
      cnd = (r >= q1);
      r += d0 & -cnd;
      sub_ddmmss (q3,q2, q3,q2, 0,cnd);

      if (UNLIKELY (r >= d0))
	{
	  r -= d0;
	  add_ssaaaa (q3,q2, q3,q2, 0,1);
	}

      qp[0] = q2;
      qp[1] = q3;
    }

  return r >> cnt;		/* denormalize the remainder */

#undef q3
#undef q2
#undef q1
}
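
/* Usage sketch (hypothetical; preparing the struct is not part of this
   file, see the sketch above the struct definition):

     struct precomp_div_1_pi2 pd;
     mp_limb_t r;

     ... fill in pd for some divisor d with its high bit clear ...
     r = mpn_div_qr_1u_pi2 (qp, up, un, &pd);
     // now {qp,un} is the quotient and r the remainder of {up,un} / d
*/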