gmp-6.3.0/mpn/generic/div_qr_1n_pi2.c
/* mpn_div_qr_1n_pi2.

   THIS FILE CONTAINS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE.  IT IS
   ONLY SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.

Copyright 2013, 2017 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */

/* ISSUES:

   * Can we really use the high pi2 inverse limb for udiv_qrnnd_preinv?

   * Are there any problems with generating n quotient limbs in the q area?  It
     surely simplifies things.

   * Not yet adequately tested.
*/

#include "gmp-impl.h"
#include "longlong.h"

/* Define some longlong.h-style macros, but for wider operations.
   * add_sssaaaa is like longlong.h's add_ssaaaa but propagating carry-out into
     an additional sum operand.
*/
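/* That is, add_sssaaaa computes
     (s1,s0) <- (a1,a0) + (b1,b0)  (mod B^2)  and  s2 <- s2 + carry-out,
   where B = 2^W_TYPE_SIZE; see the generic C fallback below for the exact
   semantics.  */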
#if defined (__GNUC__)  && ! defined (__INTEL_COMPILER) && ! defined (NO_ASM)

#if HAVE_HOST_CPU_FAMILY_x86 && W_TYPE_SIZE == 32
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)				\
  __asm__ ("add\t%7, %k2\n\tadc\t%5, %k1\n\tadc\t$0, %k0"		\
	   : "=r" (s2), "=&r" (s1), "=&r" (s0)				\
	   : "0"  ((USItype)(s2)),					\
	     "1"  ((USItype)(a1)), "g" ((USItype)(b1)),			\
	     "%2" ((USItype)(a0)), "g" ((USItype)(b0)))
#endif

#if defined (__amd64__) && W_TYPE_SIZE == 64
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)				\
  __asm__ ("add\t%7, %q2\n\tadc\t%5, %q1\n\tadc\t$0, %q0"		\
	   : "=r" (s2), "=&r" (s1), "=&r" (s0)				\
	   : "0"  ((UDItype)(s2)),					\
	     "1"  ((UDItype)(a1)), "rme" ((UDItype)(b1)),		\
	     "%2" ((UDItype)(a0)), "rme" ((UDItype)(b0)))
#endif

#if defined (__aarch64__) && W_TYPE_SIZE == 64
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)				\
  __asm__ ("adds\t%2, %x6, %7\n\tadcs\t%1, %x4, %x5\n\tadc\t%0, %x3, xzr"\
	   : "=r" (s2), "=&r" (s1), "=&r" (s0)				\
	   : "rZ" (s2), "%rZ"  (a1), "rZ" (b1), "%rZ" (a0), "rI" (b0)	\
	     __CLOBBER_CC)
#endif

#if HAVE_HOST_CPU_FAMILY_powerpc && !defined (_LONG_LONG_LIMB)
/* This works fine for 32-bit and 64-bit limbs, except for 64-bit limbs with a
   processor running in 32-bit mode, since the carry flag then gets the 32-bit
   carry.  */
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)				\
  __asm__ ("add%I7c\t%2,%6,%7\n\tadde\t%1,%4,%5\n\taddze\t%0,%3"	\
	   : "=r" (s2), "=&r" (s1), "=&r" (s0)				\
	   : "r"  (s2), "r"  (a1), "r" (b1), "%r" (a0), "rI" (b0)	\
	     __CLOBBER_CC)
#endif

#endif /* __GNUC__ */

#ifndef add_sssaaaa
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)				\
  do {									\
    UWtype __s0, __s1, __c0, __c1;					\
    __s0 = (a0) + (b0);							\
    __s1 = (a1) + (b1);							\
    __c0 = __s0 < (a0);							\
    __c1 = __s1 < (a1);							\
    (s0) = __s0;							\
    __s1 = __s1 + __c0;							\
    (s1) = __s1;							\
    (s2) += __c1 + (__s1 < __c0);					\
  } while (0)
#endif

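/* Precomputed constants for division by the invariant limb d: dip[] holds a
   two-limb inverse of d (low limb in dip[0], high limb in dip[1], as loaded
   into di0/di1 below), and d is the divisor itself, which must be normalized
   (high bit set) for this function.  norm_cnt is not read here; presumably it
   is the normalization shift count used by the unnormalized variants.  */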
struct precomp_div_1_pi2
{
  mp_limb_t dip[2];
  mp_limb_t d;
  int norm_cnt;
};

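/* Divide the un-limb dividend {up, un} by the normalized divisor pd->d, using
   the precomputed two-limb inverse in pd->dip.  All un quotient limbs are
   written to {qp, un} (cf. the second ISSUE above) and the remainder is
   returned.  */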
mp_limb_t
mpn_div_qr_1n_pi2 (mp_ptr qp,
		   mp_srcptr up, mp_size_t un,
		   struct precomp_div_1_pi2 *pd)
{
  mp_limb_t most_significant_q_limb;
  mp_size_t i;
  mp_limb_t r, u2, u1, u0;
  mp_limb_t d0, di1, di0;
  mp_limb_t q3a, q2a, q2b, q1b, q2c, q1c, q1d, q0d;
  mp_limb_t cnd;

  ASSERT (un >= 2);
  ASSERT ((pd->d & GMP_NUMB_HIGHBIT) != 0);
  ASSERT (! MPN_OVERLAP_P (qp, un-2, up, un) || qp+2 >= up);
  ASSERT_MPN (up, un);

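/* Working names: within an iteration, q3 and q2 are the high and low limbs of
   the two-limb quotient chunk; q1 is an extra low (guard) limb used only for
   the adjustment step below.  */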
#define q3 q3a
#define q2 q2b
#define q1 q1b

  up += un - 3;
  r = up[2];
  d0 = pd->d;

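  /* Reduce the most significant dividend limb modulo d0; the 0/1 overflow
     becomes the most significant quotient limb.  */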
  most_significant_q_limb = (r >= d0);
  r -= d0 & -most_significant_q_limb;

  qp += un - 3;
  qp[2] = most_significant_q_limb;

  di1 = pd->dip[1];
  di0 = pd->dip[0];

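  /* Main loop: each iteration divides the three-limb chunk {r, u1, u0} (with
     r < d0) by d0, producing two quotient limbs (q3, q2) and a new remainder
     r < d0.  The products with the two-limb inverse (di1, di0) form a
     fixed-point estimate of the quotient; presumably B^2 + di1*B + di0
     approximates B^3/d0, with the q2b++ below compensating for truncation.  */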
  for (i = un - 3; i >= 0; i -= 2)
    {
      u2 = r;
      u1 = up[1];
      u0 = up[0];

      /* Dividend in {r,u1,u0} */

      umul_ppmm (q1d,q0d, u1, di0);
      umul_ppmm (q2b,q1b, u1, di1);
      q2b++;				/* cannot spill */
      add_sssaaaa (r,q2b,q1b, q2b,q1b, u1,u0);

      umul_ppmm (q2c,q1c, u2,  di0);
      add_sssaaaa (r,q2b,q1b, q2b,q1b, q2c,q1c);
      umul_ppmm (q3a,q2a, u2, di1);

      add_sssaaaa (r,q2b,q1b, q2b,q1b, q2a,q1d);

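      /* r still holds u2 (copied but not cleared at the top of the loop) plus
	 the carries from the three wide additions above, so this also folds
	 the u2 contribution into the high quotient limb.  */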
      q3 += r;

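      /* Candidate remainder, computed modulo B only; the adjustments below
	 correct it together with the quotient (q3, q2).  */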
      r = u0 - q2 * d0;

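      /* r >= q1 signals a quotient estimate that is one too large: add d0
	 back into r and decrement (q3, q2) accordingly.  */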
      cnd = (r >= q1);
      r += d0 & -cnd;
      sub_ddmmss (q3,q2,  q3,q2,  0,cnd);

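      /* If r is still not reduced, subtract d0 once more and increment
	 (q3, q2); this case is expected to be rare.  */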
      if (UNLIKELY (r >= d0))
	{
	  r -= d0;
	  add_ssaaaa (q3,q2,  q3,q2,  0,1);
	}

      qp[0] = q2;
      qp[1] = q3;

      up -= 2;
      qp -= 2;
    }

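  /* For even un one dividend limb remains; divide r*B + up[1] by d0 with a
     single udiv_qrnnd_preinv step, using the high inverse limb di1 (cf. the
     first ISSUE above).  */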
  if ((un & 1) == 0)
    {
      u2 = r;
      u1 = up[1];

      udiv_qrnnd_preinv (q3, r, u2, u1, d0, di1);
      qp[1] = q3;
    }

  return r;

#undef q3
#undef q2
#undef q1
}