/* mpn_div_qr_1n_pi1

   Contributed to the GNU project by Niels Möller

   THIS FILE CONTAINS INTERNAL FUNCTIONS WITH MUTABLE INTERFACES.  IT IS ONLY
   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.


Copyright 2013 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */

#include "gmp-impl.h"
#include "longlong.h"

#if GMP_NAIL_BITS > 0
#error Nail bits not supported
#endif

#ifndef DIV_QR_1N_METHOD
#define DIV_QR_1N_METHOD 2
#endif

/* FIXME: Duplicated in mod_1_1.c. Move to gmp-impl.h */

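/* add_mssaaaa(m, s1, s0, a1, a0, b1, b0) computes the two-limb sum
   (s1,s0) = (a1,a0) + (b1,b0) and sets m to the negated carry out of the
   high limb: m = 0 when the sum fits in two limbs, m = ~0 (all ones)
   otherwise.  The assembly variants below and the generic C fallback all
   implement this contract; the division loops further down use m as a
   mask.  */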
#if defined (__GNUC__) && ! defined (NO_ASM)

#if HAVE_HOST_CPU_FAMILY_x86 && W_TYPE_SIZE == 32
#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0) \
  __asm__ ( "add %6, %k2\n\t" \
            "adc %4, %k1\n\t" \
            "sbb %k0, %k0" \
          : "=r" (m), "=r" (s1), "=&r" (s0) \
          : "1" ((USItype)(a1)), "g" ((USItype)(b1)), \
            "%2" ((USItype)(a0)), "g" ((USItype)(b0)))
#endif

#if HAVE_HOST_CPU_FAMILY_x86_64 && W_TYPE_SIZE == 64
#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0) \
  __asm__ ( "add %6, %q2\n\t" \
            "adc %4, %q1\n\t" \
            "sbb %q0, %q0" \
          : "=r" (m), "=r" (s1), "=&r" (s0) \
          : "1" ((UDItype)(a1)), "rme" ((UDItype)(b1)), \
            "%2" ((UDItype)(a0)), "rme" ((UDItype)(b0)))
#endif

#if defined (__sparc__) && W_TYPE_SIZE == 32
#define add_mssaaaa(m, sh, sl, ah, al, bh, bl) \
  __asm__ ( "addcc %r5, %6, %2\n\t" \
            "addxcc %r3, %4, %1\n\t" \
            "subx %%g0, %%g0, %0" \
          : "=r" (m), "=r" (sh), "=&r" (sl) \
          : "rJ" (ah), "rI" (bh), "%rJ" (al), "rI" (bl) \
            __CLOBBER_CC)
#endif

#if defined (__sparc__) && W_TYPE_SIZE == 64
#define add_mssaaaa(m, sh, sl, ah, al, bh, bl) \
  __asm__ ( "addcc %r5, %6, %2\n\t" \
            "addccc %r7, %8, %%g0\n\t" \
            "addccc %r3, %4, %1\n\t" \
            "clr %0\n\t" \
            "movcs %%xcc, -1, %0" \
          : "=r" (m), "=r" (sh), "=&r" (sl) \
          : "rJ" (ah), "rI" (bh), "%rJ" (al), "rI" (bl), \
            "rJ" ((al) >> 32), "rI" ((bl) >> 32) \
            __CLOBBER_CC)
#if __VIS__ >= 0x300
#undef add_mssaaaa
#define add_mssaaaa(m, sh, sl, ah, al, bh, bl) \
  __asm__ ( "addcc %r5, %6, %2\n\t" \
            "addxccc %r3, %4, %1\n\t" \
            "clr %0\n\t" \
            "movcs %%xcc, -1, %0" \
          : "=r" (m), "=r" (sh), "=&r" (sl) \
          : "rJ" (ah), "rI" (bh), "%rJ" (al), "rI" (bl) \
            __CLOBBER_CC)
#endif
#endif

#if HAVE_HOST_CPU_FAMILY_powerpc && !defined (_LONG_LONG_LIMB)
/* This works fine for 32-bit and 64-bit limbs, except for 64-bit limbs with a
   processor running in 32-bit mode, since the carry flag then gets the 32-bit
   carry.  */
#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0) \
  __asm__ ( "add%I6c %2, %5, %6\n\t" \
            "adde %1, %3, %4\n\t" \
            "subfe %0, %0, %0\n\t" \
            "nor %0, %0, %0" \
          : "=r" (m), "=r" (s1), "=&r" (s0) \
          : "r" (a1), "r" (b1), "%r" (a0), "rI" (b0) \
            __CLOBBER_CC)
#endif

#if defined (__s390x__) && W_TYPE_SIZE == 64
#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0) \
  __asm__ ( "algr %2, %6\n\t" \
            "alcgr %1, %4\n\t" \
            "lghi %0, 0\n\t" \
            "alcgr %0, %0\n\t" \
            "lcgr %0, %0" \
          : "=r" (m), "=r" (s1), "=&r" (s0) \
          : "1" ((UDItype)(a1)), "r" ((UDItype)(b1)), \
            "%2" ((UDItype)(a0)), "r" ((UDItype)(b0)) __CLOBBER_CC)
#endif

#if defined (__arm__) && !defined (__thumb__) && W_TYPE_SIZE == 32
#define add_mssaaaa(m, sh, sl, ah, al, bh, bl) \
  __asm__ ( "adds %2, %5, %6\n\t" \
            "adcs %1, %3, %4\n\t" \
            "movcc %0, #0\n\t" \
            "movcs %0, #-1" \
          : "=r" (m), "=r" (sh), "=&r" (sl) \
          : "r" (ah), "rI" (bh), "%r" (al), "rI" (bl) __CLOBBER_CC)
#endif

#if defined (__aarch64__) && W_TYPE_SIZE == 64
#define add_mssaaaa(m, sh, sl, ah, al, bh, bl) \
  __asm__ ( "adds %2, %x5, %6\n\t" \
            "adcs %1, %x3, %x4\n\t" \
            "csinv %0, xzr, xzr, cc\n\t" \
          : "=r" (m), "=r" (sh), "=&r" (sl) \
          : "rZ" (ah), "rZ" (bh), "%rZ" (al), "rI" (bl) __CLOBBER_CC)
#endif
#endif /* defined (__GNUC__) */

#ifndef add_mssaaaa
#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0) \
  do { \
    UWtype __s0, __s1, __c0, __c1; \
    __s0 = (a0) + (b0); \
    __s1 = (a1) + (b1); \
    __c0 = __s0 < (a0); \
    __c1 = __s1 < (a1); \
    (s0) = __s0; \
    __s1 = __s1 + __c0; \
    (s1) = __s1; \
    (m) = - (__c1 + (__s1 < __c0)); \
  } while (0)
#endif

#if DIV_QR_1N_METHOD == 1

/* Divides (uh B^n + {up, n}) by d, storing the quotient at {qp, n}.
   Requires that uh < d.  */
mp_limb_t
mpn_div_qr_1n_pi1 (mp_ptr qp, mp_srcptr up, mp_size_t n, mp_limb_t uh,
                   mp_limb_t d, mp_limb_t dinv)
{
  ASSERT (n > 0);
  ASSERT (uh < d);
  ASSERT (d & GMP_NUMB_HIGHBIT);
  ASSERT (MPN_SAME_OR_SEPARATE_P (qp, up, n));

  do
    {
      mp_limb_t q, ul;

      ul = up[--n];
      udiv_qrnnd_preinv (q, uh, uh, ul, d, dinv);
      qp[n] = q;
    }
  while (n > 0);

  return uh;
}

#elif DIV_QR_1N_METHOD == 2

/* The main idea of this algorithm is to write B^2 = d (B + dinv) +
   B2, where 1 <= B2 < d.  Similarly to mpn_mod_1_1p, each iteration
   can then replace

     u1 B^2 = u1 B2 (mod d)

   which gives a very short critical path for computing the remainder
   (with some tricks to handle the carry when the next two lower limbs
   are added in).  To also get the quotient, include the corresponding
   multiple of d in the expression,

     u1 B^2 = u1 B2 + (u1 dinv + u1 B) d

   We get the quotient by accumulating the (u1 dinv + u1 B) terms.  The
   two multiplies, u1 * B2 and u1 * dinv, are independent, and can be
   executed in parallel.
*/
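
/* Illustrative toy example of the identity above (not part of the
   algorithm; it uses a hypothetical 4-bit limb, B = 16, instead of a real
   GMP limb).  Take d = 13, so dinv = floor((B^2-1)/d) - B = 3 and
   B2 = B^2 - d (B + dinv) = 256 - 13*19 = 9, with 1 <= 9 < 13.  For u1 = 5,

     u1 B^2 = 1280 = 45 + 95*13 = u1 B2 + (u1 dinv + u1 B) d

   so replacing u1 B^2 by u1 B2 = 45 leaves the value unchanged mod d,
   while u1 dinv + u1 B = 95 is the corresponding quotient contribution.  */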
mp_limb_t
mpn_div_qr_1n_pi1 (mp_ptr qp, mp_srcptr up, mp_size_t n, mp_limb_t u1,
                   mp_limb_t d, mp_limb_t dinv)
{
  mp_limb_t B2;
  mp_limb_t u0, u2;
  mp_limb_t q0, q1;
  mp_limb_t p0, p1;
  mp_limb_t t;
  mp_size_t j;

  ASSERT (d & GMP_LIMB_HIGHBIT);
  ASSERT (n > 0);
  ASSERT (u1 < d);

  if (n == 1)
    {
      udiv_qrnnd_preinv (qp[0], u1, u1, up[0], d, dinv);
      return u1;
    }

  /* FIXME: Could be precomputed */
  B2 = -d*dinv;
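  /* Reducing B^2 = d (B + dinv) + B2 modulo B gives d dinv + B2 = 0 (mod B),
     so the single-limb product -d*dinv above is exactly B2, since B2 < B.  */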

  umul_ppmm (q1, q0, dinv, u1);
  umul_ppmm (p1, p0, B2, u1);
  q1 += u1;
  ASSERT (q1 >= u1);
  u0 = up[n-1];  /* Early read, to allow qp == up. */
  qp[n-1] = q1;

  add_mssaaaa (u2, u1, u0, u0, up[n-2], p1, p0);
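  /* u2 is the mask output of add_mssaaaa: 0 if the two-limb sum fits, ~0 on
     carry out.  Inside the loop it is only used masked (u2 & B2, u2 & dinv)
     or negated (-u2).  */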

  /* FIXME: Keep q1 in a variable between iterations, to reduce number
     of memory accesses. */
  for (j = n-2; j-- > 0; )
    {
      mp_limb_t q2, cy;

      /* Additions for the q update:
       *        +-------+
       *        |u1 * v |
       *        +---+---+
       *        | u1|
       *    +---+---+
       *    | 1 | v |  (conditional on u2)
       *    +---+---+
       *        | 1 |  (conditional on u0 + u2 B2 carry)
       *        +---+
       * +      | q0|
       *   -+---+---+---+
       *    | q2| q1| q0|
       *    +---+---+---+
       */
      umul_ppmm (p1, t, u1, dinv);
      ADDC_LIMB (cy, u0, u0, u2 & B2);
      u0 -= (-cy) & d;
      add_ssaaaa (q2, q1, -u2, u2 & dinv, CNST_LIMB(0), u1);
      add_ssaaaa (q2, q1, q2, q1, CNST_LIMB(0), q0);
      q0 = t;

      /* Note that p1 + cy cannot overflow */
      add_ssaaaa (q2, q1, q2, q1, CNST_LIMB(0), p1 + cy);

      umul_ppmm (p1, p0, u1, B2);

      qp[j+1] = q1;
      MPN_INCR_U (qp+j+2, n-j-2, q2);

      add_mssaaaa (u2, u1, u0, u0, up[j], p1, p0);
    }

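  /* Final reduction: fold the pending carry mask u2 and, if still needed, one
     further multiple of d into the remainder, counting each subtraction of d
     in q1, then divide out the last limb.  */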
  q1 = (u2 > 0);
  u1 -= (-q1) & d;

  t = (u1 >= d);
  q1 += t;
  u1 -= (-t) & d;

  udiv_qrnnd_preinv (t, u0, u1, u0, d, dinv);
  add_ssaaaa (q1, q0, q1, q0, CNST_LIMB(0), t);

  MPN_INCR_U (qp+1, n-1, q1);

  qp[0] = q0;
  return u0;
}

#elif DIV_QR_1N_METHOD == 3

/* This variant handles carry from the u update earlier.  This gives a
   longer critical path, but reduces the work needed for the
   quotients.  */
mp_limb_t
mpn_div_qr_1n_pi1 (mp_ptr qp, mp_srcptr up, mp_size_t n, mp_limb_t u1,
                   mp_limb_t d, mp_limb_t dinv)
{
  mp_limb_t B2;
  mp_limb_t cy, u0;
  mp_limb_t q0, q1;
  mp_limb_t p0, p1;
  mp_limb_t t;
  mp_size_t j;

  ASSERT (d & GMP_LIMB_HIGHBIT);
  ASSERT (n > 0);
  ASSERT (u1 < d);

  if (n == 1)
    {
      udiv_qrnnd_preinv (qp[0], u1, u1, up[0], d, dinv);
      return u1;
    }

  /* FIXME: Could be precomputed */
  B2 = -d*dinv;

  umul_ppmm (q1, q0, dinv, u1);
  umul_ppmm (p1, p0, B2, u1);
  q1 += u1;
  ASSERT (q1 >= u1);
  u0 = up[n-1];  /* Early read, to allow qp == up. */

  add_mssaaaa (cy, u1, u0, u0, up[n-2], p1, p0);
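  /* cy is the add_mssaaaa mask, 0 or ~0.  When it is set, the two lines below
     subtract d from the remainder and add 1 to the quotient (q1 -= cy),
     handling the carry immediately rather than deferring it as in method 2.  */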
  u1 -= cy & d;
  q1 -= cy;
  qp[n-1] = q1;

  /* FIXME: Keep q1 in a variable between iterations, to reduce number
     of memory accesses. */
  for (j = n-2; j-- > 0; )
    {
      mp_limb_t q2, cy;
      mp_limb_t t1, t0;

      /* Additions for the q update:
       *        +-------+
       *        |u1 * v |
       *        +---+---+
       *        | u1|
       *        +---+
       *        | 1 |  (conditional on {u1, u0} carry)
       *        +---+
       * +      | q0|
       *   -+---+---+---+
       *    | q2| q1| q0|
       *    +---+---+---+
       *
       * Additions for the u update:
       *        +-------+
       *        |u1 * B2|
       *        +---+---+
       *      + |u0 |u-1|
       *        +---+---+
       *      - | d |  (conditional on carry)
       *       -+---+---+
       *        |u1 | u0|
       *        +---+---+
       *
       */
      umul_ppmm (p1, p0, u1, B2);
      ADDC_LIMB (q2, q1, u1, q0);
      umul_ppmm (t1, t0, u1, dinv);
      add_mssaaaa (cy, u1, u0, u0, up[j], p1, p0);
      u1 -= cy & d;

      /* t1 <= B-2, so cy can be added in without overflow. */
      add_ssaaaa (q2, q1, q2, q1, CNST_LIMB(0), t1 - cy);
      q0 = t0;

      /* Final q update */
      qp[j+1] = q1;
      MPN_INCR_U (qp+j+2, n-j-2, q2);
    }

  q1 = (u1 >= d);
  u1 -= (-q1) & d;

  udiv_qrnnd_preinv (t, u0, u1, u0, d, dinv);
  add_ssaaaa (q1, q0, q1, q0, CNST_LIMB(0), t);

  MPN_INCR_U (qp+1, n-1, q1);

  qp[0] = q0;
  return u0;
}

#elif DIV_QR_1N_METHOD == 4

mp_limb_t
mpn_div_qr_1n_pi1 (mp_ptr qp, mp_srcptr up, mp_size_t n, mp_limb_t u1,
                   mp_limb_t d, mp_limb_t dinv)
{
  mp_limb_t B2;
  mp_limb_t u2, u0;
  mp_limb_t q0, q1;
  mp_limb_t p0, p1;
  mp_limb_t B2d0, B2d1;
  mp_limb_t t;
  mp_size_t j;

  ASSERT (d & GMP_LIMB_HIGHBIT);
  ASSERT (n > 0);
  ASSERT (u1 < d);

  if (n == 1)
    {
      udiv_qrnnd_preinv (qp[0], u1, u1, up[0], d, dinv);
      return u1;
    }

  /* FIXME: Could be precomputed */
  B2 = -d*dinv;
  /* B2 * (B-d) */
  umul_ppmm (B2d1, B2d0, B2, -d);
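  /* As a limb value, -d equals B - d, so (B2d1,B2d0) is the two-limb product
     B2 * (B - d); the loop below adds it in masked by u2, which is the
     conditional |B2(B-d)| row in the diagram further down.  */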

  umul_ppmm (q1, q0, dinv, u1);
  umul_ppmm (p1, p0, B2, u1);
  q1 += u1;
  ASSERT (q1 >= u1);

  add_mssaaaa (u2, u1, u0, up[n-1], up[n-2], p1, p0);

  /* After read of up[n-1], to allow qp == up. */
  qp[n-1] = q1 - u2;

  /* FIXME: Keep q1 in a variable between iterations, to reduce number
     of memory accesses. */
  for (j = n-2; j-- > 0; )
    {
      mp_limb_t q2, cy;
      mp_limb_t t1, t0;

      /* Additions for the q update.  *After* u1 -= u2 & d adjustment.
       *        +-------+
       *        |u1 * v |
       *        +---+---+
       *        | u1|
       *        +---+
       *        | 1 |  (conditional on {u1, u0} carry)
       *        +---+
       * +      | q0|
       *   -+---+---+---+
       *    | q2| q1| q0|
       *    +---+---+---+
       *
       * Additions for the u update.  *Before* u1 -= u2 & d adjustment.
       *        +-------+
       *        |u1 * B2|
       *        +---+---+
       *        |u0 |u-1|
       *        +---+---+
       *      + |B2(B-d)|  (conditional on u2)
       *   -+---+---+---+
       *    |u2 |u1 | u0|
       *    +---+---+---+
       *
       */
      /* Multiply with unadjusted u1, to shorten critical path. */
      umul_ppmm (p1, p0, u1, B2);
      u1 -= (d & u2);
      ADDC_LIMB (q2, q1, u1, q0);
      umul_ppmm (t1, t0, u1, dinv);

      add_mssaaaa (cy, u1, u0, u0, up[j], u2 & B2d1, u2 & B2d0);
      add_mssaaaa (u2, u1, u0, u1, u0, p1, p0);
      u2 += cy;
      ASSERT(-u2 <= 1);
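      /* The two carries cannot both be set, so u2 is again 0 or ~0 (which is
         what the ASSERT above checks).  It is added into the quotient below
         (t1 - u2 adds one when u2 is set) and consumed as a mask by
         u1 -= (d & u2) at the start of the next iteration and by
         u1 -= u2 & d after the loop.  */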

      /* t1 <= B-2, so u2 can be added in without overflow. */
      add_ssaaaa (q2, q1, q2, q1, CNST_LIMB(0), t1 - u2);
      q0 = t0;

      /* Final q update */
      qp[j+1] = q1;
      MPN_INCR_U (qp+j+2, n-j-2, q2);
    }
  u1 -= u2 & d;

  q1 = (u1 >= d);
  u1 -= (-q1) & d;

  udiv_qrnnd_preinv (t, u0, u1, u0, d, dinv);
  add_ssaaaa (q1, q0, q1, q0, CNST_LIMB(0), t);

  MPN_INCR_U (qp+1, n-1, q1);

  qp[0] = q0;
  return u0;
}
#else
#error Unknown DIV_QR_1N_METHOD
#endif