(root)/
gmp-6.3.0/
mpn/
generic/
sqrtrem.c
       1  /* mpn_sqrtrem -- square root and remainder
       2  
       3     Contributed to the GNU project by Paul Zimmermann (most code),
       4     Torbjorn Granlund (mpn_sqrtrem1) and Marco Bodrato (mpn_dc_sqrt).
       5  
       6     THE FUNCTIONS IN THIS FILE EXCEPT mpn_sqrtrem ARE INTERNAL WITH MUTABLE
       7     INTERFACES.  IT IS ONLY SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.
       8     IN FACT, IT IS ALMOST GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A
       9     FUTURE GMP RELEASE.
      10  
      11  Copyright 1999-2002, 2004, 2005, 2008, 2010, 2012, 2015, 2017 Free Software
      12  Foundation, Inc.
      13  
      14  This file is part of the GNU MP Library.
      15  
      16  The GNU MP Library is free software; you can redistribute it and/or modify
      17  it under the terms of either:
      18  
      19    * the GNU Lesser General Public License as published by the Free
      20      Software Foundation; either version 3 of the License, or (at your
      21      option) any later version.
      22  
      23  or
      24  
      25    * the GNU General Public License as published by the Free Software
      26      Foundation; either version 2 of the License, or (at your option) any
      27      later version.
      28  
      29  or both in parallel, as here.
      30  
      31  The GNU MP Library is distributed in the hope that it will be useful, but
      32  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
      33  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
      34  for more details.
      35  
      36  You should have received copies of the GNU General Public License and the
      37  GNU Lesser General Public License along with the GNU MP Library.  If not,
      38  see https://www.gnu.org/licenses/.  */
      39  
      40  
      41  /* See "Karatsuba Square Root", reference in gmp.texi.  */
      42  
      43  
      44  #include <stdio.h>
      45  #include <stdlib.h>
      46  
      47  #include "gmp-impl.h"
      48  #include "longlong.h"
      49  #define USE_DIVAPPR_Q 1
      50  #define TRACE(x)
      51  
/* Table of 8-bit approximations of 1/sqrt(x), indexed by the top 9 bits of a
   normalized input (values 0x80..0x1ff, i.e. x in [1/2, 2) after scaling).
   Each entry stores only the low 8 bits of the 9-bit estimate; the common
   implicit high bit 0x100 is ORed back in by mpn_sqrtrem1.  */
static const unsigned char invsqrttab[384] = /* The common 0x100 was removed */
{
  0xff,0xfd,0xfb,0xf9,0xf7,0xf5,0xf3,0xf2, /* sqrt(1/80)..sqrt(1/87) */
  0xf0,0xee,0xec,0xea,0xe9,0xe7,0xe5,0xe4, /* sqrt(1/88)..sqrt(1/8f) */
  0xe2,0xe0,0xdf,0xdd,0xdb,0xda,0xd8,0xd7, /* sqrt(1/90)..sqrt(1/97) */
  0xd5,0xd4,0xd2,0xd1,0xcf,0xce,0xcc,0xcb, /* sqrt(1/98)..sqrt(1/9f) */
  0xc9,0xc8,0xc6,0xc5,0xc4,0xc2,0xc1,0xc0, /* sqrt(1/a0)..sqrt(1/a7) */
  0xbe,0xbd,0xbc,0xba,0xb9,0xb8,0xb7,0xb5, /* sqrt(1/a8)..sqrt(1/af) */
  0xb4,0xb3,0xb2,0xb0,0xaf,0xae,0xad,0xac, /* sqrt(1/b0)..sqrt(1/b7) */
  0xaa,0xa9,0xa8,0xa7,0xa6,0xa5,0xa4,0xa3, /* sqrt(1/b8)..sqrt(1/bf) */
  0xa2,0xa0,0x9f,0x9e,0x9d,0x9c,0x9b,0x9a, /* sqrt(1/c0)..sqrt(1/c7) */
  0x99,0x98,0x97,0x96,0x95,0x94,0x93,0x92, /* sqrt(1/c8)..sqrt(1/cf) */
  0x91,0x90,0x8f,0x8e,0x8d,0x8c,0x8c,0x8b, /* sqrt(1/d0)..sqrt(1/d7) */
  0x8a,0x89,0x88,0x87,0x86,0x85,0x84,0x83, /* sqrt(1/d8)..sqrt(1/df) */
  0x83,0x82,0x81,0x80,0x7f,0x7e,0x7e,0x7d, /* sqrt(1/e0)..sqrt(1/e7) */
  0x7c,0x7b,0x7a,0x79,0x79,0x78,0x77,0x76, /* sqrt(1/e8)..sqrt(1/ef) */
  0x76,0x75,0x74,0x73,0x72,0x72,0x71,0x70, /* sqrt(1/f0)..sqrt(1/f7) */
  0x6f,0x6f,0x6e,0x6d,0x6d,0x6c,0x6b,0x6a, /* sqrt(1/f8)..sqrt(1/ff) */
  0x6a,0x69,0x68,0x68,0x67,0x66,0x66,0x65, /* sqrt(1/100)..sqrt(1/107) */
  0x64,0x64,0x63,0x62,0x62,0x61,0x60,0x60, /* sqrt(1/108)..sqrt(1/10f) */
  0x5f,0x5e,0x5e,0x5d,0x5c,0x5c,0x5b,0x5a, /* sqrt(1/110)..sqrt(1/117) */
  0x5a,0x59,0x59,0x58,0x57,0x57,0x56,0x56, /* sqrt(1/118)..sqrt(1/11f) */
  0x55,0x54,0x54,0x53,0x53,0x52,0x52,0x51, /* sqrt(1/120)..sqrt(1/127) */
  0x50,0x50,0x4f,0x4f,0x4e,0x4e,0x4d,0x4d, /* sqrt(1/128)..sqrt(1/12f) */
  0x4c,0x4b,0x4b,0x4a,0x4a,0x49,0x49,0x48, /* sqrt(1/130)..sqrt(1/137) */
  0x48,0x47,0x47,0x46,0x46,0x45,0x45,0x44, /* sqrt(1/138)..sqrt(1/13f) */
  0x44,0x43,0x43,0x42,0x42,0x41,0x41,0x40, /* sqrt(1/140)..sqrt(1/147) */
  0x40,0x3f,0x3f,0x3e,0x3e,0x3d,0x3d,0x3c, /* sqrt(1/148)..sqrt(1/14f) */
  0x3c,0x3b,0x3b,0x3a,0x3a,0x39,0x39,0x39, /* sqrt(1/150)..sqrt(1/157) */
  0x38,0x38,0x37,0x37,0x36,0x36,0x35,0x35, /* sqrt(1/158)..sqrt(1/15f) */
  0x35,0x34,0x34,0x33,0x33,0x32,0x32,0x32, /* sqrt(1/160)..sqrt(1/167) */
  0x31,0x31,0x30,0x30,0x2f,0x2f,0x2f,0x2e, /* sqrt(1/168)..sqrt(1/16f) */
  0x2e,0x2d,0x2d,0x2d,0x2c,0x2c,0x2b,0x2b, /* sqrt(1/170)..sqrt(1/177) */
  0x2b,0x2a,0x2a,0x29,0x29,0x29,0x28,0x28, /* sqrt(1/178)..sqrt(1/17f) */
  0x27,0x27,0x27,0x26,0x26,0x26,0x25,0x25, /* sqrt(1/180)..sqrt(1/187) */
  0x24,0x24,0x24,0x23,0x23,0x23,0x22,0x22, /* sqrt(1/188)..sqrt(1/18f) */
  0x21,0x21,0x21,0x20,0x20,0x20,0x1f,0x1f, /* sqrt(1/190)..sqrt(1/197) */
  0x1f,0x1e,0x1e,0x1e,0x1d,0x1d,0x1d,0x1c, /* sqrt(1/198)..sqrt(1/19f) */
  0x1c,0x1b,0x1b,0x1b,0x1a,0x1a,0x1a,0x19, /* sqrt(1/1a0)..sqrt(1/1a7) */
  0x19,0x19,0x18,0x18,0x18,0x18,0x17,0x17, /* sqrt(1/1a8)..sqrt(1/1af) */
  0x17,0x16,0x16,0x16,0x15,0x15,0x15,0x14, /* sqrt(1/1b0)..sqrt(1/1b7) */
  0x14,0x14,0x13,0x13,0x13,0x12,0x12,0x12, /* sqrt(1/1b8)..sqrt(1/1bf) */
  0x12,0x11,0x11,0x11,0x10,0x10,0x10,0x0f, /* sqrt(1/1c0)..sqrt(1/1c7) */
  0x0f,0x0f,0x0f,0x0e,0x0e,0x0e,0x0d,0x0d, /* sqrt(1/1c8)..sqrt(1/1cf) */
  0x0d,0x0c,0x0c,0x0c,0x0c,0x0b,0x0b,0x0b, /* sqrt(1/1d0)..sqrt(1/1d7) */
  0x0a,0x0a,0x0a,0x0a,0x09,0x09,0x09,0x09, /* sqrt(1/1d8)..sqrt(1/1df) */
  0x08,0x08,0x08,0x07,0x07,0x07,0x07,0x06, /* sqrt(1/1e0)..sqrt(1/1e7) */
  0x06,0x06,0x06,0x05,0x05,0x05,0x04,0x04, /* sqrt(1/1e8)..sqrt(1/1ef) */
  0x04,0x04,0x03,0x03,0x03,0x03,0x02,0x02, /* sqrt(1/1f0)..sqrt(1/1f7) */
  0x02,0x02,0x01,0x01,0x01,0x01,0x00,0x00  /* sqrt(1/1f8)..sqrt(1/1ff) */
};
     103  
     104  /* Compute s = floor(sqrt(a0)), and *rp = a0 - s^2.  */
     105  
     106  #if GMP_NUMB_BITS > 32
     107  #define MAGIC CNST_LIMB(0x10000000000)	/* 0xffe7debbfc < MAGIC < 0x232b1850f410 */
     108  #else
     109  #define MAGIC CNST_LIMB(0x100000)		/* 0xfee6f < MAGIC < 0x29cbc8 */
     110  #endif
     111  
     112  static mp_limb_t
     113  mpn_sqrtrem1 (mp_ptr rp, mp_limb_t a0)
     114  {
     115  #if GMP_NUMB_BITS > 32
     116    mp_limb_t a1;
     117  #endif
     118    mp_limb_t x0, t2, t, x2;
     119    unsigned abits;
     120  
     121    ASSERT_ALWAYS (GMP_NAIL_BITS == 0);
     122    ASSERT_ALWAYS (GMP_LIMB_BITS == 32 || GMP_LIMB_BITS == 64);
     123    ASSERT (a0 >= GMP_NUMB_HIGHBIT / 2);
     124  
     125    /* Use Newton iterations for approximating 1/sqrt(a) instead of sqrt(a),
     126       since we can do the former without division.  As part of the last
     127       iteration convert from 1/sqrt(a) to sqrt(a).  */
     128  
     129    abits = a0 >> (GMP_LIMB_BITS - 1 - 8);	/* extract bits for table lookup */
     130    x0 = 0x100 | invsqrttab[abits - 0x80];	/* initial 1/sqrt(a) */
     131  
     132    /* x0 is now an 8 bits approximation of 1/sqrt(a0) */
     133  
     134  #if GMP_NUMB_BITS > 32
     135    a1 = a0 >> (GMP_LIMB_BITS - 1 - 32);
     136    t = (mp_limb_signed_t) (CNST_LIMB(0x2000000000000) - 0x30000 - a1 * x0 * x0) >> 16;
     137    x0 = (x0 << 16) + ((mp_limb_signed_t) (x0 * t) >> (16+2));
     138  
     139    /* x0 is now a 16 bits approximation of 1/sqrt(a0) */
     140  
     141    t2 = x0 * (a0 >> (32-8));
     142    t = t2 >> 25;
     143    t = ((mp_limb_signed_t) ((a0 << 14) - t * t - MAGIC) >> (32-8));
     144    x0 = t2 + ((mp_limb_signed_t) (x0 * t) >> 15);
     145    x0 >>= 32;
     146  #else
     147    t2 = x0 * (a0 >> (16-8));
     148    t = t2 >> 13;
     149    t = ((mp_limb_signed_t) ((a0 << 6) - t * t - MAGIC) >> (16-8));
     150    x0 = t2 + ((mp_limb_signed_t) (x0 * t) >> 7);
     151    x0 >>= 16;
     152  #endif
     153  
     154    /* x0 is now a full limb approximation of sqrt(a0) */
     155  
     156    x2 = x0 * x0;
     157    if (x2 + 2*x0 <= a0 - 1)
     158      {
     159        x2 += 2*x0 + 1;
     160        x0++;
     161      }
     162  
     163    *rp = a0 - x2;
     164    return x0;
     165  }
     166  
     167  
     168  #define Prec (GMP_NUMB_BITS >> 1)
     169  #if ! defined(SQRTREM2_INPLACE)
     170  #define SQRTREM2_INPLACE 0
     171  #endif
     172  
     173  /* same as mpn_sqrtrem, but for size=2 and {np, 2} normalized
     174     return cc such that {np, 2} = sp[0]^2 + cc*2^GMP_NUMB_BITS + rp[0] */
#if SQRTREM2_INPLACE
#define CALL_SQRTREM2_INPLACE(sp,rp) mpn_sqrtrem2 (sp, rp)
static mp_limb_t
mpn_sqrtrem2 (mp_ptr sp, mp_ptr rp)
{
  /* In-place variant: the 2-limb operand is read from rp itself.  */
  mp_srcptr np = rp;
#else
#define CALL_SQRTREM2_INPLACE(sp,rp) mpn_sqrtrem2 (sp, rp, rp)
static mp_limb_t
mpn_sqrtrem2 (mp_ptr sp, mp_ptr rp, mp_srcptr np)
{
#endif
  mp_limb_t q, u, np0, sp0, rp0, q2;
  int cc;

  ASSERT (np[1] >= GMP_NUMB_HIGHBIT / 2);

  np0 = np[0];
  /* Root of the high limb: sp0 = floor(sqrt(np[1])), rp[0] = np[1] - sp0^2.  */
  sp0 = mpn_sqrtrem1 (rp, np[1]);
  rp0 = rp[0];
  /* rp0 <= 2*sp0 < 2^(Prec + 1) */
  /* Fold in the next Prec bits of the operand and divide by the partial
     root to get the low half q of the full root (long-division style).  */
  rp0 = (rp0 << (Prec - 1)) + (np0 >> (Prec + 1));
  q = rp0 / sp0;
  /* q <= 2^Prec, if q = 2^Prec, reduce the overestimate. */
  q -= q >> Prec;
  /* now we have q < 2^Prec */
  u = rp0 - q * sp0;
  /* now we have (rp[0]<<Prec + np0>>Prec)/2 = q * sp0 + u */
  sp0 = (sp0 << Prec) | q;
  /* cc catches the bit of u that overflows out of the u << (Prec + 1) below.  */
  cc = u >> (Prec - 1);
  rp0 = ((u << (Prec + 1)) & GMP_NUMB_MASK) + (np0 & ((CNST_LIMB (1) << (Prec + 1)) - 1));
  /* subtract q * q from rp */
  q2 = q * q;
  cc -= rp0 < q2;
  rp0 -= q2;
  if (cc < 0)
    {
      /* The remainder went negative: q was one too large.  Decrement the
	 root and add back 2*sp0 - 1 (in two carry-tracked steps), since
	 (s-1)^2 = s^2 - 2s + 1.  */
      rp0 += sp0;
      cc += rp0 < sp0;
      --sp0;
      rp0 += sp0;
      cc += rp0 < sp0;
    }

  rp[0] = rp0;
  sp[0] = sp0;
  return cc;
}
     223  
/* writes in {sp, n} the square root (rounded towards zero) of {np, 2n},
   and in {np, n} the low n limbs of the remainder, returns the high
   limb of the remainder (which is 0 or 1).
   Assumes {np, 2n} is normalized, i.e. np[2n-1] >= B/4
   where B=2^GMP_NUMB_BITS.
   Needs a scratch of n/2+1 limbs.
   approx, when non-zero, is a bit mask: if any masked bit of the root's
   low limb is set the function returns 1 early, only guaranteeing that
   the remainder is non-zero (used when the caller needs no remainder).
   Divide-and-conquer ("Karatsuba") square root: take the root of the
   high half recursively, then obtain the low half of the root as the
   quotient of the middle limbs by twice the high root.  */
static mp_limb_t
mpn_dc_sqrtrem (mp_ptr sp, mp_ptr np, mp_size_t n, mp_limb_t approx, mp_ptr scratch)
{
  mp_limb_t q;			/* carry out of {sp, n} */
  int c, b;			/* carry out of remainder */
  mp_size_t l, h;

  ASSERT (n > 1);
  ASSERT (np[2 * n - 1] >= GMP_NUMB_HIGHBIT / 2);

  l = n / 2;
  h = n - l;
  /* Root of the high 2h limbs goes to {sp + l, h}; its remainder
     overwrites {np + 2l, ...} in place.  */
  if (h == 1)
    q = CALL_SQRTREM2_INPLACE (sp + l, np + 2 * l);
  else
    q = mpn_dc_sqrtrem (sp + l, np + 2 * l, h, 0, scratch);
  if (q != 0)
    ASSERT_CARRY (mpn_sub_n (np + 2 * l, np + 2 * l, sp + l, h));
  TRACE(printf("tdiv_qr(,,,,%u,,%u) -> %u\n", (unsigned) n, (unsigned) h, (unsigned) (n - h + 1)));
  /* Divide the remainder extended by the middle limbs by the high root;
     quotient to scratch, remainder back into {np + l, h}.  */
  mpn_tdiv_qr (scratch, np + l, 0, np + l, n, sp + l, h);
  q += scratch[l];
  /* The quotient must still be halved (we divided by S_high, not 2*S_high);
     save the bit lost by the shift.  */
  c = scratch[0] & 1;
  mpn_rshift (sp, scratch, l, 1);
  sp[l - 1] |= (q << (GMP_NUMB_BITS - 1)) & GMP_NUMB_MASK;
  if (UNLIKELY ((sp[0] & approx) != 0)) /* (sp[0] & mask) > 1 */
    return 1; /* Remainder is non-zero */
  q >>= 1;
  if (c != 0)
    c = mpn_add_n (np + l, np + l, sp + l, h);
  TRACE(printf("sqr(,,%u)\n", (unsigned) l));
  /* Subtract the square of the low part of the root from the remainder.  */
  mpn_sqr (np + n, sp, l);
  b = q + mpn_sub_n (np, np, np + n, 2 * l);
  c -= (l == h) ? b : mpn_sub_1 (np + 2 * l, np + 2 * l, 1, (mp_limb_t) b);

  if (c < 0)
    {
      /* Remainder went negative: the computed root is one too large.
	 Add back 2*S + 1 (via 2*S then -1) and decrement the root.  */
      q = mpn_add_1 (sp + l, sp + l, h, q);
#if HAVE_NATIVE_mpn_addlsh1_n_ip1 || HAVE_NATIVE_mpn_addlsh1_n
      c += mpn_addlsh1_n_ip1 (np, sp, n) + 2 * q;
#else
      c += mpn_addmul_1 (np, sp, n, CNST_LIMB(2)) + 2 * q;
#endif
      c -= mpn_sub_1 (np, np, n, CNST_LIMB(1));
      q -= mpn_sub_1 (sp, sp, n, CNST_LIMB(1));
    }

  return c;
}
     278  
     279  #if USE_DIVAPPR_Q
     280  static void
     281  mpn_divappr_q (mp_ptr qp, mp_srcptr np, mp_size_t nn, mp_srcptr dp, mp_size_t dn, mp_ptr scratch)
     282  {
     283    gmp_pi1_t inv;
     284    mp_limb_t qh;
     285    ASSERT (dn > 2);
     286    ASSERT (nn >= dn);
     287    ASSERT ((dp[dn-1] & GMP_NUMB_HIGHBIT) != 0);
     288  
     289    MPN_COPY (scratch, np, nn);
     290    invert_pi1 (inv, dp[dn-1], dp[dn-2]);
     291    if (BELOW_THRESHOLD (dn, DC_DIVAPPR_Q_THRESHOLD))
     292      qh = mpn_sbpi1_divappr_q (qp, scratch, nn, dp, dn, inv.inv32);
     293    else if (BELOW_THRESHOLD (dn, MU_DIVAPPR_Q_THRESHOLD))
     294      qh = mpn_dcpi1_divappr_q (qp, scratch, nn, dp, dn, &inv);
     295    else
     296      {
     297        mp_size_t itch = mpn_mu_divappr_q_itch (nn, dn, 0);
     298        TMP_DECL;
     299        TMP_MARK;
     300        /* Sadly, scratch is too small. */
     301        qh = mpn_mu_divappr_q (qp, np, nn, dp, dn, TMP_ALLOC_LIMBS (itch));
     302        TMP_FREE;
     303      }
     304    qp [nn - dn] = qh;
     305  }
     306  #endif
     307  
/* writes in {sp, n} the square root (rounded towards zero) of {np, 2n-odd},
   returns zero if the operand was a perfect square, one otherwise.
   Assumes {np, 2n-odd}*4^nsh is normalized, i.e. B > np[2n-1-odd]*4^nsh >= B/4
   where B=2^GMP_NUMB_BITS.
   nsh is the pair-of-bits shift count used for that normalization; odd is 1
   when the operand has an odd number of limbs, else 0 (see mpn_sqrtrem).
   Root-only variant: computes only the bits of the operand that can affect
   the root, using an approximate division, then detects and corrects any
   overestimate; no remainder is produced.
   THINK: In the odd case, three more (dummy) limbs are taken into account,
   when nsh is maximal, two limbs are discarded from the result of the
   division. Too much? Is a single dummy limb enough? */
static int
mpn_dc_sqrt (mp_ptr sp, mp_srcptr np, mp_size_t n, unsigned nsh, unsigned odd)
{
  mp_limb_t q;			/* carry out of {sp, n} */
  int c;			/* carry out of remainder */
  mp_size_t l, h;
  mp_ptr qp, tp, scratch;
  TMP_DECL;
  TMP_MARK;

  ASSERT (np[2 * n - 1 - odd] != 0);
  ASSERT (n > 4);
  ASSERT (nsh < GMP_NUMB_BITS / 2);

  l = (n - 1) / 2;
  h = n - l;
  ASSERT (n >= l + 2 && l + 2 >= h && h > l && l >= 1 + odd);
  scratch = TMP_ALLOC_LIMBS (l + 2 * n + 5 - USE_DIVAPPR_Q); /* n + 2-USE_DIVAPPR_Q */
  tp = scratch + n + 2 - USE_DIVAPPR_Q; /* n + h + 1, but tp [-1] is writable */
  /* Build the normalized top part of the operand in {tp, n + h + 1}.  */
  if (nsh != 0)
    {
      /* o is used to exactly set the lowest bits of the dividend, is it needed? */
      int o = l > (1 + odd);
      ASSERT_NOCARRY (mpn_lshift (tp - o, np + l - 1 - o - odd, n + h + 1 + o, 2 * nsh));
    }
  else
    MPN_COPY (tp, np + l - 1 - odd, n + h + 1);
  /* High half of the root: {sp + l, h}; remainder left in {tp + l + 1, ...}.  */
  q = mpn_dc_sqrtrem (sp + l, tp + l + 1, h, 0, scratch);
  if (q != 0)
    ASSERT_CARRY (mpn_sub_n (tp + l + 1, tp + l + 1, sp + l, h));
  qp = tp + n + 1; /* l + 2 */
  TRACE(printf("div(appr)_q(,,%u,,%u) -> %u \n", (unsigned) n+1, (unsigned) h, (unsigned) (n + 1 - h + 1)));
#if USE_DIVAPPR_Q
  mpn_divappr_q (qp, tp, n + 1, sp + l, h, scratch);
#else
  mpn_div_q (qp, tp, n + 1, sp + l, h, scratch);
#endif
  q += qp [l + 1];
  c = 1;
  if (q > 1)
    {
      /* Quotient overflowed: the low half of the root saturates.  */
      /* FIXME: if s!=0 we will shift later, a noop on this area. */
      MPN_FILL (sp, l, GMP_NUMB_MAX);
    }
  else
    {
      /* FIXME: if s!=0 we will shift again later, shift just once. */
      mpn_rshift (sp, qp + 1, l, 1);
      sp[l - 1] |= q << (GMP_NUMB_BITS - 1);
      /* If the low bits of the quotient are all small/zero, the approximate
	 division may not have enough margin to absorb its error, so we must
	 verify (and possibly correct) the candidate root explicitly.  */
      if (((qp[0] >> (2 + USE_DIVAPPR_Q)) | /* < 3 + 4*USE_DIVAPPR_Q */
	   (qp[1] & (GMP_NUMB_MASK >> ((GMP_NUMB_BITS >> odd)- nsh - 1)))) == 0)
	{
	  mp_limb_t cy;
	  /* Approximation is not good enough, the extra limb(+ nsh bits)
	     is smaller than needed to absorb the possible error. */
	  /* {qp + 1, l + 1} equals 2*{sp, l} */
	  /* FIXME: use mullo or wrap-around, or directly evaluate
	     remainder with a single sqrmod_bnm1. */
	  TRACE(printf("mul(,,%u,,%u)\n", (unsigned) h, (unsigned) (l+1)));
	  ASSERT_NOCARRY (mpn_mul (scratch, sp + l, h, qp + 1, l + 1));
	  /* Compute the remainder of the previous mpn_div(appr)_q. */
	  cy = mpn_sub_n (tp + 1, tp + 1, scratch, h);
#if USE_DIVAPPR_Q || WANT_ASSERT
	  MPN_DECR_U (tp + 1 + h, l, cy);
#if USE_DIVAPPR_Q
	  ASSERT (mpn_cmp (tp + 1 + h, scratch + h, l) <= 0);
	  if (mpn_cmp (tp + 1 + h, scratch + h, l) < 0)
	    {
	      /* May happen only if div result was not exact. */
#if HAVE_NATIVE_mpn_addlsh1_n_ip1 || HAVE_NATIVE_mpn_addlsh1_n
	      cy = mpn_addlsh1_n_ip1 (tp + 1, sp + l, h);
#else
	      cy = mpn_addmul_1 (tp + 1, sp + l, h, CNST_LIMB(2));
#endif
	      ASSERT_NOCARRY (mpn_add_1 (tp + 1 + h, tp + 1 + h, l, cy));
	      MPN_DECR_U (sp, l, 1);
	    }
	  /* Can the root be exact when a correction was needed? We
	     did not find an example, but it depends on divappr
	     internals, and we can not assume it true in general...*/
	  /* else */
#else /* WANT_ASSERT */
	  ASSERT (mpn_cmp (tp + 1 + h, scratch + h, l) == 0);
#endif
#endif
	  /* If the high part of the remainder is zero, compare against the
	     square of the low root to decide exactness / final correction.  */
	  if (mpn_zero_p (tp + l + 1, h - l))
	    {
	      TRACE(printf("sqr(,,%u)\n", (unsigned) l));
	      mpn_sqr (scratch, sp, l);
	      c = mpn_cmp (tp + 1, scratch + l, l);
	      if (c == 0)
		{
		  /* Tie on the middle limbs: compare the low limbs too
		     (re-normalizing them first when nsh != 0).  */
		  if (nsh != 0)
		    {
		      mpn_lshift (tp, np, l, 2 * nsh);
		      np = tp;
		    }
		  c = mpn_cmp (np, scratch + odd, l - odd);
		}
	      if (c < 0)
		{
		  /* Root was one too large.  */
		  MPN_DECR_U (sp, l, 1);
		  c = 1;
		}
	    }
	}
    }
  TMP_FREE;

  /* Undo the 4^nsh (and odd half-limb) normalization on the root.  */
  if ((odd | nsh) != 0)
    mpn_rshift (sp, sp, n, nsh + (odd ? GMP_NUMB_BITS / 2 : 0));
  return c;
}
     428  
     429  
/* Public entry point: write floor(sqrt({np, nn})) to {sp, ceil(nn/2)}.
   If rp != NULL, store the remainder {np, nn} - root^2 at {rp, rn} and
   return its size rn (zero iff the operand was a perfect square).
   If rp == NULL the remainder is not stored; the return value is still
   zero iff the operand was a perfect square.
   np must be normalized at the limb level (np[nn-1] != 0); the operand is
   shifted internally by 2c bits to meet the B/4 normalization the worker
   routines require, and the root/remainder are shifted back afterwards.  */
mp_size_t
mpn_sqrtrem (mp_ptr sp, mp_ptr rp, mp_srcptr np, mp_size_t nn)
{
  mp_limb_t cc, high, rl;
  int c;
  mp_size_t rn, tn;
  TMP_DECL;

  ASSERT (nn > 0);
  ASSERT_MPN (np, nn);

  ASSERT (np[nn - 1] != 0);
  ASSERT (rp == NULL || MPN_SAME_OR_SEPARATE_P (np, rp, nn));
  ASSERT (rp == NULL || ! MPN_OVERLAP_P (sp, (nn + 1) / 2, rp, nn));
  ASSERT (! MPN_OVERLAP_P (sp, (nn + 1) / 2, np, nn));

  high = np[nn - 1];
  if (high & (GMP_NUMB_HIGHBIT | (GMP_NUMB_HIGHBIT / 2)))
    c = 0;
  else
    {
      count_leading_zeros (c, high);
      c -= GMP_NAIL_BITS;

      c = c / 2; /* we have to shift left by 2c bits to normalize {np, nn} */
    }
  /* One-limb operand: handled entirely by mpn_sqrtrem1.  */
  if (nn == 1) {
    if (c == 0)
      {
	sp[0] = mpn_sqrtrem1 (&rl, high);
	if (rp != NULL)
	  rp[0] = rl;
      }
    else
      {
	/* Normalize, take the root, then shift the root back by c bits
	   and recompute the remainder from the unshifted operand.  */
	cc = mpn_sqrtrem1 (&rl, high << (2*c)) >> c;
	sp[0] = cc;
	if (rp != NULL)
	  rp[0] = rl = high - cc*cc;
      }
    return rl != 0;
  }
  /* Two-limb operand: handled by mpn_sqrtrem2.  */
  if (nn == 2) {
    mp_limb_t tp [2];
    if (rp == NULL) rp = tp;
    if (c == 0)
      {
#if SQRTREM2_INPLACE
	rp[1] = high;
	rp[0] = np[0];
	cc = CALL_SQRTREM2_INPLACE (sp, rp);
#else
	cc = mpn_sqrtrem2 (sp, rp, np);
#endif
	rp[1] = cc;
	return ((rp[0] | cc) != 0) + cc;
      }
    else
      {
	rl = np[0];
	rp[1] = (high << (2*c)) | (rl >> (GMP_NUMB_BITS - 2*c));
	rp[0] = rl << (2*c);
	CALL_SQRTREM2_INPLACE (sp, rp);
	cc = sp[0] >>= c;	/* c != 0, the highest bit of the root cc is 0. */
	rp[0] = rl -= cc*cc;	/* Computed modulo 2^GMP_LIMB_BITS, because it's smaller. */
	return rl != 0;
      }
  }
  tn = (nn + 1) / 2; /* 2*tn is the smallest even integer >= nn */

  /* No remainder wanted and operand large enough: use the faster
     root-only divide-and-conquer path.  */
  if ((rp == NULL) && (nn > 8))
    return mpn_dc_sqrt (sp, np, tn, c, nn & 1);
  TMP_MARK;
  if (((nn & 1) | c) != 0)
    {
      /* Operand needs normalization (odd limb count and/or bit shift):
	 work on a shifted copy, then denormalize root and remainder.  */
      mp_limb_t s0[1], mask;
      mp_ptr tp, scratch;
      TMP_ALLOC_LIMBS_2 (tp, 2 * tn, scratch, tn / 2 + 1);
      tp[0] = 0;	     /* needed only when 2*tn > nn, but saves a test */
      if (c != 0)
	mpn_lshift (tp + (nn & 1), np, nn, 2 * c);
      else
	MPN_COPY (tp + (nn & 1), np, nn);
      c += (nn & 1) ? GMP_NUMB_BITS / 2 : 0;		/* c now represents k */
      mask = (CNST_LIMB (1) << c) - 1;
      rl = mpn_dc_sqrtrem (sp, tp, tn, (rp == NULL) ? mask - 1 : 0, scratch);
      /* We have 2^(2k)*N = S^2 + R where k = c + (2tn-nn)*GMP_NUMB_BITS/2,
	 thus 2^(2k)*N = (S-s0)^2 + 2*S*s0 - s0^2 + R where s0=S mod 2^k */
      s0[0] = sp[0] & mask;	/* S mod 2^k */
      rl += mpn_addmul_1 (tp, sp, tn, 2 * s0[0]);	/* R = R + 2*s0*S */
      cc = mpn_submul_1 (tp, s0, 1, s0[0]);
      rl -= (tn > 1) ? mpn_sub_1 (tp + 1, tp + 1, tn - 1, cc) : cc;
      mpn_rshift (sp, sp, tn, c);
      tp[tn] = rl;
      if (rp == NULL)
	rp = tp;
      /* The remainder must be divided by 2^(2k); do it limb- and bit-wise.  */
      c = c << 1;
      if (c < GMP_NUMB_BITS)
	tn++;
      else
	{
	  tp++;
	  c -= GMP_NUMB_BITS;
	}
      if (c != 0)
	mpn_rshift (rp, tp, tn, c);
      else
	MPN_COPY_INCR (rp, tp, tn);
      rn = tn;
    }
  else
    {
      /* Already normalized: compute in place on (a copy of) the operand.  */
      if (rp != np)
	{
	  if (rp == NULL) /* nn <= 8 */
	    rp = TMP_SALLOC_LIMBS (nn);
	  MPN_COPY (rp, np, nn);
	}
      rn = tn + (rp[tn] = mpn_dc_sqrtrem (sp, rp, tn, 0, TMP_ALLOC_LIMBS(tn / 2 + 1)));
    }

  MPN_NORMALIZE (rp, rn);

  TMP_FREE;
  return rn;
}