(root)/
mpfr-4.2.1/
src/
sqrt.c
       1  /* mpfr_sqrt -- square root of a floating-point number
       2  
       3  Copyright 1999-2023 Free Software Foundation, Inc.
       4  Contributed by the AriC and Caramba projects, INRIA.
       5  
       6  This file is part of the GNU MPFR Library.
       7  
       8  The GNU MPFR Library is free software; you can redistribute it and/or modify
       9  it under the terms of the GNU Lesser General Public License as published by
      10  the Free Software Foundation; either version 3 of the License, or (at your
      11  option) any later version.
      12  
      13  The GNU MPFR Library is distributed in the hope that it will be useful, but
      14  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
      15  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
      16  License for more details.
      17  
      18  You should have received a copy of the GNU Lesser General Public License
      19  along with the GNU MPFR Library; see the file COPYING.LESSER.  If not, see
      20  https://www.gnu.org/licenses/ or write to the Free Software Foundation, Inc.,
      21  51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */
      22  
      23  #define MPFR_NEED_LONGLONG_H
      24  #include "mpfr-impl.h"
      25  
      26  #if !defined(MPFR_GENERIC_ABI) && GMP_NUMB_BITS == 64
      27  
      28  #include "invsqrt_limb.h"
      29  
/* Put in rp[1]*2^64+rp[0] an approximation of floor(sqrt(2^128*n)),
   with 2^126 <= n := np[1]*2^64 + np[0] < 2^128. We have:
   {rp, 2} - 4 <= floor(sqrt(2^128*n)) <= {rp, 2} + 26.

   Strategy: take the exact square root of the high limb (with remainder),
   then perform one correction step using an approximation x of the
   inverse square root, folding in the low limb np[0].  The caller
   (mpfr_sqrt2) compensates for the stated error interval. */
static void
mpfr_sqrt2_approx (mpfr_limb_ptr rp, mpfr_limb_srcptr np)
{
  mp_limb_t x, r1, r0, h, l;

  /* r1 = isqrt of the high limb scaled by 2^64; h:l = remainder;
     x approximates the scaled inverse square root (see invsqrt_limb.h). */
  __gmpfr_sqrt_limb (r1, h, l, x, np[1]);

  /* now r1 = floor(sqrt(2^64*n1)) and h:l = 2^64*n1 - r1^2 with h:l <= 2*r1,
     thus h <= 1, and x is an approximation of 2^96/sqrt(np[1])-2^64 */

  /* fold the low input limb into the remainder, with carry into h */
  l += np[0];
  h += (l < np[0]);

  /* now 2^64*n1 + n0 - r1^2 = 2^64*h + l with h <= 2 */

  /* divide by 2 */
  l = (h << 63) | (l >> 1);
  h = h >> 1;

  /* now h <= 1 */

  /* now add (2^64+x) * (h*2^64+l) / 2^64 to [r1*2^64, 0] */

  umul_hi (r0, x, l); /* x * l */
  r0 += l;
  r1 += h + (r0 < l); /* now we have added 2^64 * (h*2^64+l) */
  if (h)
    {
      /* h = 1: also add the (2^64+x)*h*2^64/2^64 = x contribution */
      r0 += x;
      r1 += (r0 < x); /* add x */
    }

  /* the result must be normalized (high bit set), since n >= 2^126 */
  MPFR_ASSERTD(r1 & MPFR_LIMB_HIGHBIT);

  rp[0] = r0;
  rp[1] = r1;
}
      70  
/* Special code for prec(r) = prec(u) < GMP_NUMB_BITS. We cannot have
   prec(u) = GMP_NUMB_BITS here, since when the exponent of u is odd,
   we need to shift u by one bit to the right without losing any bit.
   Assumes GMP_NUMB_BITS = 64.
   Returns the usual MPFR ternary value and sets flags on
   overflow/underflow. */
static int
mpfr_sqrt1 (mpfr_ptr r, mpfr_srcptr u, mpfr_rnd_t rnd_mode)
{
  mpfr_prec_t p = MPFR_GET_PREC(r);
  mpfr_prec_t exp_u = MPFR_EXP(u), exp_r, sh = GMP_NUMB_BITS - p;
  /* mask selects the sh low (discarded) bits of the result limb */
  mp_limb_t u0, r0, rb, sb, mask = MPFR_LIMB_MASK(sh);
  mpfr_limb_ptr rp = MPFR_MANT(r);

  MPFR_STAT_STATIC_ASSERT (GMP_NUMB_BITS == 64);

  /* first make the exponent even */
  u0 = MPFR_MANT(u)[0];
  if (((unsigned int) exp_u & 1) != 0)
    {
      /* shifting is lossless here since prec(u) < 64 */
      u0 >>= 1;
      exp_u ++;
    }
  MPFR_ASSERTD (((unsigned int) exp_u & 1) == 0);
  exp_r = exp_u / 2;

  /* then compute an approximation of the integer square root of
     u0*2^GMP_NUMB_BITS */
  __gmpfr_sqrt_limb_approx (r0, u0);

  sb = 1; /* when we can round correctly with the approximation, the sticky bit
             is non-zero */

  /* the exact square root is in [r0, r0 + 7] */

  /* The approximation error (at most 7) only matters when the candidate
     result sits within 7 of a rounding boundary; otherwise rounding is
     already decided and the exact value of the low bits is irrelevant. */
  if (MPFR_UNLIKELY(((r0 + 7) & (mask >> 1)) <= 7))
    {
      /* We should ensure r0 has its most significant bit set.
         Since r0 <= sqrt(2^64*u0) <= r0 + 7, as soon as sqrt(2^64*u0)>=2^63+7,
         which happens for u0>=2^62+8, then r0 >= 2^63.
         It thus remains to check that for 2^62 <= u0 <= 2^62+7,
         __gmpfr_sqrt_limb_approx (r0, u0) gives r0 >= 2^63, which is indeed
         the case:
         u0=4611686018427387904 r0=9223372036854775808
         u0=4611686018427387905 r0=9223372036854775808
         u0=4611686018427387906 r0=9223372036854775809
         u0=4611686018427387907 r0=9223372036854775810
         u0=4611686018427387908 r0=9223372036854775811
         u0=4611686018427387909 r0=9223372036854775812
         u0=4611686018427387910 r0=9223372036854775813
         u0=4611686018427387911 r0=9223372036854775814 */
      MPFR_ASSERTD(r0 >= MPFR_LIMB_HIGHBIT);
      /* compute the exact remainder rb:sb = u0*2^64 - r0^2 */
      umul_ppmm (rb, sb, r0, r0);
      sub_ddmmss (rb, sb, u0, 0, rb, sb);
      /* for the exact square root, we should have 0 <= rb:sb <= 2*r0 */
      while (!(rb == 0 || (rb == 1 && sb <= 2 * r0)))
        {
          /* subtract 2*r0+1 from rb:sb: subtract r0 before incrementing r0,
             then r0 after (which is r0+1) */
          rb -= (sb < r0);
          sb -= r0;
          r0 ++;
          rb -= (sb < r0);
          sb -= r0;
        }
      /* now we should have rb*2^64 + sb <= 2*r0 */
      MPFR_ASSERTD(rb == 0 || (rb == 1 && sb <= 2 * r0));
      /* collapse the exact remainder into a single sticky word
         (zero iff the square root is exact) */
      sb = rb | sb;
    }

  /* rb = round bit (bit sh-1 of r0); sb accumulates all lower bits */
  rb = r0 & (MPFR_LIMB_ONE << (sh - 1));
  sb |= (r0 & mask) ^ rb;
  rp[0] = r0 & ~mask;

  /* rounding: sb = 0 implies rb = 0, since (rb,sb)=(1,0) is not possible */
  MPFR_ASSERTD (rb == 0 || sb != 0);

  /* Note: if 1 and 2 are in [emin,emax], no overflow nor underflow
     is possible */
  if (MPFR_UNLIKELY (exp_r > __gmpfr_emax))
    return mpfr_overflow (r, rnd_mode, 1);

  /* See comments in mpfr_div_1 */
  if (MPFR_UNLIKELY (exp_r < __gmpfr_emin))
    {
      if (rnd_mode == MPFR_RNDN)
        {
          /* If (1-2^(-p-1))*2^(emin-1) <= sqrt(u) < 2^(emin-1),
             then sqrt(u) would be rounded to 2^(emin-1) with unbounded
             exponent range, and there would be no underflow.
             But this case cannot happen if u has precision p.
             Indeed, we would have:
             (1-2^(-p-1))^2*2^(2*emin-2) <= u < 2^(2*emin-2), i.e.,
             (1-2^(-p)+2^(-2p-2))*2^(2*emin-2) <= u < 2^(2*emin-2),
             and there is no p-bit number in that interval. */
          /* If the result is <= 0.5*2^(emin-1), we should round to 0. */
          if (exp_r < __gmpfr_emin - 1 ||
              (rp[0] == MPFR_LIMB_HIGHBIT && sb == 0))
            rnd_mode = MPFR_RNDZ;
        }
      else if (MPFR_IS_LIKE_RNDA(rnd_mode, 0))
        {
          /* result rounds up to exactly 2^(emin-1): no underflow */
          if (exp_r == __gmpfr_emin - 1 &&
              rp[0] == ~mask &&
              (rb | sb) != 0)
            goto rounding; /* no underflow */
        }
      return mpfr_underflow (r, rnd_mode, 1);
    }

 rounding:
  MPFR_EXP (r) = exp_r;
  if (sb == 0 /* implies rb = 0 */ || rnd_mode == MPFR_RNDF)
    {
      MPFR_ASSERTD (rb == 0 || rnd_mode == MPFR_RNDF);
      MPFR_ASSERTD(exp_r >= __gmpfr_emin);
      MPFR_ASSERTD(exp_r <= __gmpfr_emax);
      MPFR_RET (0);
    }
  else if (rnd_mode == MPFR_RNDN)
    {
      /* since sb <> 0, only rb is needed to decide how to round, and the exact
         middle is not possible */
      if (rb == 0)
        goto truncate;
      else
        goto add_one_ulp;
    }
  else if (MPFR_IS_LIKE_RNDZ(rnd_mode, 0))
    {
    truncate:
      MPFR_ASSERTD(exp_r >= __gmpfr_emin);
      MPFR_ASSERTD(exp_r <= __gmpfr_emax);
      MPFR_RET(-1);
    }
  else /* round away from zero */
    {
    add_one_ulp:
      rp[0] += MPFR_LIMB_ONE << sh;
      if (rp[0] == 0)
        {
          /* mantissa overflowed: renormalize and bump the exponent */
          rp[0] = MPFR_LIMB_HIGHBIT;
          if (MPFR_UNLIKELY(exp_r + 1 > __gmpfr_emax))
            return mpfr_overflow (r, rnd_mode, 1);
          MPFR_ASSERTD(exp_r + 1 <= __gmpfr_emax);
          MPFR_ASSERTD(exp_r + 1 >= __gmpfr_emin);
          MPFR_SET_EXP (r, exp_r + 1);
        }
      MPFR_RET(1);
    }
}
     219  
/* Special code for prec(r) = prec(u) = GMP_NUMB_BITS.
   Unlike mpfr_sqrt1, a full limb "low" keeps the bit shifted out when the
   exponent of u is odd, so no input bit is lost.
   Returns the usual MPFR ternary value and sets flags on
   overflow/underflow. */
static int
mpfr_sqrt1n (mpfr_ptr r, mpfr_srcptr u, mpfr_rnd_t rnd_mode)
{
  mpfr_prec_t exp_u = MPFR_EXP(u), exp_r;
  mp_limb_t u0, r0, rb, sb, low;
  mpfr_limb_ptr rp = MPFR_MANT(r);

  MPFR_STAT_STATIC_ASSERT (GMP_NUMB_BITS == 64);
  MPFR_ASSERTD(MPFR_PREC(r) == GMP_NUMB_BITS);
  MPFR_ASSERTD(MPFR_PREC(u) <= GMP_NUMB_BITS);

  /* first make the exponent even */
  u0 = MPFR_MANT(u)[0];
  if (((unsigned int) exp_u & 1) != 0)
    {
      /* keep the shifted-out bit in the top of "low" */
      low = u0 << (GMP_NUMB_BITS - 1);
      u0 >>= 1;
      exp_u ++;
    }
  else
    low = 0; /* low part of u0 */
  MPFR_ASSERTD (((unsigned int) exp_u & 1) == 0);
  exp_r = exp_u / 2;

  /* then compute an approximation of the integer square root of
     u0*2^GMP_NUMB_BITS */
  __gmpfr_sqrt_limb_approx (r0, u0);

  /* the exact square root is in [r0, r0 + 7] */

  /* As shown in mpfr_sqrt1 above, r0 has its most significant bit set */
  MPFR_ASSERTD(r0 >= MPFR_LIMB_HIGHBIT);

  /* compute the exact remainder rb:sb = u0*2^64 + low - r0^2 */
  umul_ppmm (rb, sb, r0, r0);
  sub_ddmmss (rb, sb, u0, low, rb, sb);
  /* for the exact square root, we should have 0 <= rb:sb <= 2*r0 */
  while (!(rb == 0 || (rb == 1 && sb <= 2 * r0)))
    {
      /* subtract 2*r0+1 from rb:sb: subtract r0 before incrementing r0,
         then r0 after (which is r0+1) */
      rb -= (sb < r0);
      sb -= r0;
      r0 ++;
      rb -= (sb < r0);
      sb -= r0;
    }
  /* now we have u0*2^64+low = r0^2 + rb*2^64+sb, with rb*2^64+sb <= 2*r0 */
  MPFR_ASSERTD(rb == 0 || (rb == 1 && sb <= 2 * r0));

  /* We can't have the middle case u0*2^64 = (r0 + 1/2)^2 since
     (r0 + 1/2)^2 is not an integer.
     We thus have rb = 1 whenever u0*2^64 > (r0 + 1/2)^2, thus
     rb*2^64 + sb > r0, and the sticky bit is always 1, unless we had
     rb = sb = 0. */

  /* round bit: remainder exceeds r0 (i.e. the fractional part of the
     exact square root is >= 1/2) */
  rb = rb || (sb > r0);
  sb = rb | sb;
  rp[0] = r0;

  /* rounding */

  /* Note: if 1 and 2 are in [emin,emax], no overflow nor underflow
     is possible */
  if (MPFR_UNLIKELY (exp_r > __gmpfr_emax))
    return mpfr_overflow (r, rnd_mode, 1);

  /* See comments in mpfr_div_1 */
  if (MPFR_UNLIKELY (exp_r < __gmpfr_emin))
    {
      if (rnd_mode == MPFR_RNDN)
        {
          /* the case rp[0] = 111...111 and rb = 1 cannot happen, since it
             would imply u0 >= (2^64-1/2)^2/2^64 thus u0 >= 2^64 */
          if (exp_r < __gmpfr_emin - 1 ||
              (rp[0] == MPFR_LIMB_HIGHBIT && sb == 0))
            rnd_mode = MPFR_RNDZ;
        }
      else if (MPFR_IS_LIKE_RNDA(rnd_mode, 0))
        {
          /* result rounds up to exactly 2^(emin-1): no underflow */
          if (exp_r == __gmpfr_emin - 1 &&
              rp[0] == MPFR_LIMB_MAX &&
              (rb | sb) != 0)
            goto rounding; /* no underflow */
        }
      return mpfr_underflow (r, rnd_mode, 1);
    }

  /* sb = 0 can only occur when the square root is exact, i.e., rb = 0 */

 rounding:
  MPFR_EXP (r) = exp_r;
  if (sb == 0 /* implies rb = 0 */ || rnd_mode == MPFR_RNDF)
    {
      MPFR_ASSERTD(exp_r >= __gmpfr_emin);
      MPFR_ASSERTD(exp_r <= __gmpfr_emax);
      MPFR_RET (0);
    }
  else if (rnd_mode == MPFR_RNDN)
    {
      /* we can't have sb = 0, thus rb is enough */
      if (rb == 0)
        goto truncate;
      else
        goto add_one_ulp;
    }
  else if (MPFR_IS_LIKE_RNDZ(rnd_mode, 0))
    {
    truncate:
      MPFR_ASSERTD(exp_r >= __gmpfr_emin);
      MPFR_ASSERTD(exp_r <= __gmpfr_emax);
      MPFR_RET(-1);
    }
  else /* round away from zero */
    {
    add_one_ulp:
      rp[0] += MPFR_LIMB_ONE;
      if (rp[0] == 0)
        {
          /* mantissa overflowed: renormalize and bump the exponent */
          rp[0] = MPFR_LIMB_HIGHBIT;
          if (MPFR_UNLIKELY(exp_r + 1 > __gmpfr_emax))
            return mpfr_overflow (r, rnd_mode, 1);
          MPFR_ASSERTD(exp_r + 1 <= __gmpfr_emax);
          MPFR_ASSERTD(exp_r + 1 >= __gmpfr_emin);
          MPFR_SET_EXP (r, exp_r + 1);
        }
      MPFR_RET(1);
    }
}
     348  
/* Special code for GMP_NUMB_BITS < prec(r) = prec(u) < 2*GMP_NUMB_BITS.
   Assumes GMP_NUMB_BITS=64.
   Uses mpfr_sqrt2_approx on the two high limbs, then (only when the
   approximation lands near a rounding boundary) corrects it with an
   exact 2-limb squaring.
   Returns the usual MPFR ternary value and sets flags on
   overflow/underflow. */
static int
mpfr_sqrt2 (mpfr_ptr r, mpfr_srcptr u, mpfr_rnd_t rnd_mode)
{
  mpfr_prec_t p = MPFR_GET_PREC(r);
  mpfr_limb_ptr up = MPFR_MANT(u), rp = MPFR_MANT(r);
  mp_limb_t np[4], rb, sb, mask;
  mpfr_prec_t exp_u = MPFR_EXP(u), exp_r, sh = 2 * GMP_NUMB_BITS - p;

  MPFR_STAT_STATIC_ASSERT (GMP_NUMB_BITS == 64);

  /* make the exponent even, shifting u into np[3]:np[2]:np[1] */
  if (((unsigned int) exp_u & 1) != 0)
    {
      np[3] = up[1] >> 1;
      np[2] = (up[1] << (GMP_NUMB_BITS - 1)) | (up[0] >> 1);
      np[1] = up[0] << (GMP_NUMB_BITS - 1);
      exp_u ++;
    }
  else
    {
      np[3] = up[1];
      np[2] = up[0];
      np[1] = 0;
    }
  exp_r = exp_u / 2;

  /* mask selects the sh low (discarded) bits of rp[0] */
  mask = MPFR_LIMB_MASK(sh);

  mpfr_sqrt2_approx (rp, np + 2);
  /* with n = np[3]*2^64+np[2], we have:
     {rp, 2} - 4 <= floor(sqrt(2^128*n)) <= {rp, 2} + 26, thus we can round
     correctly except when the number formed by the last sh-1 bits
     of rp[0] is in the range [-26, 4]. */
  if (MPFR_LIKELY(((rp[0] + 26) & (mask >> 1)) > 30))
    sb = 1; /* rounding already decided; any non-zero sticky works */
  else
    {
      /* near a rounding boundary: compute the exact remainder
         n - {rp,2}^2 and correct {rp,2} until it is the true
         integer square root */
      mp_limb_t tp[4], h, l;

      np[0] = 0;
      mpn_sqr (tp, rp, 2);
      /* since we know s - 26 <= r <= s + 4 and 0 <= n^2 - s <= 2*s, we have
         -8*s-16 <= n - r^2 <= 54*s - 676, thus it suffices to compute
         n - r^2 modulo 2^192 */
      mpn_sub_n (tp, np, tp, 3);
      /* invariant: h:l = 2 * {rp, 2}, with upper bit implicit */
      h = (rp[1] << 1) | (rp[0] >> (GMP_NUMB_BITS - 1));
      l = rp[0] << 1;
      while ((mp_limb_signed_t) tp[2] < 0) /* approximation was too large */
        {
          /* subtract 1 to {rp, 2}, thus 2 to h:l */
          h -= (l <= MPFR_LIMB_ONE);
          l -= 2;
          /* add (1:h:l)+1 to {tp,3} */
          tp[0] += l + 1;
          tp[1] += h + (tp[0] < l);
          /* necessarily rp[1] has its most significant bit set */
          tp[2] += MPFR_LIMB_ONE + (tp[1] < h || (tp[1] == h && tp[0] < l));
        }
      /* now tp[2] >= 0 */
      /* now we want {tp, 4} <= 2 * {rp, 2}, which implies tp[2] <= 1 */
      while (tp[2] > 1 || (tp[2] == 1 && tp[1] > h) ||
             (tp[2] == 1 && tp[1] == h && tp[0] > l))
        {
          /* subtract (1:h:l)+1 from {tp,3} */
          tp[2] -= MPFR_LIMB_ONE + (tp[1] < h || (tp[1] == h && tp[0] <= l));
          tp[1] -= h + (tp[0] <= l);
          tp[0] -= l + 1;
          /* add 2 to h:l */
          l += 2;
          h += (l <= MPFR_LIMB_ONE);
        }
      /* restore {rp, 2} from h:l */
      rp[1] = MPFR_LIMB_HIGHBIT | (h >> 1);
      rp[0] = (h << (GMP_NUMB_BITS - 1)) | (l >> 1);
      /* sticky word: non-zero iff the remainder is non-zero, i.e. the
         square root is inexact */
      sb = tp[2] | tp[0] | tp[1];
    }

  /* rb = round bit (bit sh-1 of rp[0]); sb accumulates all lower bits */
  rb = rp[0] & (MPFR_LIMB_ONE << (sh - 1));
  sb |= (rp[0] & mask) ^ rb;
  rp[0] = rp[0] & ~mask;

  /* rounding */
  if (MPFR_UNLIKELY (exp_r > __gmpfr_emax))
    return mpfr_overflow (r, rnd_mode, 1);

  /* See comments in mpfr_div_1 */
  if (MPFR_UNLIKELY (exp_r < __gmpfr_emin))
    {
      if (rnd_mode == MPFR_RNDN)
        {
          /* round to zero when the result is at most 0.5*2^(emin-1) */
          if (exp_r < __gmpfr_emin - 1 || (rp[1] == MPFR_LIMB_HIGHBIT &&
                                           rp[0] == MPFR_LIMB_ZERO && sb == 0))
            rnd_mode = MPFR_RNDZ;
        }
      else if (MPFR_IS_LIKE_RNDA(rnd_mode, 0))
        {
          /* result rounds up to exactly 2^(emin-1): no underflow */
          if (exp_r == __gmpfr_emin - 1 && (rp[1] == MPFR_LIMB_MAX &&
                                            rp[0] == ~mask) && (rb | sb))
            goto rounding; /* no underflow */
        }
      return mpfr_underflow (r, rnd_mode, 1);
    }

 rounding:
  MPFR_EXP (r) = exp_r;
  if (sb == 0 /* implies rb = 0 */ || rnd_mode == MPFR_RNDF)
    {
      MPFR_ASSERTD(exp_r >= __gmpfr_emin);
      MPFR_ASSERTD(exp_r <= __gmpfr_emax);
      MPFR_RET (0);
    }
  else if (rnd_mode == MPFR_RNDN)
    {
      /* since sb <> 0 now, only rb is needed */
      if (rb == 0)
        goto truncate;
      else
        goto add_one_ulp;
    }
  else if (MPFR_IS_LIKE_RNDZ(rnd_mode, 0))
    {
    truncate:
      MPFR_ASSERTD(exp_r >= __gmpfr_emin);
      MPFR_ASSERTD(exp_r <= __gmpfr_emax);
      MPFR_RET(-1);
    }
  else /* round away from zero */
    {
    add_one_ulp:
      rp[0] += MPFR_LIMB_ONE << sh;
      rp[1] += rp[0] == 0;
      if (rp[1] == 0)
        {
          /* mantissa overflowed: renormalize and bump the exponent */
          rp[1] = MPFR_LIMB_HIGHBIT;
          if (MPFR_UNLIKELY(exp_r + 1 > __gmpfr_emax))
            return mpfr_overflow (r, rnd_mode, 1);
          MPFR_ASSERTD(exp_r + 1 <= __gmpfr_emax);
          MPFR_ASSERTD(exp_r + 1 >= __gmpfr_emin);
          MPFR_SET_EXP (r, exp_r + 1);
        }
      MPFR_RET(1);
    }
}
     494  
     495  #endif /* !defined(MPFR_GENERIC_ABI) && GMP_NUMB_BITS == 64 */
     496  
/* Set r to the square root of u, rounded according to rnd_mode.
   Returns the usual MPFR ternary value: 0 if exact, negative if r is
   below sqrt(u), positive if above.
   Special values: sqrt(NaN) = NaN, sqrt(+/-0) = +/-0, sqrt(+Inf) = +Inf,
   sqrt of a negative number is NaN.
   Fast 64-bit-limb paths handle prec(r) = prec(u) < 2*GMP_NUMB_BITS;
   the generic path relies on GMP's mpn_sqrtrem. */
int
mpfr_sqrt (mpfr_ptr r, mpfr_srcptr u, mpfr_rnd_t rnd_mode)
{
  mp_size_t rsize; /* number of limbs of r (plus 1 if exact limb multiple) */
  mp_size_t rrsize;
  mp_size_t usize; /* number of limbs of u */
  mp_size_t tsize; /* number of limbs of the sqrtrem remainder */
  mp_size_t k;
  mp_size_t l;
  mpfr_limb_ptr rp, rp0;
  mpfr_limb_ptr up;
  mpfr_limb_ptr sp;
  mp_limb_t sticky0; /* truncated part of input */
  mp_limb_t sticky1; /* truncated part of rp[0] */
  mp_limb_t sticky;
  int odd_exp;
  int sh; /* number of extra bits in rp[0] */
  int inexact; /* return ternary flag */
  mpfr_exp_t expr;
  mpfr_prec_t rq = MPFR_GET_PREC (r);
  MPFR_TMP_DECL(marker);

  MPFR_LOG_FUNC
    (("x[%Pd]=%.*Rg rnd=%d", mpfr_get_prec (u), mpfr_log_prec, u, rnd_mode),
     ("y[%Pd]=%.*Rg inexact=%d",
      mpfr_get_prec (r), mpfr_log_prec, r, inexact));

  /* singular inputs: NaN, zero, infinities */
  if (MPFR_UNLIKELY(MPFR_IS_SINGULAR(u)))
    {
      if (MPFR_IS_NAN(u))
        {
          MPFR_SET_NAN(r);
          MPFR_RET_NAN;
        }
      else if (MPFR_IS_ZERO(u))
        {
          /* 0+ or 0- */
          MPFR_SET_SAME_SIGN(r, u);
          MPFR_SET_ZERO(r);
          MPFR_RET(0); /* zero is exact */
        }
      else
        {
          MPFR_ASSERTD(MPFR_IS_INF(u));
          /* sqrt(-Inf) = NAN */
          if (MPFR_IS_NEG(u))
            {
              MPFR_SET_NAN(r);
              MPFR_RET_NAN;
            }
          MPFR_SET_POS(r);
          MPFR_SET_INF(r);
          MPFR_RET(0);
        }
    }
  /* sqrt of a negative (non-zero) number is NaN */
  if (MPFR_UNLIKELY(MPFR_IS_NEG(u)))
    {
      MPFR_SET_NAN(r);
      MPFR_RET_NAN;
    }
  MPFR_SET_POS(r);

#if !defined(MPFR_GENERIC_ABI) && GMP_NUMB_BITS == 64
  /* dispatch to the specialized small-precision code when the
     precisions of r and u match */
  {
    mpfr_prec_t uq = MPFR_GET_PREC (u);

    if (rq == uq)
      {
        if (rq < GMP_NUMB_BITS)
          return mpfr_sqrt1 (r, u, rnd_mode);

        if (GMP_NUMB_BITS < rq && rq < 2*GMP_NUMB_BITS)
          return mpfr_sqrt2 (r, u, rnd_mode);

        if (rq == GMP_NUMB_BITS)
          return mpfr_sqrt1n (r, u, rnd_mode);
      }
  }
#endif

  MPFR_TMP_MARK (marker);
  MPFR_UNSIGNED_MINUS_MODULO (sh, rq);
  if (sh == 0 && rnd_mode == MPFR_RNDN)
    sh = GMP_NUMB_BITS; /* ugly case */
  rsize = MPFR_LIMB_SIZE(r) + (sh == GMP_NUMB_BITS);
  /* rsize is the number of limbs of r + 1 if exact limb multiple and rounding
     to nearest, this is the number of wanted limbs for the square root */
  rrsize = rsize + rsize;
  usize = MPFR_LIMB_SIZE(u); /* number of limbs of u */
  rp0 = MPFR_MANT(r);
  /* in the ugly case, compute the root in a temporary buffer one limb
     larger than r, and copy/round back at the end */
  rp = (sh < GMP_NUMB_BITS) ? rp0 : MPFR_TMP_LIMBS_ALLOC (rsize);
  up = MPFR_MANT(u);
  sticky0 = MPFR_LIMB_ZERO; /* truncated part of input */
  sticky1 = MPFR_LIMB_ZERO; /* truncated part of rp[0] */
  odd_exp = (unsigned int) MPFR_GET_EXP (u) & 1;
  inexact = -1; /* return ternary flag */

  sp = MPFR_TMP_LIMBS_ALLOC (rrsize);

  /* copy the most significant limbs of u to {sp, rrsize};
     when the exponent is odd, u is also shifted right by one bit so that
     the implicit scaling factor 2^exp becomes even */
  if (MPFR_LIKELY(usize <= rrsize)) /* in case r and u have the same precision,
                                       we have indeed rrsize = 2 * usize */
    {
      k = rrsize - usize;
      if (MPFR_LIKELY(k))
        MPN_ZERO (sp, k);
      if (odd_exp)
        {
          if (MPFR_LIKELY(k))
            sp[k - 1] = mpn_rshift (sp + k, up, usize, 1);
          else
            sticky0 = mpn_rshift (sp, up, usize, 1);
        }
      else
        MPN_COPY (sp + rrsize - usize, up, usize);
    }
  else /* usize > rrsize: truncate the input */
    {
      k = usize - rrsize;
      if (odd_exp)
        sticky0 = mpn_rshift (sp, up + k, rrsize, 1);
      else
        MPN_COPY (sp, up + k, rrsize);
      /* fold the dropped low limbs of u into sticky0 */
      l = k;
      while (sticky0 == MPFR_LIMB_ZERO && l != 0)
        sticky0 = up[--l];
    }

  /* sticky0 is non-zero iff the truncated part of the input is non-zero */

  tsize = mpn_sqrtrem (rp, NULL, sp, rrsize);

  /* a return value of zero in mpn_sqrtrem indicates a perfect square */
  sticky = sticky0 || tsize != 0;

  /* truncate low bits of rp[0] */
  sticky1 = rp[0] & ((sh < GMP_NUMB_BITS) ? MPFR_LIMB_MASK(sh)
                     : MPFR_LIMB_MAX);
  rp[0] -= sticky1;

  sticky = sticky || sticky1;

  expr = (MPFR_GET_EXP(u) + odd_exp) / 2;  /* exact */

  if (rnd_mode == MPFR_RNDZ || rnd_mode == MPFR_RNDD ||
      sticky == MPFR_LIMB_ZERO)
    {
      inexact = (sticky == MPFR_LIMB_ZERO) ? 0 : -1;
      goto truncate;
    }
  else if (rnd_mode == MPFR_RNDN)
    {
      /* if sh < GMP_NUMB_BITS, the round bit is bit (sh-1) of sticky1
                  and the sticky bit is formed by the low sh-1 bits from
                  sticky1, together with the sqrtrem remainder and sticky0. */
      if (sh < GMP_NUMB_BITS)
        {
          if (sticky1 & (MPFR_LIMB_ONE << (sh - 1)))
            { /* round bit is set */
              if (sticky1 == (MPFR_LIMB_ONE << (sh - 1)) && tsize == 0
                  && sticky0 == 0)
                goto even_rule;
              else
                goto add_one_ulp;
            }
          else /* round bit is zero */
            goto truncate; /* with the default inexact=-1 */
        }
      else /* sh = GMP_NUMB_BITS: the round bit is the most significant bit
              of rp[0], and the remaining GMP_NUMB_BITS-1 bits contribute to
              the sticky bit */
        {
          if (sticky1 & MPFR_LIMB_HIGHBIT)
            { /* round bit is set */
              if (sticky1 == MPFR_LIMB_HIGHBIT && tsize == 0 && sticky0 == 0)
                goto even_rule;
              else
                goto add_one_ulp;
            }
          else /* round bit is zero */
            goto truncate; /* with the default inexact=-1 */
        }
    }
  else /* rnd_mode = MPFR_RNDU or RNDA, necessarily sticky <> 0,
          thus add 1 ulp */
    goto add_one_ulp;

 even_rule: /* has to set inexact */
  /* exact tie: round to the value whose last kept bit is even */
  if (sh < GMP_NUMB_BITS)
    inexact = (rp[0] & (MPFR_LIMB_ONE << sh)) ? 1 : -1;
  else
    inexact = (rp[1] & MPFR_LIMB_ONE) ? 1 : -1;
  if (inexact == -1)
    goto truncate;
  /* else go through add_one_ulp */

 add_one_ulp:
  inexact = 1; /* always here */
  if (sh == GMP_NUMB_BITS)
    {
      /* drop the extra low limb of the temporary buffer */
      rp ++;
      rsize --;
      sh = 0;
    }
  /* now rsize = MPFR_LIMB_SIZE(r) */
  if (mpn_add_1 (rp0, rp, rsize, MPFR_LIMB_ONE << sh))
    {
      /* mantissa overflowed: renormalize and bump the exponent */
      expr ++;
      rp0[rsize - 1] = MPFR_LIMB_HIGHBIT;
    }
  goto end;

 truncate: /* inexact = 0 or -1 */
  if (sh == GMP_NUMB_BITS)
    MPN_COPY (rp0, rp + 1, rsize - 1);

 end:
  /* Do not use MPFR_SET_EXP because the range has not been checked yet. */
  MPFR_ASSERTN (expr >= MPFR_EMIN_MIN && expr <= MPFR_EMAX_MAX);
  MPFR_EXP (r) = expr;
  MPFR_TMP_FREE(marker);

  return mpfr_check_range (r, inexact, rnd_mode);
}