mpfr-4.2.1/src/sqr.c
/* mpfr_sqr -- Floating-point square

Copyright 2004-2023 Free Software Foundation, Inc.
Contributed by the AriC and Caramba projects, INRIA.

This file is part of the GNU MPFR Library.

The GNU MPFR Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 3 of the License, or (at your
option) any later version.

The GNU MPFR Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public License
along with the GNU MPFR Library; see the file COPYING.LESSER.  If not, see
https://www.gnu.org/licenses/ or write to the Free Software Foundation, Inc.,
51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */

#define MPFR_NEED_LONGLONG_H
#include "mpfr-impl.h"

#if !defined(MPFR_GENERIC_ABI) && (GMP_NUMB_BITS == 32 || GMP_NUMB_BITS == 64)

/* Special code for prec(a) < GMP_NUMB_BITS and prec(b) <= GMP_NUMB_BITS.
   Note: this function was copied from mpfr_mul_1 in file mul.c, thus any
   change here should be done also in mpfr_mul_1.
   Although this function works as soon as prec(a) < GMP_NUMB_BITS and
   prec(b) <= GMP_NUMB_BITS, we use it for prec(a)=prec(b) < GMP_NUMB_BITS. */
static int
mpfr_sqr_1 (mpfr_ptr a, mpfr_srcptr b, mpfr_rnd_t rnd_mode, mpfr_prec_t p)
{
  mp_limb_t a0;
  mpfr_limb_ptr ap = MPFR_MANT(a);
  mp_limb_t b0 = MPFR_MANT(b)[0];
  mpfr_exp_t ax;
  mpfr_prec_t sh = GMP_NUMB_BITS - p;
  mp_limb_t rb, sb, mask = MPFR_LIMB_MASK(sh);

  /* When prec(b) <= GMP_NUMB_BITS / 2, we could replace umul_ppmm
     by a limb multiplication as follows, but we assume umul_ppmm is as fast
     as a limb multiplication on modern processors:
      a0 = (b0 >> (GMP_NUMB_BITS / 2)) * (b0 >> (GMP_NUMB_BITS / 2));
      sb = 0;
  */
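  /* Editorial sketch (not from the original source): why the shortcut quoted
     above is exact. Assume GMP_NUMB_BITS = 64 and prec(b) <= 32. Then the
     normalized significand b0 has its low 32 bits equal to zero, so
     b0 = (b0 >> 32) << 32 and b0^2 = ((b0 >> 32) * (b0 >> 32)) << 64:
     the high limb of the square is exactly the product of the two high
     halves, and the low limb (the future sb) is 0. */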
  ax = MPFR_GET_EXP(b) * 2;
  umul_ppmm (a0, sb, b0, b0);
  if (a0 < MPFR_LIMB_HIGHBIT)
    {
      ax --;
      a0 = (a0 << 1) | (sb >> (GMP_NUMB_BITS - 1));
      sb <<= 1;
    }
  rb = a0 & (MPFR_LIMB_ONE << (sh - 1));
  sb |= (a0 & mask) ^ rb;
  ap[0] = a0 & ~mask;
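  /* Worked example (editorial, not from the original source): with
     GMP_NUMB_BITS = 64 and p = 53, sh = 11 and mask = 0x7ff; rb is bit 10
     of a0, i.e. the first bit below the 53-bit significand, sb is nonzero
     iff any lower bit of a0 or any bit of the low limb of the square is
     set, and ap[0] keeps only the top 53 bits of a0. */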

  MPFR_SIGN(a) = MPFR_SIGN_POS;

  /* rounding */
  if (MPFR_UNLIKELY(ax > __gmpfr_emax))
    return mpfr_overflow (a, rnd_mode, MPFR_SIGN_POS);

  /* Warning: underflow should be checked *after* rounding, thus when rounding
     away and when a > 0.111...111*2^(emin-1), or when rounding to nearest and
     a >= 0.111...111[1]*2^(emin-1), there is no underflow. */
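  /* Editorial example: with rounding away from zero, a result slightly
     above 0.111...111*2^(emin-1) rounds up to 2^(emin-1), which is
     representable, so reporting an underflow there would be incorrect. */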
  if (MPFR_UNLIKELY(ax < __gmpfr_emin))
    {
      /* Note: for emin=2*k+1, a >= 0.111...111*2^(emin-1) is not possible,
         i.e., a >= (1 - 2^(-p))*2^(2k), since we need a = b^2 with EXP(b)=k,
         and the largest such b, (1 - 2^(-p))*2^k, satisfies
         b^2 < (1 - 2^(-p))*2^(2k).
         For emin=2*k, it is only possible for some values of p: it is not
         possible for p=53, because the largest significand is 6369051672525772,
         but its square has only 52 leading ones. For p=24 it is possible,
         with b = 11863283, whose square has 24 leading ones. */
      if (ax == __gmpfr_emin - 1 && ap[0] == ~mask &&
          ((rnd_mode == MPFR_RNDN && rb) ||
           (MPFR_IS_LIKE_RNDA (rnd_mode, 0) && (rb | sb))))
        goto rounding; /* no underflow */
      /* For RNDN, mpfr_underflow always rounds away, thus for |a| <= 2^(emin-2)
         we have to change to RNDZ. This corresponds to:
         (a) either ax < emin - 1
         (b) or ax = emin - 1 and ap[0] = 1000....000 and rb = sb = 0 */
      if (rnd_mode == MPFR_RNDN &&
          (ax < __gmpfr_emin - 1 ||
           (ap[0] == MPFR_LIMB_HIGHBIT && (rb | sb) == 0)))
        rnd_mode = MPFR_RNDZ;
      return mpfr_underflow (a, rnd_mode, MPFR_SIGN_POS);
    }

 rounding:
  MPFR_EXP (a) = ax; /* Don't use MPFR_SET_EXP since ax might be < __gmpfr_emin
                        in the cases "goto rounding" above. */
  if ((rb == 0 && sb == 0) || rnd_mode == MPFR_RNDF)
    {
      MPFR_ASSERTD(ax >= __gmpfr_emin);
      MPFR_RET (0);
    }
  else if (rnd_mode == MPFR_RNDN)
    {
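      /* Editorial note: this implements round to nearest, ties to even.
         rb is the first bit below the target precision and sb the OR of
         all lower bits; MPFR_LIMB_ONE << sh is the last significand bit
         kept in ap[0], so an exact tie (rb = 1, sb = 0) is rounded up
         only when that bit is 1 (i.e. when truncating would be odd). */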
      if (rb == 0 || (sb == 0 && (ap[0] & (MPFR_LIMB_ONE << sh)) == 0))
        goto truncate;
      else
        goto add_one_ulp;
    }
  else if (MPFR_IS_LIKE_RNDZ (rnd_mode, 0))
    {
    truncate:
      MPFR_ASSERTD(ax >= __gmpfr_emin);
      MPFR_RET(-MPFR_SIGN_POS);
    }
  else /* round away from zero */
    {
    add_one_ulp:
      ap[0] += MPFR_LIMB_ONE << sh;
      if (ap[0] == 0)
        {
          ap[0] = MPFR_LIMB_HIGHBIT;
          if (MPFR_UNLIKELY(ax + 1 > __gmpfr_emax))
            return mpfr_overflow (a, rnd_mode, MPFR_SIGN_POS);
          MPFR_ASSERTD(ax + 1 <= __gmpfr_emax);
          MPFR_ASSERTD(ax + 1 >= __gmpfr_emin);
          MPFR_SET_EXP (a, ax + 1);
        }
      MPFR_RET(MPFR_SIGN_POS);
    }
}

/* special code for PREC(a) = GMP_NUMB_BITS */
static int
mpfr_sqr_1n (mpfr_ptr a, mpfr_srcptr b, mpfr_rnd_t rnd_mode)
{
  mp_limb_t a0;
  mpfr_limb_ptr ap = MPFR_MANT(a);
  mp_limb_t b0 = MPFR_MANT(b)[0];
  mpfr_exp_t ax;
  mp_limb_t rb, sb;

  ax = MPFR_GET_EXP(b) * 2;
  umul_ppmm (a0, sb, b0, b0);
  if (a0 < MPFR_LIMB_HIGHBIT)
    {
      ax --;
      a0 = (a0 << 1) | (sb >> (GMP_NUMB_BITS - 1));
      sb <<= 1;
    }
  rb = sb & MPFR_LIMB_HIGHBIT;
  sb = sb & ~MPFR_LIMB_HIGHBIT;
  ap[0] = a0;

  MPFR_SIGN(a) = MPFR_SIGN_POS;

  /* rounding */
  if (MPFR_UNLIKELY(ax > __gmpfr_emax))
    return mpfr_overflow (a, rnd_mode, MPFR_SIGN_POS);

  /* Warning: underflow should be checked *after* rounding, thus when rounding
     away and when a > 0.111...111*2^(emin-1), or when rounding to nearest and
     a >= 0.111...111[1]*2^(emin-1), there is no underflow. */
  if (MPFR_UNLIKELY(ax < __gmpfr_emin))
    {
      /* As seen in mpfr_mul_1, we cannot have a0 = 111...111 here if there
         was not an exponent decrease (ax--) above.
         In the case of an exponent decrease:
         - For GMP_NUMB_BITS=32, a0 = 111...111 is not possible since the
           largest b0 such that b0^2 < 2^(2*32-1) is b0=3037000499, but
           its square has only 30 leading ones.
         - For GMP_NUMB_BITS=64, a0 = 111...111 is possible: the largest b0
           is 13043817825332782212, and its square has 64 leading ones; but
           since the next bit is rb=0, for RNDN, we always have an underflow.
         For the test below, note that a is positive.
      */
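      /* Editorial sketch of a check of the 64-bit claim above (assumes a
         compiler providing unsigned __int128; not part of the original
         source):

           unsigned __int128 s = (unsigned __int128) 13043817825332782212ULL
                                 * 13043817825332782212ULL;

         Here s < 2^127 and 2*s >= 2^128 - 2^64, so after the 1-bit
         normalization shift the high limb of the square is all ones. */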
      if (ax == __gmpfr_emin - 1 && ap[0] == MPFR_LIMB_MAX &&
          MPFR_IS_LIKE_RNDA (rnd_mode, 0))
        goto rounding; /* no underflow */
      /* For RNDN, mpfr_underflow always rounds away, thus for |a| <= 2^(emin-2)
         we have to change to RNDZ. This corresponds to:
         (a) either ax < emin - 1
         (b) or ax = emin - 1 and ap[0] = 1000....000 and rb = sb = 0 */
      if (rnd_mode == MPFR_RNDN &&
          (ax < __gmpfr_emin - 1 ||
           (ap[0] == MPFR_LIMB_HIGHBIT && (rb | sb) == 0)))
        rnd_mode = MPFR_RNDZ;
      return mpfr_underflow (a, rnd_mode, MPFR_SIGN_POS);
    }

 rounding:
  MPFR_EXP (a) = ax; /* Don't use MPFR_SET_EXP since ax might be < __gmpfr_emin
                        in the cases "goto rounding" above. */
  if ((rb == 0 && sb == 0) || rnd_mode == MPFR_RNDF)
    {
      MPFR_ASSERTD(ax >= __gmpfr_emin);
      MPFR_RET (0);
    }
  else if (rnd_mode == MPFR_RNDN)
    {
      if (rb == 0 || (sb == 0 && (ap[0] & MPFR_LIMB_ONE) == 0))
        goto truncate;
      else
        goto add_one_ulp;
    }
  else if (MPFR_IS_LIKE_RNDZ (rnd_mode, 0))
    {
    truncate:
      MPFR_ASSERTD(ax >= __gmpfr_emin);
      MPFR_RET(-MPFR_SIGN_POS);
    }
  else /* round away from zero */
    {
    add_one_ulp:
      ap[0] += MPFR_LIMB_ONE;
      if (ap[0] == 0)
        {
          ap[0] = MPFR_LIMB_HIGHBIT;
          if (MPFR_UNLIKELY(ax + 1 > __gmpfr_emax))
            return mpfr_overflow (a, rnd_mode, MPFR_SIGN_POS);
          MPFR_ASSERTD(ax + 1 <= __gmpfr_emax);
          MPFR_ASSERTD(ax + 1 >= __gmpfr_emin);
          MPFR_SET_EXP (a, ax + 1);
        }
      MPFR_RET(MPFR_SIGN_POS);
    }
}

/* Special code for GMP_NUMB_BITS < prec(a) < 2*GMP_NUMB_BITS and
   GMP_NUMB_BITS < prec(b) <= 2*GMP_NUMB_BITS.
   Note: this function was copied and optimized from mpfr_mul_2 in file mul.c,
   thus any change here should be done also in mpfr_mul_2, if applicable. */
static int
mpfr_sqr_2 (mpfr_ptr a, mpfr_srcptr b, mpfr_rnd_t rnd_mode, mpfr_prec_t p)
{
  mp_limb_t h, l, u, v;
  mpfr_limb_ptr ap = MPFR_MANT(a);
  mpfr_exp_t ax = 2 * MPFR_GET_EXP(b);
  mpfr_prec_t sh = 2 * GMP_NUMB_BITS - p;
  mp_limb_t rb, sb, sb2, mask = MPFR_LIMB_MASK(sh);
  mp_limb_t *bp = MPFR_MANT(b);

  /* we store the 4-limb product in h=ap[1], l=ap[0], sb=ap[-1], sb2=ap[-2] */
  umul_ppmm (h, l, bp[1], bp[1]);
  umul_ppmm (u, v, bp[1], bp[0]);
  l += u << 1;
  h += (l < (u << 1)) + (u >> (GMP_NUMB_BITS - 1));

  /* now the full square is {h, l, 2*v + high(b0*b0), low(b0*b0)},
     where the lower part contributes less than 3 ulps to {h, l} */

  /* If h has its most significant bit set and the low sh-1 bits of l are not
     000...000 nor 111...111 nor 111...110, then we can round correctly;
     if h has zero as most significant bit, we have to shift left h and l,
     thus if the low sh-2 bits are not 000...000 nor 111...111 nor 111...110,
     then we can round correctly. To avoid an extra test we consider the latter
     case (if we can round, we can also round in the former case).
     For sh <= 3, we have mask <= 7, thus (mask>>2) <= 1, and the approximation
     cannot be enough. */
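  /* Editorial note: in the test below, mask >> 2 keeps the low sh-2 bits,
     and adding 2 maps the three dangerous patterns 000...000, 111...111
     and 111...110 to 2, 1 and 0 respectively, so a value greater than 2
     guarantees that correct rounding is possible from {h, l} alone. */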
  if (MPFR_LIKELY(((l + 2) & (mask >> 2)) > 2))
    sb = sb2 = 1; /* result cannot be exact in that case */
  else
    {
      mp_limb_t carry1, carry2;

      umul_ppmm (sb, sb2, bp[0], bp[0]);
      /* the full product is {h, l, sb + 2*v, sb2} */
      ADD_LIMB (sb, v, carry1);
      ADD_LIMB (l, carry1, carry2);
      h += carry2;
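      /* editorial note: v (the low half of b1*b0) is added a second time
         just below, since the square contains the cross term 2*b1*b0;
         each pass propagates a possible carry through l into h */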
      ADD_LIMB (sb, v, carry1);
      ADD_LIMB (l, carry1, carry2);
      h += carry2;
    }
  if (h < MPFR_LIMB_HIGHBIT)
    {
      ax --;
      h = (h << 1) | (l >> (GMP_NUMB_BITS - 1));
      l = (l << 1) | (sb >> (GMP_NUMB_BITS - 1));
      sb <<= 1;
      /* no need to shift sb2 since we only want to know if it is zero or not */
    }
  ap[1] = h;
  rb = l & (MPFR_LIMB_ONE << (sh - 1));
  sb |= ((l & mask) ^ rb) | sb2;
  ap[0] = l & ~mask;

  MPFR_SIGN(a) = MPFR_SIGN_POS;

  /* rounding */
  if (MPFR_UNLIKELY(ax > __gmpfr_emax))
    return mpfr_overflow (a, rnd_mode, MPFR_SIGN_POS);

  /* Warning: underflow should be checked *after* rounding, thus when rounding
     away and when a > 0.111...111*2^(emin-1), or when rounding to nearest and
     a >= 0.111...111[1]*2^(emin-1), there is no underflow. */
  if (MPFR_UNLIKELY(ax < __gmpfr_emin))
    {
      /* Note: like for mpfr_sqr_1, the case
         0.111...111*2^(emin-1) < a < 2^(emin-1) is not possible when emin is
         odd, since (modulo a shift) this would imply 1-2^(-p) < a = b^2 < 1,
         and this is not possible with 1-2^(-p) <= b < 1.
         For emin even, it is possible for some values of p, for example for
         p=69 with b=417402170410649030795*2^k. */
      if (ax == __gmpfr_emin - 1 &&
          ap[1] == MPFR_LIMB_MAX &&
          ap[0] == ~mask &&
          ((rnd_mode == MPFR_RNDN && rb) ||
           (MPFR_IS_LIKE_RNDA (rnd_mode, 0) && (rb | sb))))
        goto rounding; /* no underflow */
      /* for RNDN, mpfr_underflow always rounds away, thus for
         |a| <= 2^(emin-2) we have to change to RNDZ */
      if (rnd_mode == MPFR_RNDN &&
          (ax < __gmpfr_emin - 1 ||
           (ap[1] == MPFR_LIMB_HIGHBIT && ap[0] == 0 && (rb | sb) == 0)))
        rnd_mode = MPFR_RNDZ;
      return mpfr_underflow (a, rnd_mode, MPFR_SIGN_POS);
    }

 rounding:
  MPFR_EXP (a) = ax; /* Don't use MPFR_SET_EXP since ax might be < __gmpfr_emin
                        in the cases "goto rounding" above. */
  if ((rb == 0 && sb == 0) || rnd_mode == MPFR_RNDF)
    {
      MPFR_ASSERTD(ax >= __gmpfr_emin);
      MPFR_RET (0);
    }
  else if (rnd_mode == MPFR_RNDN)
    {
      if (rb == 0 || (sb == 0 && (ap[0] & (MPFR_LIMB_ONE << sh)) == 0))
        goto truncate;
      else
        goto add_one_ulp;
    }
  else if (MPFR_IS_LIKE_RNDZ (rnd_mode, 0))
    {
    truncate:
      MPFR_ASSERTD(ax >= __gmpfr_emin);
      MPFR_RET(-MPFR_SIGN_POS);
    }
  else /* round away from zero */
    {
    add_one_ulp:
      ap[0] += MPFR_LIMB_ONE << sh;
      ap[1] += (ap[0] == 0);
      if (ap[1] == 0)
        {
          ap[1] = MPFR_LIMB_HIGHBIT;
          if (MPFR_UNLIKELY(ax + 1 > __gmpfr_emax))
            return mpfr_overflow (a, rnd_mode, MPFR_SIGN_POS);
          MPFR_ASSERTD(ax + 1 <= __gmpfr_emax);
          MPFR_ASSERTD(ax + 1 >= __gmpfr_emin);
          MPFR_SET_EXP (a, ax + 1);
        }
      MPFR_RET(MPFR_SIGN_POS);
    }
}

/* Special code for 2*GMP_NUMB_BITS < prec(a) < 3*GMP_NUMB_BITS and
   2*GMP_NUMB_BITS < prec(b) <= 3*GMP_NUMB_BITS. */
static int
mpfr_sqr_3 (mpfr_ptr a, mpfr_srcptr b, mpfr_rnd_t rnd_mode, mpfr_prec_t p)
{
  mp_limb_t a0, a1, a2, h, l;
  mpfr_limb_ptr ap = MPFR_MANT(a);
  mpfr_exp_t ax = 2 * MPFR_GET_EXP(b);
  mpfr_prec_t sh = 3 * GMP_NUMB_BITS - p;
  mp_limb_t rb, sb, sb2, mask = MPFR_LIMB_MASK(sh);
  mp_limb_t *bp = MPFR_MANT(b);

  /* we store the upper 3-limb product in a2, a1, a0:
     b2^2, 2*b2*b1, 2*b2*b0+b1^2 */
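
  /* Editorial note on the computation below: writing B = 2^GMP_NUMB_BITS
     and b = b2*B^2 + b1*B + b0, we have
     b^2 = b2^2*B^4 + 2*b2*b1*B^3 + (2*b2*b0 + b1^2)*B^2 + 2*b1*b0*B + b0^2;
     only the three upper coefficients are accumulated here, and the
     ignored terms are covered by the error bound stated further down. */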

  /* first compute b2*b1 and b2*b0, which will be shifted by 1 */
  umul_ppmm (a1, a0, bp[2], bp[1]);
  umul_ppmm (h, l, bp[2], bp[0]);
  a0 += h;
  a1 += (a0 < h);
  /* now a1, a0 contains b2*b1 + floor(b2*b0/B): there can be no overflow
     since b2*b1*B + b2*b0 <= b2*(b1*B+b0) <= b2*(B^2-1) < B^3 */

  /* multiply a2, a1, a0 by 2 */
  a2 = a1 >> (GMP_NUMB_BITS - 1);
  a1 = (a1 << 1) | (a0 >> (GMP_NUMB_BITS - 1));
  a0 = (a0 << 1);

  /* add b2^2 */
  umul_ppmm (h, l, bp[2], bp[2]);
  a1 += l;
  a2 += h + (a1 < l);

  /* add b1^2 */
  umul_ppmm (h, l, bp[1], bp[1]);
  a0 += h;
  a1 += (a0 < h);
  a2 += (a1 == 0 && a0 < h);

  /* Now the approximate product {a2, a1, a0} has an error of less than
     5 ulps (3 ulps for the ignored low limbs of 2*b2*b0+b1^2,
     plus 2 ulps for the ignored 2*b1*b0 (plus b0^2)).
     Since we might shift by 1 bit, we make sure the low sh-2 bits of a0
     are not 0, -1, -2, -3 or -4. */

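  /* Editorial note, as in mpfr_sqr_2: adding 4 maps the five dangerous
     residues 0, -1, -2, -3, -4 modulo 2^(sh-2) to 4, 3, 2, 1 and 0, so a
     value greater than 4 in the test below proves that the approximation
     {a2, a1, a0} suffices for correct rounding. */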
  if (MPFR_LIKELY(((a0 + 4) & (mask >> 2)) > 4))
    sb = sb2 = 1; /* result cannot be exact in that case */
  else
    {
      mp_limb_t t[6];
      mpn_sqr (t, bp, 3);
      a2 = t[5];
      a1 = t[4];
      a0 = t[3];
      sb = t[2];
      sb2 = t[1] | t[0];
    }
  if (a2 < MPFR_LIMB_HIGHBIT)
    {
      ax --;
      a2 = (a2 << 1) | (a1 >> (GMP_NUMB_BITS - 1));
      a1 = (a1 << 1) | (a0 >> (GMP_NUMB_BITS - 1));
      a0 = (a0 << 1) | (sb >> (GMP_NUMB_BITS - 1));
      sb <<= 1;
      /* no need to shift sb2: we only need to know if it is zero or not */
    }
  ap[2] = a2;
  ap[1] = a1;
  rb = a0 & (MPFR_LIMB_ONE << (sh - 1));
  sb |= ((a0 & mask) ^ rb) | sb2;
  ap[0] = a0 & ~mask;

  MPFR_SIGN(a) = MPFR_SIGN_POS;

  /* rounding */
  if (MPFR_UNLIKELY(ax > __gmpfr_emax))
    return mpfr_overflow (a, rnd_mode, MPFR_SIGN_POS);

  /* Warning: underflow should be checked *after* rounding, thus when rounding
     away and when a > 0.111...111*2^(emin-1), or when rounding to nearest and
     a >= 0.111...111[1]*2^(emin-1), there is no underflow. */
  if (MPFR_UNLIKELY(ax < __gmpfr_emin))
    {
      if (ax == __gmpfr_emin - 1 &&
          ap[2] == MPFR_LIMB_MAX &&
          ap[1] == MPFR_LIMB_MAX &&
          ap[0] == ~mask &&
          ((rnd_mode == MPFR_RNDN && rb) ||
           (MPFR_IS_LIKE_RNDA (rnd_mode, 0) && (rb | sb))))
        goto rounding; /* no underflow */
      /* for RNDN, mpfr_underflow always rounds away, thus for |a| <= 2^(emin-2)
         we have to change to RNDZ */
      if (rnd_mode == MPFR_RNDN &&
          (ax < __gmpfr_emin - 1 ||
           (ap[2] == MPFR_LIMB_HIGHBIT && ap[1] == 0 && ap[0] == 0
            && (rb | sb) == 0)))
        rnd_mode = MPFR_RNDZ;
      return mpfr_underflow (a, rnd_mode, MPFR_SIGN_POS);
    }

 rounding:
  MPFR_EXP (a) = ax; /* Don't use MPFR_SET_EXP since ax might be < __gmpfr_emin
                        in the cases "goto rounding" above. */
  if ((rb == 0 && sb == 0) || rnd_mode == MPFR_RNDF)
    {
      MPFR_ASSERTD(ax >= __gmpfr_emin);
      MPFR_RET (0);
    }
  else if (rnd_mode == MPFR_RNDN)
    {
      if (rb == 0 || (sb == 0 && (ap[0] & (MPFR_LIMB_ONE << sh)) == 0))
        goto truncate;
      else
        goto add_one_ulp;
    }
  else if (MPFR_IS_LIKE_RNDZ (rnd_mode, 0))
    {
    truncate:
      MPFR_ASSERTD(ax >= __gmpfr_emin);
      MPFR_RET(-MPFR_SIGN_POS);
    }
  else /* round away from zero */
    {
    add_one_ulp:
      ap[0] += MPFR_LIMB_ONE << sh;
      ap[1] += (ap[0] == 0);
      ap[2] += (ap[1] == 0) && (ap[0] == 0);
      if (ap[2] == 0)
        {
          ap[2] = MPFR_LIMB_HIGHBIT;
          if (MPFR_UNLIKELY(ax + 1 > __gmpfr_emax))
            return mpfr_overflow (a, rnd_mode, MPFR_SIGN_POS);
          MPFR_ASSERTD(ax + 1 <= __gmpfr_emax);
          MPFR_ASSERTD(ax + 1 >= __gmpfr_emin);
          MPFR_SET_EXP (a, ax + 1);
        }
      MPFR_RET(MPFR_SIGN_POS);
    }
}

#endif /* !defined(MPFR_GENERIC_ABI) && ... */

/* Note: mpfr_sqr will call mpfr_mul if bn > MPFR_SQR_THRESHOLD,
   in order to use Mulders' mulhigh, which is handled only there (in
   mpfr_mul) to avoid partial code duplication. There is some overhead
   due to the additional tests, but the slowdown should not be noticeable,
   as this path is only taken for large precisions. */

int
mpfr_sqr (mpfr_ptr a, mpfr_srcptr b, mpfr_rnd_t rnd_mode)
{
  int cc, inexact;
  mpfr_exp_t ax;
  mp_limb_t *tmp;
  mp_limb_t b1;
  mpfr_prec_t aq, bq;
  mp_size_t bn, tn;
  MPFR_TMP_DECL(marker);

  MPFR_LOG_FUNC
    (("x[%Pd]=%.*Rg rnd=%d", mpfr_get_prec (b), mpfr_log_prec, b, rnd_mode),
     ("y[%Pd]=%.*Rg inexact=%d",
      mpfr_get_prec (a), mpfr_log_prec, a, inexact));

  /* deal with special cases */
  if (MPFR_UNLIKELY(MPFR_IS_SINGULAR(b)))
    {
      if (MPFR_IS_NAN(b))
        {
          MPFR_SET_NAN(a);
          MPFR_RET_NAN;
        }
      MPFR_SET_POS (a);
      if (MPFR_IS_INF(b))
        MPFR_SET_INF(a);
      else
        ( MPFR_ASSERTD(MPFR_IS_ZERO(b)), MPFR_SET_ZERO(a) );
      MPFR_RET(0);
    }
  aq = MPFR_GET_PREC(a);
  bq = MPFR_GET_PREC(b);

#if !defined(MPFR_GENERIC_ABI) && (GMP_NUMB_BITS == 32 || GMP_NUMB_BITS == 64)
  if (aq == bq)
    {
      if (aq < GMP_NUMB_BITS)
        return mpfr_sqr_1 (a, b, rnd_mode, aq);

      if (GMP_NUMB_BITS < aq && aq < 2 * GMP_NUMB_BITS)
        return mpfr_sqr_2 (a, b, rnd_mode, aq);

      if (aq == GMP_NUMB_BITS)
        return mpfr_sqr_1n (a, b, rnd_mode);

      if (2 * GMP_NUMB_BITS < aq && aq < 3 * GMP_NUMB_BITS)
        return mpfr_sqr_3 (a, b, rnd_mode, aq);
    }
#endif

  ax = 2 * MPFR_GET_EXP (b);
  MPFR_ASSERTN (2 * (mpfr_uprec_t) bq <= MPFR_PREC_MAX);

  bn = MPFR_LIMB_SIZE (b); /* number of limbs of b */
  tn = MPFR_PREC2LIMBS (2 * bq); /* number of limbs of square,
                                    2*bn or 2*bn-1 */

  if (MPFR_UNLIKELY(bn > MPFR_SQR_THRESHOLD))
    /* the following line should not be replaced by mpfr_sqr,
       otherwise we'll get an infinite loop! */
    return mpfr_mul (a, b, b, rnd_mode);

  MPFR_TMP_MARK(marker);
  tmp = MPFR_TMP_LIMBS_ALLOC (2 * bn);

  /* Square the mantissa in temporarily allocated space */
  mpn_sqr (tmp, MPFR_MANT(b), bn);
  b1 = tmp[2 * bn - 1];

  /* now tmp[0]..tmp[2*bn-1] contains the square of the mantissa of b,
     with tmp[2*bn-1] >= 2^(GMP_NUMB_BITS-2) */
  b1 >>= GMP_NUMB_BITS - 1; /* msb of the square */

  /* if the mantissa of b is uniformly distributed in ]1/2, 1], then its
     square is in ]1/4, 1/2] with probability sqrt(2)-1 ~ 0.414 and in
     [1/2, 1] with probability 2-sqrt(2) ~ 0.586 */
  tmp += 2 * bn - tn; /* +0 or +1 */
  if (MPFR_UNLIKELY(b1 == 0))
    mpn_lshift (tmp, tmp, tn, 1); /* tn <= 2*bn, so no buffer overflow */

  cc = mpfr_round_raw (MPFR_MANT (a), tmp, 2 * bq, 0, aq, rnd_mode, &inexact);
  /* cc = 1 ==> result is a power of two */
  if (MPFR_UNLIKELY(cc))
    MPFR_MANT(a)[MPFR_LIMB_SIZE(a)-1] = MPFR_LIMB_HIGHBIT;

  MPFR_TMP_FREE(marker);
  {
    mpfr_exp_t ax2 = ax + ((int) b1 - 1 + cc);
    if (MPFR_UNLIKELY( ax2 > __gmpfr_emax))
      return mpfr_overflow (a, rnd_mode, MPFR_SIGN_POS);
    if (MPFR_UNLIKELY( ax2 < __gmpfr_emin))
      {
        /* In the round-to-nearest mode, if the exponent of the exact
           result (i.e. before rounding, i.e. without taking cc into account)
           is < __gmpfr_emin - 1, or the exact result is a power of 2 (i.e.
           b is a power of 2), then round to zero. */
        if (rnd_mode == MPFR_RNDN &&
            (ax + (mpfr_exp_t) b1 < __gmpfr_emin || mpfr_powerof2_raw (b)))
          rnd_mode = MPFR_RNDZ;
        return mpfr_underflow (a, rnd_mode, MPFR_SIGN_POS);
      }
    MPFR_SET_EXP (a, ax2);
    MPFR_SET_POS (a);
  }
  MPFR_RET (inexact);
}
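
/* Editorial usage sketch (not part of this file): squaring a number through
   the public MPFR interface; compile separately and link with -lmpfr -lgmp.

     #include <stdio.h>
     #include <mpfr.h>

     int main (void)
     {
       mpfr_t x, y;
       mpfr_init2 (x, 53);             // 53-bit precision, as for double
       mpfr_init2 (y, 53);
       mpfr_set_d (x, 1.5, MPFR_RNDN);
       int inex = mpfr_sqr (y, x, MPFR_RNDN); // y = x^2 = 2.25, exact
       mpfr_printf ("y = %.10Rg, inexact = %d\n", y, inex); // inex = 0
       mpfr_clear (x);
       mpfr_clear (y);
       return 0;
     }
*/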