mpfr-4.2.1/src/rec_sqrt.c
/* mpfr_rec_sqrt -- inverse square root

Copyright 2008-2023 Free Software Foundation, Inc.
Contributed by the AriC and Caramba projects, INRIA.

This file is part of the GNU MPFR Library.

The GNU MPFR Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 3 of the License, or (at your
option) any later version.

The GNU MPFR Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public License
along with the GNU MPFR Library; see the file COPYING.LESSER.  If not, see
https://www.gnu.org/licenses/ or write to the Free Software Foundation, Inc.,
51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */

#define MPFR_NEED_LONGLONG_H /* for umul_ppmm */
#include "mpfr-impl.h"

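/* LIMB_SIZE(x): number of limbs needed to store x bits; e.g., with
   GMP_NUMB_BITS = 64, LIMB_SIZE(64) = 1 and LIMB_SIZE(65) = 2. */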
#define LIMB_SIZE(x) ((((x)-1)>>MPFR_LOG2_GMP_NUMB_BITS) + 1)

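/* MPFR_COM_N(x,y,n): store in {x, n} the limb-wise one's complement
   of {y, n} */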
#define MPFR_COM_N(x,y,n)                               \
  do                                                    \
    {                                                   \
      mp_size_t i;                                      \
      for (i = 0; i < n; i++)                           \
        *((x)+i) = ~*((y)+i);                           \
    }                                                   \
  while (0)

/* Put in X a p-bit approximation of 1/sqrt(A),
   where X = {x, n}/B^n, n = ceil(p/GMP_NUMB_BITS),
   A = 2^(1+as)*{a, an}/B^an, as is 0 or 1, an = ceil(ap/GMP_NUMB_BITS),
   where B = 2^GMP_NUMB_BITS.

   We have 1 <= A < 4 and 1/2 <= X < 1.

   The error in the approximate result with respect to the true
   value 1/sqrt(A) is bounded by 1 ulp(X), i.e., 2^{-p} since 1/2 <= X < 1.

   Note: x and a are left-aligned, i.e., the most significant bit of
   a[an-1] is set, and so is the most significant bit of the output x[n-1].

   If p is not a multiple of GMP_NUMB_BITS, the extra low bits of the input
   A are taken into account to compute the approximation of 1/sqrt(A), but
   whether or not they are zero, the error between X and 1/sqrt(A) is bounded
   by 1 ulp(X) [in precision p].
   The extra low bits of the output X (if p is not a multiple of GMP_NUMB_BITS)
   are set to 0.

   Assumptions:
   (1) A should be normalized, i.e., the most significant bit of a[an-1]
       should be 1. If as=0, we have 1 <= A < 2; if as=1, we have 2 <= A < 4.
   (2) p >= 12
   (3) {a, an} and {x, n} should not overlap
   (4) GMP_NUMB_BITS is even (the code below also supports GMP_NUMB_BITS = 8)

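   Example: with GMP_NUMB_BITS = 64 and p = 100, we have n = 2, so that
   X = {x, 2}/B^2 approximates 1/sqrt(A) with error at most 2^{-100},
   and the low 2*64 - 100 = 28 bits of x[0] are set to 0.
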
   Note: this routine is much more efficient when ap is small compared to p,
   including the case where ap <= GMP_NUMB_BITS, thus it can be used to
   implement an efficient mpfr_rec_sqrt_ui function.

   References:
   [1] Modern Computer Algebra, Richard Brent and Paul Zimmermann,
   https://members.loria.fr/PZimmermann/mca/pub226.html
*/
static void
mpfr_mpn_rec_sqrt (mpfr_limb_ptr x, mpfr_prec_t p,
                   mpfr_limb_srcptr a, mpfr_prec_t ap, int as)
{
  /* the following T1 and T2 are bipartite tables giving initial
     approximation for the inverse square root, with 13-bit input split in
     5+4+4, and 11-bit output. More precisely, if 2048 <= i < 8192,
     with i = a*2^8 + b*2^4 + c, we use for approximation of
     2048/sqrt(i/2048) the value x = T1[16*(a-8)+b] + T2[16*(a-8)+c].
     The largest error is obtained for i = 2054, where x = 2044,
     and 2048/sqrt(i/2048) = 2045.006576...
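     As a check, i = 2054 splits as a = 8, b = 0, c = 6, so that
     x = T1[16*0 + 0] + T2[16*0 + 6] = 2040 + 4 = 2044, as stated above.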
  */
  static short int T1[384] = {
2040, 2033, 2025, 2017, 2009, 2002, 1994, 1987, 1980, 1972, 1965, 1958, 1951,
1944, 1938, 1931, /* a=8 */
1925, 1918, 1912, 1905, 1899, 1892, 1886, 1880, 1874, 1867, 1861, 1855, 1849,
1844, 1838, 1832, /* a=9 */
1827, 1821, 1815, 1810, 1804, 1799, 1793, 1788, 1783, 1777, 1772, 1767, 1762,
1757, 1752, 1747, /* a=10 */
1742, 1737, 1733, 1728, 1723, 1718, 1713, 1709, 1704, 1699, 1695, 1690, 1686,
1681, 1677, 1673, /* a=11 */
1669, 1664, 1660, 1656, 1652, 1647, 1643, 1639, 1635, 1631, 1627, 1623, 1619,
1615, 1611, 1607, /* a=12 */
1603, 1600, 1596, 1592, 1588, 1585, 1581, 1577, 1574, 1570, 1566, 1563, 1559,
1556, 1552, 1549, /* a=13 */
1545, 1542, 1538, 1535, 1532, 1528, 1525, 1522, 1518, 1515, 1512, 1509, 1505,
1502, 1499, 1496, /* a=14 */
1493, 1490, 1487, 1484, 1481, 1478, 1475, 1472, 1469, 1466, 1463, 1460, 1457,
1454, 1451, 1449, /* a=15 */
1446, 1443, 1440, 1438, 1435, 1432, 1429, 1427, 1424, 1421, 1419, 1416, 1413,
1411, 1408, 1405, /* a=16 */
1403, 1400, 1398, 1395, 1393, 1390, 1388, 1385, 1383, 1380, 1378, 1375, 1373,
1371, 1368, 1366, /* a=17 */
1363, 1360, 1358, 1356, 1353, 1351, 1349, 1346, 1344, 1342, 1340, 1337, 1335,
1333, 1331, 1329, /* a=18 */
1327, 1325, 1323, 1321, 1319, 1316, 1314, 1312, 1310, 1308, 1306, 1304, 1302,
1300, 1298, 1296, /* a=19 */
1294, 1292, 1290, 1288, 1286, 1284, 1282, 1280, 1278, 1276, 1274, 1272, 1270,
1268, 1266, 1265, /* a=20 */
1263, 1261, 1259, 1257, 1255, 1253, 1251, 1250, 1248, 1246, 1244, 1242, 1241,
1239, 1237, 1235, /* a=21 */
1234, 1232, 1230, 1229, 1227, 1225, 1223, 1222, 1220, 1218, 1217, 1215, 1213,
1212, 1210, 1208, /* a=22 */
1206, 1204, 1203, 1201, 1199, 1198, 1196, 1195, 1193, 1191, 1190, 1188, 1187,
1185, 1184, 1182, /* a=23 */
1181, 1180, 1178, 1177, 1175, 1174, 1172, 1171, 1169, 1168, 1166, 1165, 1163,
1162, 1160, 1159, /* a=24 */
1157, 1156, 1154, 1153, 1151, 1150, 1149, 1147, 1146, 1144, 1143, 1142, 1140,
1139, 1137, 1136, /* a=25 */
1135, 1133, 1132, 1131, 1129, 1128, 1127, 1125, 1124, 1123, 1121, 1120, 1119,
1117, 1116, 1115, /* a=26 */
1114, 1113, 1111, 1110, 1109, 1108, 1106, 1105, 1104, 1103, 1101, 1100, 1099,
1098, 1096, 1095, /* a=27 */
1093, 1092, 1091, 1090, 1089, 1087, 1086, 1085, 1084, 1083, 1081, 1080, 1079,
1078, 1077, 1076, /* a=28 */
1075, 1073, 1072, 1071, 1070, 1069, 1068, 1067, 1065, 1064, 1063, 1062, 1061,
1060, 1059, 1058, /* a=29 */
1057, 1056, 1055, 1054, 1052, 1051, 1050, 1049, 1048, 1047, 1046, 1045, 1044,
1043, 1042, 1041, /* a=30 */
1040, 1039, 1038, 1037, 1036, 1035, 1034, 1033, 1032, 1031, 1030, 1029, 1028,
1027, 1026, 1025 /* a=31 */
};
  static unsigned char T2[384] = {
    7, 7, 6, 6, 5, 5, 4, 4, 4, 3, 3, 2, 2, 1, 1, 0, /* a=8 */
    6, 5, 5, 5, 4, 4, 3, 3, 3, 2, 2, 2, 1, 1, 0, 0, /* a=9 */
    5, 5, 4, 4, 4, 3, 3, 3, 2, 2, 2, 1, 1, 1, 0, 0, /* a=10 */
    4, 4, 3, 3, 3, 3, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, /* a=11 */
    3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, /* a=12 */
    3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, /* a=13 */
    3, 3, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 0, 0, 0, 0, /* a=14 */
    2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, /* a=15 */
    2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, /* a=16 */
    2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, /* a=17 */
    3, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 0, /* a=18 */
    2, 2, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, /* a=19 */
    1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, /* a=20 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, /* a=21 */
    1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* a=22 */
    2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, /* a=23 */
    1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* a=24 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, /* a=25 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, /* a=26 */
    1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* a=27 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, /* a=28 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, /* a=29 */
    1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* a=30 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* a=31 */
};
  mp_size_t n = LIMB_SIZE(p);   /* number of limbs of X */
  mp_size_t an = LIMB_SIZE(ap); /* number of limbs of A */

  /* A should be normalized */
  MPFR_ASSERTD((a[an - 1] & MPFR_LIMB_HIGHBIT) != 0);
  /* GMP_NUMB_BITS should be even. Since that does not depend on MPFR,
     we always check this. */
  MPFR_STAT_STATIC_ASSERT ((GMP_NUMB_BITS & 1) == 0);
  /* {a, an} and {x, n} should not overlap */
  MPFR_ASSERTD((a + an <= x) || (x + n <= a));
  /* p should be at least 11 (p = 11 occurs only via the recursive calls) */
  MPFR_ASSERTD(p >= 11);
  if (MPFR_UNLIKELY(an > n)) /* we can cut the input to n limbs */
    {
      a += an - n;
      an = n;
    }

  if (p == 11) /* should happen only from recursive calls */
    {
      unsigned long i, ab, ac;
      unsigned int t;

      /* take the 12+as most significant bits of A */
#if GMP_NUMB_BITS >= 16
      i = a[an - 1] >> (GMP_NUMB_BITS - (12 + as));
#else
      MPFR_STAT_STATIC_ASSERT (GMP_NUMB_BITS == 8);
      {
        unsigned int a12 = a[an - 1] << 8;
        if (an >= 2)
          a12 |= a[an - 2];
        MPFR_ASSERTN(GMP_NUMB_BITS >= 4 + as);
        i = a12 >> (GMP_NUMB_BITS - (4 + as));
      }
#endif
      /* if one wants faithful rounding for p=11, replace #if 0 by #if 1 */
      ab = i >> 4;
      ac = (ab & 0x3F0) | (i & 0x0F);
      t = T1[ab - 0x80] + T2[ac - 0x80];  /* fits on 16 bits */
#if GMP_NUMB_BITS >= 16
      /* x has only one limb */
      x[0] = (mp_limb_t) t << (GMP_NUMB_BITS - p);
#else
      MPFR_STAT_STATIC_ASSERT (GMP_NUMB_BITS == 8);
      MPFR_ASSERTD (1024 <= t && t <= 2047);
      x[1] = t >> 3; /* 128 <= x[1] <= 255 */
      x[0] = MPFR_LIMB_LSHIFT(t, 5);
#endif
    }
  else /* p >= 12 */
    {
      mpfr_prec_t h, pl;
      mpfr_limb_ptr r, s, t, u;
      mp_size_t xn, rn, th, ln, tn, sn, ahn, un;
      mp_limb_t neg, cy, cu;
      MPFR_TMP_DECL(marker);

      /* compared to Algorithm 3.9 of [1], we have {a, an} = A/2 if as=0,
         and A/4 if as=1. */
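
      /* The underlying iteration is Newton's method for 1/sqrt(A):
         X <- X + X*(1 - A*X^2)/2, which roughly doubles the number of
         correct bits at each step; see Algorithm 3.9 of [1]. */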

      /* h = max(11, ceil((p+3)/2)) is the bitsize of the recursive call */
      h = (p < 18) ? 11 : (p >> 1) + 2;
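      /* e.g., p = 100 gives h = 52, and the recursion uses precisions
         52, 28, 16, then 11, where the table lookup above applies */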

      xn = LIMB_SIZE(h);       /* limb size of the recursive Xh */
      rn = LIMB_SIZE(2 * h);   /* a priori limb size of Xh^2 */
      ln = n - xn;             /* remaining limbs to be computed */

      /* Since |Xh - A^{-1/2}| <= 2^{-h}, multiplying by Xh + A^{-1/2}
         gives |Xh^2 - 1/A| <= 2^{-h+1}, thus |A*Xh^2 - 1| <= 2^{-h+3},
         thus the h-3 most significant bits of t should be zero,
         which is in fact h+1+as-3 because of the normalization of A.
         This corresponds to th=floor((h+1+as-3)/GMP_NUMB_BITS) limbs.

         More precisely we have |Xh^2 - 1/A| <= 2^{-h} * (Xh + A^{-1/2})
         <= 2^{-h} * (2 A^{-1/2} + 2^{-h}) <= 2.001 * 2^{-h} * A^{-1/2}
         since A < 4 and h >= 11, thus
         |A*Xh^2 - 1| <= 2.001 * 2^{-h} * A^{1/2} <= 1.001 * 2^{2-h}.
         This is sufficient to prove that the upper limb of {t,tn} below is
         less than 0.501 * 2^GMP_NUMB_BITS, thus cu = 0 below.
      */
      th = (h + 1 + as - 3) >> MPFR_LOG2_GMP_NUMB_BITS;
      tn = LIMB_SIZE(2 * h + 1 + as);
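      /* e.g., with GMP_NUMB_BITS = 64, p = 100, h = 52, as = 0:
         th = (52 + 1 - 3) >> 6 = 0 and tn = LIMB_SIZE(105) = 2 */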

      /* we need h+1+as bits of a */
      ahn = LIMB_SIZE(h + 1 + as); /* number of high limbs of A
                                      needed for the recursive call */
      if (MPFR_UNLIKELY(ahn > an))
        ahn = an;
      mpfr_mpn_rec_sqrt (x + ln, h, a + an - ahn, ahn * GMP_NUMB_BITS, as);
      /* the h most significant bits of X are set, X has ceil(h/GMP_NUMB_BITS)
         limbs, the low (-h) % GMP_NUMB_BITS bits are zero */

      /* compared to Algorithm 3.9 of [1], we have {x+ln,xn} = X_h */

      MPFR_TMP_MARK (marker);
      /* first step: square X in r, result is exact */
      un = xn + (tn - th);
      /* We use the same temporary buffer to store r and u: r needs 2*xn
         limbs while u needs xn+(tn-th) limbs. Since tn can store at least
         2h bits, and th at most h bits, tn-th can store at least h bits,
         thus tn - th >= xn, and reserving the space for u is enough. */
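      /* (p = 100 example continued: xn = LIMB_SIZE(52) = 1, tn = 2, th = 0,
         so un = 3 and indeed 2 * xn = 2 <= 3) */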
      MPFR_ASSERTD(2 * xn <= un);
      u = r = MPFR_TMP_LIMBS_ALLOC (un);
      if (2 * h <= GMP_NUMB_BITS) /* xn=rn=1, and since p <= 2h-3, n=1,
                                     thus ln = 0 */
        {
          MPFR_ASSERTD(ln == 0);
          cy = x[0] >> (GMP_NUMB_BITS >> 1);
          r ++;
          r[0] = cy * cy;
        }
      else if (xn == 1) /* xn=1, rn=2 */
        umul_ppmm(r[1], r[0], x[ln], x[ln]);
      else
        {
          mpn_sqr (r, x + ln, xn);
          /* we have {r, 2*xn} = X_h^2 */
          if (rn < 2 * xn)
            r ++;
        }
      /* now the 2h most significant bits of {r, rn} contain X^2, r has rn
         limbs, and the low (-2h) % GMP_NUMB_BITS bits are zero */

      /* Second step: s <- A * X^2 (i.e., A times {r, rn}), and truncate
         the low ap bits, i.e., at weight 2^{-2h} (s is aligned to the low
         significant bits) */
      sn = an + rn;
      s = MPFR_TMP_LIMBS_ALLOC (sn);
      if (rn == 1) /* rn=1 implies n=1, since rn*GMP_NUMB_BITS >= 2h,
                      and 2h >= p+3 */
        {
          /* necessarily p <= GMP_NUMB_BITS-3: we can ignore the two low
             bits from A */
          /* since n=1, and we ensured an <= n, we also have an=1 */
          MPFR_ASSERTD(an == 1);
          umul_ppmm (s[1], s[0], r[0], a[0]);
        }
      else
        {
          /* we have p <= n * GMP_NUMB_BITS
             2h <= rn * GMP_NUMB_BITS with p+3 <= 2h <= p+4
             thus n <= rn <= n + 1 */
          MPFR_ASSERTD(rn <= n + 1);
          /* since we ensured an <= n, we have an <= rn */
          MPFR_ASSERTD(an <= rn);
          mpn_mul (s, r, rn, a, an);
          /* s should be near B^sn/2^(1+as), thus s[sn-1] is either
             100000... or 011111... if as=0, or
             010000... or 001111... if as=1.
             We ignore the bits of s after the first 2h+1+as ones.
             We have {s, rn+an} = A*X_h^2/2 if as=0, A*X_h^2/4 if as=1. */
        }

      /* We ignore the bits of s after the first 2h+1+as ones: s has rn + an
         limbs, where rn = LIMBS(2h), an = LIMBS(ap), and
         tn = LIMBS(2h+1+as). */
      t = s + sn - tn; /* pointer to low limb of the high part of t */
      /* the upper h-3 bits of 1-t should be zero,
         where 1 corresponds to the most significant bit of t[tn-1] if as=0,
         and to the 2nd most significant bit of t[tn-1] if as=1 */

      /* compute t <- 1 - t, which is B^tn - {t, tn+1},
         with rounding toward -Inf, i.e., rounding the input t toward +Inf.
         We could only modify the low tn - th limbs from t, but it gives only
         a small speedup, and would make the code more complex.
      */
      neg = t[tn - 1] & (MPFR_LIMB_HIGHBIT >> as);
      if (neg == 0) /* Ax^2 < 1: we have t = th + eps, where 0 <= eps < ulp(th)
                       is the part truncated above, thus 1 - t rounded to -Inf
                       is 1 - th - ulp(th) */
        {
          /* since the 1+as most significant bits of t are zero, set them
             to 1 before the one-complement */
          t[tn - 1] |= MPFR_LIMB_HIGHBIT | (MPFR_LIMB_HIGHBIT >> as);
          MPFR_COM_N (t, t, tn);
          /* we should add 1 here to complete the two's complement (giving
             1-th), and subtract 1 for -ulp(th), thus we do nothing */
        }
      else /* negative case: we want 1 - t rounded toward -Inf, i.e.,
              th + eps rounded toward +Inf, which is th + ulp(th):
              we discard the bit corresponding to 1,
              and we add 1 to the least significant bit of t */
        {
          t[tn - 1] ^= neg;
          mpn_add_1 (t, t, tn, 1);
        }
      tn -= th; /* we know at least th = floor((h+1+as-3)/GMP_NUMB_BITS) of
                   the high limbs of {t, tn} are zero */

      /* tn >= rn - th, where rn * GMP_NUMB_BITS >= 2*h and
         th * GMP_NUMB_BITS <= h+1+as-3, thus tn > 0 */
      MPFR_ASSERTD(tn > 0);

      /* u <- x * t, where {t, tn} contains at least h+3 bits,
         and {x, xn} contains h bits, thus tn >= xn */
      MPFR_ASSERTD(tn >= xn);
      if (tn == 1) /* necessarily xn=1 */
        umul_ppmm (u[1], u[0], t[0], x[ln]);
      else
        mpn_mul (u, t, tn, x + ln, xn);

      /* we have {u, tn+xn} = T_l X_h/2 if as=0, T_l X_h/4 if as=1 */

      /* we have already discarded the upper th high limbs of t, thus we only
         have to consider the upper n - th limbs of u */
      un = n - th; /* un cannot be zero, since p <= n*GMP_NUMB_BITS,
                      h = ceil((p+3)/2) <= (p+4)/2,
                      th*GMP_NUMB_BITS <= h-1 <= p/2+1,
                      thus (n-th)*GMP_NUMB_BITS >= p/2-1.
                   */
      MPFR_ASSERTD(un > 0);
      u += (tn + xn) - un; /* xn + tn - un = xn + (original_tn - th) - (n - th)
                                           = xn + original_tn - n
                              = LIMBS(h) + LIMBS(2h+1+as) - LIMBS(p) > 0
                              since 2h >= p+3 */
      MPFR_ASSERTD(tn + xn > un); /* allows us to access u[-1] below */

      /* In case as=0, u contains |x*(1-Ax^2)/2|, which is exactly what we
         need to add or subtract.
         In case as=1, u contains |x*(1-Ax^2)/4|, thus we need to multiply
         u by 2. */

      if (as == 1)
        /* shift on un+1 limbs to get most significant bit of u[-1] into
           least significant bit of u[0] */
        mpn_lshift (u - 1, u - 1, un + 1, 1);

      /* now {u,un} represents U / 2 from Algorithm 3.9 */

      pl = n * GMP_NUMB_BITS - p;       /* low bits from x */
      /* We want the low pl bits to be zero after rounding to nearest,
         thus we round u to nearest at bit pl-1 of u[0] */
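      /* (p = 100, n = 2 example with 64-bit limbs: pl = 128 - 100 = 28,
         so the round bit is bit 27 of u[0], and bits 0..27 are then
         cleared) */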
      if (pl > 0)
        {
          cu = mpn_add_1 (u, u, un,
                          u[0] & MPFR_LIMB_LSHIFT(MPFR_LIMB_ONE, pl - 1));
          /* mask bits 0..pl-1 of u[0] */
          u[0] &= MPFR_LIMB(~MPFR_LIMB_MASK(pl));
        }
      else /* round bit is in u[-1] */
        cu = mpn_add_1 (u, u, un, u[-1] >> (GMP_NUMB_BITS - 1));
      MPFR_ASSERTN(cu == 0);

      /* We already have filled {x + ln, xn = n - ln}, and we want to add or
         subtract {u, un} at position x.
         un = n - th, where the th discarded limbs contain <= h+1+as-3 <= h-1
         bits; ln = n - xn, where the xn limbs contain >= h bits;
         thus un > ln.
         Warning: ln might be zero.
      */
      MPFR_ASSERTD(un > ln);
      /* we can have un = ln + 2, for example with GMP_NUMB_BITS=32 and
         p=62, as=0, then h=33, n=2, th=0, xn=2, thus un=2 and ln=0. */
      MPFR_ASSERTD(un == ln + 1 || un == ln + 2);
      /* the high un-ln limbs of u will overlap the low part of {x+ln,xn},
         we need to add or subtract the overlapping part {u + ln, un - ln} */
      /* Warning! th may be 0, in which case the mpn_add_1 and mpn_sub_1
         below (with size = th) mustn't be used. */
      if (neg == 0)
        {
          if (ln > 0)
            MPN_COPY (x, u, ln);
          cy = mpn_add (x + ln, x + ln, xn, u + ln, un - ln);
          /* cy is the carry at x + (ln + xn) = x + n */
        }
      else /* negative case */
        {
          /* subtract {u+ln, un-ln} from {x+ln, xn} */
          cy = mpn_sub (x + ln, x + ln, xn, u + ln, un - ln);
          /* cy is the borrow at x + (ln + xn) = x + n */

          /* cy must be zero, since the most significant bit of Xh is 1,
             and the correction is bounded by 2^{-h+3} */
          MPFR_ASSERTD(cy == 0);
          if (ln > 0)
            {
              MPFR_COM_N (x, u, ln);
              /* we must add one for the 2-complement ... */
              cy = mpn_add_1 (x, x, n, MPFR_LIMB_ONE);
              /* ... and subtract 1 at x[ln], where n = ln + xn */
              cy -= mpn_sub_1 (x + ln, x + ln, xn, MPFR_LIMB_ONE);
            }
        }

      /* cy can be 1 when A=1, i.e., {a, n} = B^n/2 with as=0. In that case
         we should have X = B^n, and setting X to 1-2^{-p} satisfies the
         error bound of 1 ulp. */
      if (MPFR_UNLIKELY(cy != 0))
        {
          cy -= mpn_sub_1 (x, x, n, MPFR_LIMB_ONE << pl);
          MPFR_ASSERTD(cy == 0);
        }

      MPFR_TMP_FREE (marker);
    }
}

int
mpfr_rec_sqrt (mpfr_ptr r, mpfr_srcptr u, mpfr_rnd_t rnd_mode)
{
  mpfr_prec_t rp, up, wp;
  mp_size_t rn, wn;
  int s, cy, inex;
  mpfr_limb_ptr x;
  MPFR_TMP_DECL(marker);
  MPFR_ZIV_DECL (loop);

  MPFR_LOG_FUNC
    (("x[%Pd]=%.*Rg rnd=%d", mpfr_get_prec (u), mpfr_log_prec, u, rnd_mode),
     ("y[%Pd]=%.*Rg inexact=%d", mpfr_get_prec (r), mpfr_log_prec, r, inex));

  /* special values */
  if (MPFR_UNLIKELY(MPFR_IS_SINGULAR(u)))
    {
      if (MPFR_IS_NAN(u))
        {
          MPFR_SET_NAN(r);
          MPFR_RET_NAN;
        }
      else if (MPFR_IS_ZERO(u)) /* 1/sqrt(+0) = 1/sqrt(-0) = +Inf */
        {
          /* +0 or -0 */
          MPFR_SET_INF(r);
          MPFR_SET_POS(r);
          MPFR_SET_DIVBY0 ();
          MPFR_RET(0); /* Inf is exact */
        }
      else
        {
          MPFR_ASSERTD(MPFR_IS_INF(u));
          /* 1/sqrt(-Inf) = NAN */
          if (MPFR_IS_NEG(u))
            {
              MPFR_SET_NAN(r);
              MPFR_RET_NAN;
            }
          /* 1/sqrt(+Inf) = +0 */
          MPFR_SET_POS(r);
          MPFR_SET_ZERO(r);
          MPFR_RET(0);
        }
    }

  /* if u < 0, 1/sqrt(u) is NaN */
  if (MPFR_UNLIKELY(MPFR_IS_NEG(u)))
    {
      MPFR_SET_NAN(r);
      MPFR_RET_NAN;
    }

  MPFR_SET_POS(r);

  rp = MPFR_PREC(r); /* output precision */
  up = MPFR_PREC(u); /* input precision */
  wp = rp + 11;      /* initial working precision */

  /* Let u = U*2^e, where e = EXP(u), and 1/2 <= U < 1.
     If e is even, we compute an approximation X of (4U)^{-1/2},
     and the result is X*2^(-(e-2)/2) [case s=1].
     If e is odd, we compute an approximation X of (2U)^{-1/2},
     and the result is X*2^(-(e-1)/2) [case s=0]. */
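  /* e.g., u = 2 = 0.5 * 2^2: e is even, so s = 1 and we approximate
     X = (4*(1/2))^{-1/2} = 1/sqrt(2), returning X * 2^0 = 0.707... */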

  /* parity of the exponent of u */
  s = 1 - ((mpfr_uexp_t) MPFR_GET_EXP (u) & 1);

  rn = LIMB_SIZE(rp);

  /* for the first iteration, if rp + 11 fits into rn limbs, we round wp
     up to a full limb to maximize the chance of being able to round,
     while avoiding the allocation of extra space */
  wp = rp + 11;
  if (wp < rn * GMP_NUMB_BITS)
    wp = rn * GMP_NUMB_BITS;
  MPFR_ZIV_INIT (loop, wp);
  for (;;)
    {
      MPFR_TMP_MARK (marker);
      wn = LIMB_SIZE(wp);
      if (r == u || wn > rn) /* out of place, i.e., we cannot write to r */
        x = MPFR_TMP_LIMBS_ALLOC (wn);
      else
        x = MPFR_MANT(r);
      mpfr_mpn_rec_sqrt (x, wp, MPFR_MANT(u), up, s);
      /* If the input was not truncated, the error is at most one ulp;
         if the input was truncated, the error is at most two ulps
         (see algorithms.tex). */
      if (MPFR_LIKELY (mpfr_round_p (x, wn, wp - (wp < up),
                                     rp + (rnd_mode == MPFR_RNDN))))
        break;

      /* We detect only now the exact case where u=2^(2e), to avoid
         slowing down the average case. This can happen only when the
         mantissa is exactly 1/2 and the exponent is odd. */
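      /* e.g., u = 4 = 0.5 * 2^3: the exponent 3 is odd, so s = 0, the
         mantissa is 1/2, and 1/sqrt(4) = 1/2 is exact */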
      if (s == 0 && mpfr_cmp_ui_2exp (u, 1, MPFR_EXP(u) - 1) == 0)
        {
          mpfr_prec_t pl = wn * GMP_NUMB_BITS - wp;

          /* we should have x=111...111 */
          mpn_add_1 (x, x, wn, MPFR_LIMB_ONE << pl);
          x[wn - 1] = MPFR_LIMB_HIGHBIT;
          s += 2;
          break; /* go through */
        }
      MPFR_TMP_FREE(marker);

      MPFR_ZIV_NEXT (loop, wp);
    }
  MPFR_ZIV_FREE (loop);
  cy = mpfr_round_raw (MPFR_MANT(r), x, wp, 0, rp, rnd_mode, &inex);
  MPFR_EXP(r) = - (MPFR_EXP(u) - 1 - s) / 2;
  if (MPFR_UNLIKELY(cy != 0))
    {
      MPFR_EXP(r) ++;
      MPFR_MANT(r)[rn - 1] = MPFR_LIMB_HIGHBIT;
    }
  MPFR_TMP_FREE(marker);
  return mpfr_check_range (r, inex, rnd_mode);
}
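
/* Usage sketch (illustrative, not part of this file): computing an
   inverse square root with the public function above.

     #include <stdio.h>
     #include <mpfr.h>

     int main (void)
     {
       mpfr_t u, r;
       mpfr_inits2 (53, u, r, (mpfr_ptr) 0);
       mpfr_set_ui (u, 2, MPFR_RNDN);
       mpfr_rec_sqrt (r, u, MPFR_RNDN);           // r = 1/sqrt(2) to 53 bits
       mpfr_printf ("rec_sqrt(2) = %.15Rf\n", r); // 0.707106781186547...
       mpfr_clears (u, r, (mpfr_ptr) 0);
       return 0;
     }
*/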