1  /* mpn_rootrem(rootp,remp,ap,an,nth) -- Compute the nth root of {ap,an}, and
       2     store the truncated integer part at rootp and the remainder at remp.
       3  
       4     Contributed by Paul Zimmermann (algorithm) and
       5     Paul Zimmermann and Torbjorn Granlund (implementation).
       6     Marco Bodrato wrote logbased_root to seed the loop.
       7  
       8     THE FUNCTIONS IN THIS FILE ARE INTERNAL, AND HAVE MUTABLE INTERFACES.  IT'S
       9     ONLY SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT'S ALMOST
      10     GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
      11  
      12  Copyright 2002, 2005, 2009-2012, 2015 Free Software Foundation, Inc.
      13  
      14  This file is part of the GNU MP Library.
      15  
      16  The GNU MP Library is free software; you can redistribute it and/or modify
      17  it under the terms of either:
      18  
      19    * the GNU Lesser General Public License as published by the Free
      20      Software Foundation; either version 3 of the License, or (at your
      21      option) any later version.
      22  
      23  or
      24  
      25    * the GNU General Public License as published by the Free Software
      26      Foundation; either version 2 of the License, or (at your option) any
      27      later version.
      28  
      29  or both in parallel, as here.
      30  
      31  The GNU MP Library is distributed in the hope that it will be useful, but
      32  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
      33  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
      34  for more details.
      35  
      36  You should have received copies of the GNU General Public License and the
      37  GNU Lesser General Public License along with the GNU MP Library.  If not,
      38  see https://www.gnu.org/licenses/.  */
      39  
      40  /* FIXME:
      41       This implementation is not optimal when remp == NULL, since the complexity
      42       is M(n), whereas it should be M(n/k) on average.
      43  */
      44  
      45  #include <stdio.h>		/* for NULL */
      46  
      47  #include "gmp-impl.h"
      48  #include "longlong.h"
      49  
      50  static mp_size_t mpn_rootrem_internal (mp_ptr, mp_ptr, mp_srcptr, mp_size_t,
      51  				       mp_limb_t, int);
      52  
      53  #define MPN_RSHIFT(rp,up,un,cnt) \
      54    do {									\
      55      if ((cnt) != 0)							\
      56        mpn_rshift (rp, up, un, cnt);					\
      57      else								\
      58        {									\
      59  	MPN_COPY_INCR (rp, up, un);					\
      60        }									\
      61    } while (0)
      62  
      63  #define MPN_LSHIFT(cy,rp,up,un,cnt) \
      64    do {									\
      65      if ((cnt) != 0)							\
      66        cy = mpn_lshift (rp, up, un, cnt);				\
      67      else								\
      68        {									\
      69  	MPN_COPY_DECR (rp, up, un);					\
      70  	cy = 0;								\
      71        }									\
      72    } while (0)
      73  
      74  
      75  /* Put in {rootp, ceil(un/k)} the kth root of {up, un}, rounded toward zero.
      76     If remp <> NULL, put in {remp, un} the remainder.
      77     Return the size (in limbs) of the remainder if remp <> NULL,
      78  	  or a non-zero value iff the remainder is non-zero when remp = NULL.
      79     Assumes:
      80     (a) up[un-1] is not zero
      81     (b) rootp has at least space for ceil(un/k) limbs
      82     (c) remp has at least space for un limbs (in case remp <> NULL)
      83     (d) the operands do not overlap.
      84  
      85     The auxiliary memory usage is 3*un+2 if remp = NULL,
      86     and 2*un+2 if remp <> NULL.  FIXME: This is an incorrect comment.
      87  */
      88  mp_size_t
      89  mpn_rootrem (mp_ptr rootp, mp_ptr remp,
      90  	     mp_srcptr up, mp_size_t un, mp_limb_t k)
      91  {
      92    ASSERT (un > 0);
      93    ASSERT (up[un - 1] != 0);
      94    ASSERT (k > 1);
      95  
      96    if (UNLIKELY (k == 2))
      97      return mpn_sqrtrem (rootp, remp, up, un);
      98    /* (un-1)/k > 2 <=> un > 3k <=> (un + 2)/3 > k */
      99    if (remp == NULL && (un + 2) / 3 > k)
     100      /* Pad {up,un} with k zero limbs.  This will produce an approximate root
     101         with one more limb, allowing us to compute the exact integral result. */
     102      {
     103        mp_ptr sp, wp;
     104        mp_size_t rn, sn, wn;
     105        TMP_DECL;
     106        TMP_MARK;
     107        wn = un + k;
     108        sn = (un - 1) / k + 2; /* ceil(un/k) + 1 */
     109        TMP_ALLOC_LIMBS_2 (wp, wn, /* will contain the padded input */
     110  			 sp, sn); /* approximate root of padded input */
     111        MPN_COPY (wp + k, up, un);
     112        MPN_FILL (wp, k, 0);
     113        rn = mpn_rootrem_internal (sp, NULL, wp, wn, k, 1);
     114        /* The approximate root S = {sp,sn} is either the correct root of
     115  	 {sp,sn}, or 1 too large.  Thus unless the least significant limb of
     116  	 S is 0 or 1, we can deduce the root of {up,un} is S truncated by one
     117  	 limb.  (In case sp[0]=1, we can deduce the root, but not decide
     118  	 whether it is exact or not.) */
     119        MPN_COPY (rootp, sp + 1, sn - 1);
     120        TMP_FREE;
     121        return rn;
     122      }
     123    else
     124      {
     125        return mpn_rootrem_internal (rootp, remp, up, un, k, 0);
     126      }
     127  }
     128  
     129  #define LOGROOT_USED_BITS 8
     130  #define LOGROOT_NEEDS_TWO_CORRECTIONS 1
     131  #define LOGROOT_RETURNED_BITS (LOGROOT_USED_BITS + LOGROOT_NEEDS_TWO_CORRECTIONS)
     132  /* Puts in *rootp some bits of the k^nt root of the number
     133     2^bitn * 1.op ; where op represents the "fractional" bits.
     134  
     135     The returned value is the number of bits of the root minus one;
     136     i.e. an approximation of the root will be
     137     (*rootp) * 2^(retval-LOGROOT_RETURNED_BITS+1).
     138  
     139     Currently, only LOGROOT_USED_BITS bits of op are used (the implicit
     140     one is not counted).
     141   */
     142  static unsigned
     143  logbased_root (mp_ptr rootp, mp_limb_t op, mp_bitcnt_t bitn, mp_limb_t k)
     144  {
     145    /* vlog=vector(256,i,floor((log(256+i)/log(2)-8)*256)-(i>255)) */
     146    static const
     147    unsigned char vlog[] = {1,   2,   4,   5,   7,   8,   9,  11,  12,  14,  15,  16,  18,  19,  21,  22,
     148  			 23,  25,  26,  27,  29,  30,  31,  33,  34,  35,  37,  38,  39,  40,  42,  43,
     149  			 44,  46,  47,  48,  49,  51,  52,  53,  54,  56,  57,  58,  59,  61,  62,  63,
     150  			 64,  65,  67,  68,  69,  70,  71,  73,  74,  75,  76,  77,  78,  80,  81,  82,
     151  			 83,  84,  85,  87,  88,  89,  90,  91,  92,  93,  94,  96,  97,  98,  99, 100,
     152  			101, 102, 103, 104, 105, 106, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117,
     153  			118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 131, 132, 133, 134,
     154  			135, 136, 137, 138, 139, 140, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
     155  			150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 162, 163, 164,
     156  			165, 166, 167, 168, 169, 170, 171, 172, 173, 173, 174, 175, 176, 177, 178, 179,
     157  			180, 181, 181, 182, 183, 184, 185, 186, 187, 188, 188, 189, 190, 191, 192, 193,
     158  			194, 194, 195, 196, 197, 198, 199, 200, 200, 201, 202, 203, 204, 205, 205, 206,
     159  			207, 208, 209, 209, 210, 211, 212, 213, 214, 214, 215, 216, 217, 218, 218, 219,
     160  			220, 221, 222, 222, 223, 224, 225, 225, 226, 227, 228, 229, 229, 230, 231, 232,
     161  			232, 233, 234, 235, 235, 236, 237, 238, 239, 239, 240, 241, 242, 242, 243, 244,
     162  			245, 245, 246, 247, 247, 248, 249, 250, 250, 251, 252, 253, 253, 254, 255, 255};
     163  
     164    /* vexp=vector(256,i,floor(2^(8+i/256)-256)-(i>255)) */
     165    static const
     166    unsigned char vexp[] = {0,   1,   2,   2,   3,   4,   4,   5,   6,   7,   7,   8,   9,   9,  10,  11,
     167  			 12,  12,  13,  14,  14,  15,  16,  17,  17,  18,  19,  20,  20,  21,  22,  23,
     168  			 23,  24,  25,  26,  26,  27,  28,  29,  30,  30,  31,  32,  33,  33,  34,  35,
     169  			 36,  37,  37,  38,  39,  40,  41,  41,  42,  43,  44,  45,  45,  46,  47,  48,
     170  			 49,  50,  50,  51,  52,  53,  54,  55,  55,  56,  57,  58,  59,  60,  61,  61,
     171  			 62,  63,  64,  65,  66,  67,  67,  68,  69,  70,  71,  72,  73,  74,  75,  75,
     172  			 76,  77,  78,  79,  80,  81,  82,  83,  84,  85,  86,  86,  87,  88,  89,  90,
     173  			 91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103, 104, 105, 106,
     174  			107, 108, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 119, 120, 121, 122,
     175  			123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138,
     176  			139, 140, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 154, 155, 156,
     177  			157, 158, 159, 160, 161, 163, 164, 165, 166, 167, 168, 169, 171, 172, 173, 174,
     178  			175, 176, 178, 179, 180, 181, 182, 183, 185, 186, 187, 188, 189, 191, 192, 193,
     179  			194, 196, 197, 198, 199, 200, 202, 203, 204, 205, 207, 208, 209, 210, 212, 213,
     180  			214, 216, 217, 218, 219, 221, 222, 223, 225, 226, 227, 229, 230, 231, 232, 234,
     181  			235, 236, 238, 239, 240, 242, 243, 245, 246, 247, 249, 250, 251, 253, 254, 255};
     182    mp_bitcnt_t retval;
     183  
     184    if (UNLIKELY (bitn > (~ (mp_bitcnt_t) 0) >> LOGROOT_USED_BITS))
     185      {
     186        /* In the unlikely case, we use two divisions and a modulo. */
     187        retval = bitn / k;
     188        bitn %= k;
     189        bitn = (bitn << LOGROOT_USED_BITS |
     190  	      vlog[op >> (GMP_NUMB_BITS - LOGROOT_USED_BITS)]) / k;
     191      }
     192    else
     193      {
     194        bitn = (bitn << LOGROOT_USED_BITS |
     195  	      vlog[op >> (GMP_NUMB_BITS - LOGROOT_USED_BITS)]) / k;
     196        retval = bitn >> LOGROOT_USED_BITS;
     197        bitn &= (CNST_LIMB (1) << LOGROOT_USED_BITS) - 1;
     198      }
     199    ASSERT(bitn < CNST_LIMB (1) << LOGROOT_USED_BITS);
     200    *rootp = CNST_LIMB(1) << (LOGROOT_USED_BITS - ! LOGROOT_NEEDS_TWO_CORRECTIONS)
     201      | vexp[bitn] >> ! LOGROOT_NEEDS_TWO_CORRECTIONS;
     202    return retval;
     203  }
     204  
     205  /* if approx is non-zero, does not compute the final remainder */
     206  static mp_size_t
     207  mpn_rootrem_internal (mp_ptr rootp, mp_ptr remp, mp_srcptr up, mp_size_t un,
     208  		      mp_limb_t k, int approx)
     209  {
     210    mp_ptr qp, rp, sp, wp, scratch;
     211    mp_size_t qn, rn, sn, wn, nl, bn;
     212    mp_limb_t save, save2, cy, uh;
     213    mp_bitcnt_t unb; /* number of significant bits of {up,un} */
     214    mp_bitcnt_t xnb; /* number of significant bits of the result */
     215    mp_bitcnt_t b, kk;
     216    mp_bitcnt_t sizes[GMP_NUMB_BITS + 1];
     217    int ni;
     218    int perf_pow;
     219    unsigned ulz, snb, c, logk;
     220    TMP_DECL;
     221  
     222    /* MPN_SIZEINBASE_2EXP(unb, up, un, 1); --unb; */
     223    uh = up[un - 1];
     224    count_leading_zeros (ulz, uh);
     225    ulz = ulz - GMP_NAIL_BITS + 1; /* Ignore the first 1. */
     226    unb = (mp_bitcnt_t) un * GMP_NUMB_BITS - ulz;
     227    /* unb is the (truncated) logarithm of the input U in base 2*/
     228  
     229    if (unb < k) /* root is 1 */
     230      {
     231        rootp[0] = 1;
     232        if (remp == NULL)
     233  	un -= (*up == CNST_LIMB (1)); /* Non-zero iif {up,un} > 1 */
     234        else
     235  	{
     236  	  mpn_sub_1 (remp, up, un, CNST_LIMB (1));
     237  	  un -= (remp [un - 1] == 0);	/* There should be at most one zero limb,
     238  				   if we demand u to be normalized  */
     239  	}
     240        return un;
     241      }
     242    /* if (unb - k < k/2 + k/16) // root is 2 */
     243  
     244    if (ulz == GMP_NUMB_BITS)
     245      uh = up[un - 2];
     246    else
     247      uh = (uh << ulz & GMP_NUMB_MASK) | up[un - 1 - (un != 1)] >> (GMP_NUMB_BITS - ulz);
     248    ASSERT (un != 1 || up[un - 1 - (un != 1)] >> (GMP_NUMB_BITS - ulz) == 1);
     249  
     250    xnb = logbased_root (rootp, uh, unb, k);
     251    snb = LOGROOT_RETURNED_BITS - 1;
     252    /* xnb+1 is the number of bits of the root R */
     253    /* snb+1 is the number of bits of the current approximation S */
     254  
     255    kk = k * xnb;		/* number of truncated bits in the input */
     256  
     257    /* FIXME: Should we skip the next two loops when xnb <= snb ? */
     258    for (uh = (k - 1) / 2, logk = 3; (uh >>= 1) != 0; ++logk )
     259      ;
     260    /* logk = ceil(log(k)/log(2)) + 1 */
     261  
     262    /* xnb is the number of remaining bits to determine in the kth root */
     263    for (ni = 0; (sizes[ni] = xnb) > snb; ++ni)
     264      {
     265        /* invariant: here we want xnb+1 total bits for the kth root */
     266  
     267        /* if c is the new value of xnb, this means that we'll go from a
     268  	 root of c+1 bits (say s') to a root of xnb+1 bits.
     269  	 It is proved in the book "Modern Computer Arithmetic" by Brent
     270  	 and Zimmermann, Chapter 1, that
     271  	 if s' >= k*beta, then at most one correction is necessary.
     272  	 Here beta = 2^(xnb-c), and s' >= 2^c, thus it suffices that
     273  	 c >= ceil((xnb + log2(k))/2). */
     274        if (xnb > logk)
     275  	xnb = (xnb + logk) / 2;
     276        else
     277  	--xnb;	/* add just one bit at a time */
     278      }
     279  
     280    *rootp >>= snb - xnb;
     281    kk -= xnb;
     282  
     283    ASSERT_ALWAYS (ni < GMP_NUMB_BITS + 1);
     284    /* We have sizes[0] = b > sizes[1] > ... > sizes[ni] = 0 with
     285       sizes[i] <= 2 * sizes[i+1].
     286       Newton iteration will first compute sizes[ni-1] extra bits,
     287       then sizes[ni-2], ..., then sizes[0] = b. */
     288  
     289    TMP_MARK;
     290    /* qp and wp need enough space to store S'^k where S' is an approximate
     291       root. Since S' can be as large as S+2, the worst case is when S=2 and
     292       S'=4. But then since we know the number of bits of S in advance, S'
     293       can only be 3 at most. Similarly for S=4, then S' can be 6 at most.
     294       So the worst case is S'/S=3/2, thus S'^k <= (3/2)^k * S^k. Since S^k
     295       fits in un limbs, the number of extra limbs needed is bounded by
     296       ceil(k*log2(3/2)/GMP_NUMB_BITS). */
     297    /* THINK: with the use of logbased_root, maybe the constant is
     298       258/256 instead of 3/2 ? log2(258/256) < 1/89 < 1/64 */
     299  #define EXTRA 2 + (mp_size_t) (0.585 * (double) k / (double) GMP_NUMB_BITS)
     300    TMP_ALLOC_LIMBS_3 (scratch, un + 1, /* used by mpn_div_q */
     301  		     qp, un + EXTRA,  /* will contain quotient and remainder
     302  					 of R/(k*S^(k-1)), and S^k */
     303  		     wp, un + EXTRA); /* will contain S^(k-1), k*S^(k-1),
     304  					 and temporary for mpn_pow_1 */
     305  
     306    if (remp == NULL)
     307      rp = scratch;	/* will contain the remainder */
     308    else
     309      rp = remp;
     310    sp = rootp;
     311  
     312    sn = 1;		/* Initial approximation has one limb */
     313  
     314    for (b = xnb; ni != 0; --ni)
     315      {
     316        /* 1: loop invariant:
     317  	 {sp, sn} is the current approximation of the root, which has
     318  		  exactly 1 + sizes[ni] bits.
     319  	 {rp, rn} is the current remainder
     320  	 {wp, wn} = {sp, sn}^(k-1)
     321  	 kk = number of truncated bits of the input
     322        */
     323  
     324        /* Since each iteration treats b bits from the root and thus k*b bits
     325  	 from the input, and we already considered b bits from the input,
     326  	 we now have to take another (k-1)*b bits from the input. */
     327        kk -= (k - 1) * b; /* remaining input bits */
     328        /* {rp, rn} = floor({up, un} / 2^kk) */
     329        rn = un - kk / GMP_NUMB_BITS;
     330        MPN_RSHIFT (rp, up + kk / GMP_NUMB_BITS, rn, kk % GMP_NUMB_BITS);
     331        rn -= rp[rn - 1] == 0;
     332  
     333        /* 9: current buffers: {sp,sn}, {rp,rn} */
     334  
     335        for (c = 0;; c++)
     336  	{
     337  	  /* Compute S^k in {qp,qn}. */
     338  	  /* W <- S^(k-1) for the next iteration,
     339  	     and S^k = W * S. */
     340  	  wn = mpn_pow_1 (wp, sp, sn, k - 1, qp);
     341  	  mpn_mul (qp, wp, wn, sp, sn);
     342  	  qn = wn + sn;
     343  	  qn -= qp[qn - 1] == 0;
     344  
     345  	  perf_pow = 1;
     346  	  /* if S^k > floor(U/2^kk), the root approximation was too large */
     347  	  if (qn > rn || (qn == rn && (perf_pow=mpn_cmp (qp, rp, rn)) > 0))
     348  	    MPN_DECR_U (sp, sn, 1);
     349  	  else
     350  	    break;
     351  	}
     352  
     353        /* 10: current buffers: {sp,sn}, {rp,rn}, {qp,qn}, {wp,wn} */
     354  
     355        /* sometimes two corrections are needed with logbased_root*/
     356        ASSERT (c <= 1 + LOGROOT_NEEDS_TWO_CORRECTIONS);
     357        ASSERT_ALWAYS (rn >= qn);
     358  
     359        b = sizes[ni - 1] - sizes[ni]; /* number of bits to compute in the
     360  				      next iteration */
     361        bn = b / GMP_NUMB_BITS; /* lowest limb from high part of rp[], after shift */
     362  
     363        kk = kk - b;
     364        /* nl is the number of limbs in U which contain bits [kk,kk+b-1] */
     365        nl = 1 + (kk + b - 1) / GMP_NUMB_BITS - (kk / GMP_NUMB_BITS);
     366        /* nl  = 1 + floor((kk + b - 1) / GMP_NUMB_BITS)
     367  		 - floor(kk / GMP_NUMB_BITS)
     368  	     <= 1 + (kk + b - 1) / GMP_NUMB_BITS
     369  		  - (kk - GMP_NUMB_BITS + 1) / GMP_NUMB_BITS
     370  	     = 2 + (b - 2) / GMP_NUMB_BITS
     371  	 thus since nl is an integer:
     372  	 nl <= 2 + floor(b/GMP_NUMB_BITS) <= 2 + bn. */
     373  
     374        /* 11: current buffers: {sp,sn}, {rp,rn}, {wp,wn} */
     375  
     376        /* R = R - Q = floor(U/2^kk) - S^k */
     377        if (perf_pow != 0)
     378  	{
     379  	  mpn_sub (rp, rp, rn, qp, qn);
     380  	  MPN_NORMALIZE_NOT_ZERO (rp, rn);
     381  
     382  	  /* first multiply the remainder by 2^b */
     383  	  MPN_LSHIFT (cy, rp + bn, rp, rn, b % GMP_NUMB_BITS);
     384  	  rn = rn + bn;
     385  	  if (cy != 0)
     386  	    {
     387  	      rp[rn] = cy;
     388  	      rn++;
     389  	    }
     390  
     391  	  save = rp[bn];
     392  	  /* we have to save rp[bn] up to rp[nl-1], i.e. 1 or 2 limbs */
     393  	  if (nl - 1 > bn)
     394  	    save2 = rp[bn + 1];
     395  	}
     396        else
     397  	{
     398  	  rn = bn;
     399  	  save2 = save = 0;
     400  	}
     401        /* 2: current buffers: {sp,sn}, {rp,rn}, {wp,wn} */
     402  
     403        /* Now insert bits [kk,kk+b-1] from the input U */
     404        MPN_RSHIFT (rp, up + kk / GMP_NUMB_BITS, nl, kk % GMP_NUMB_BITS);
     405        /* set to zero high bits of rp[bn] */
     406        rp[bn] &= (CNST_LIMB (1) << (b % GMP_NUMB_BITS)) - 1;
     407        /* restore corresponding bits */
     408        rp[bn] |= save;
     409        if (nl - 1 > bn)
     410  	rp[bn + 1] = save2; /* the low b bits go in rp[0..bn] only, since
     411  			       they start by bit 0 in rp[0], so they use
     412  			       at most ceil(b/GMP_NUMB_BITS) limbs */
     413        /* FIXME: Should we normalise {rp,rn} here ?*/
     414  
     415        /* 3: current buffers: {sp,sn}, {rp,rn}, {wp,wn} */
     416  
     417        /* compute {wp, wn} = k * {sp, sn}^(k-1) */
     418        cy = mpn_mul_1 (wp, wp, wn, k);
     419        wp[wn] = cy;
     420        wn += cy != 0;
     421  
     422        /* 6: current buffers: {sp,sn}, {qp,qn} */
     423  
     424        /* multiply the root approximation by 2^b */
     425        MPN_LSHIFT (cy, sp + b / GMP_NUMB_BITS, sp, sn, b % GMP_NUMB_BITS);
     426        sn = sn + b / GMP_NUMB_BITS;
     427        if (cy != 0)
     428  	{
     429  	  sp[sn] = cy;
     430  	  sn++;
     431  	}
     432  
     433        save = sp[b / GMP_NUMB_BITS];
     434  
     435        /* Number of limbs used by b bits, when least significant bit is
     436  	 aligned to least limb */
     437        bn = (b - 1) / GMP_NUMB_BITS + 1;
     438  
     439        /* 4: current buffers: {sp,sn}, {rp,rn}, {wp,wn} */
     440  
     441        /* now divide {rp, rn} by {wp, wn} to get the low part of the root */
     442        if (UNLIKELY (rn < wn))
     443  	{
     444  	  MPN_FILL (sp, bn, 0);
     445  	}
     446        else
     447  	{
     448  	  qn = rn - wn; /* expected quotient size */
     449  	  if (qn <= bn) { /* Divide only if result is not too big. */
     450  	    mpn_div_q (qp, rp, rn, wp, wn, scratch);
     451  	    qn += qp[qn] != 0;
     452  	  }
     453  
     454        /* 5: current buffers: {sp,sn}, {qp,qn}.
     455  	 Note: {rp,rn} is not needed any more since we'll compute it from
     456  	 scratch at the end of the loop.
     457         */
     458  
     459        /* the quotient should be smaller than 2^b, since the previous
     460  	 approximation was correctly rounded toward zero */
     461  	  if (qn > bn || (qn == bn && (b % GMP_NUMB_BITS != 0) &&
     462  			  qp[qn - 1] >= (CNST_LIMB (1) << (b % GMP_NUMB_BITS))))
     463  	    {
     464  	      for (qn = 1; qn < bn; ++qn)
     465  		sp[qn - 1] = GMP_NUMB_MAX;
     466  	      sp[qn - 1] = GMP_NUMB_MAX >> (GMP_NUMB_BITS - 1 - ((b - 1) % GMP_NUMB_BITS));
     467  	    }
     468  	  else
     469  	    {
     470        /* 7: current buffers: {sp,sn}, {qp,qn} */
     471  
     472        /* Combine sB and q to form sB + q.  */
     473  	      MPN_COPY (sp, qp, qn);
     474  	      MPN_ZERO (sp + qn, bn - qn);
     475  	    }
     476  	}
     477        sp[b / GMP_NUMB_BITS] |= save;
     478  
     479        /* 8: current buffer: {sp,sn} */
     480  
     481      }
     482  
     483    /* otherwise we have rn > 0, thus the return value is ok */
     484    if (!approx || sp[0] <= CNST_LIMB (1))
     485      {
     486        for (c = 0;; c++)
     487  	{
     488  	  /* Compute S^k in {qp,qn}. */
     489  	  /* Last iteration: we don't need W anymore. */
     490  	  /* mpn_pow_1 requires that both qp and wp have enough
     491  	     space to store the result {sp,sn}^k + 1 limb */
     492  	  qn = mpn_pow_1 (qp, sp, sn, k, wp);
     493  
     494  	  perf_pow = 1;
     495  	  if (qn > un || (qn == un && (perf_pow=mpn_cmp (qp, up, un)) > 0))
     496  	    MPN_DECR_U (sp, sn, 1);
     497  	  else
     498  	    break;
     499  	};
     500  
     501        /* sometimes two corrections are needed with logbased_root*/
     502        ASSERT (c <= 1 + LOGROOT_NEEDS_TWO_CORRECTIONS);
     503  
     504        rn = perf_pow != 0;
     505        if (rn != 0 && remp != NULL)
     506  	{
     507  	  mpn_sub (remp, up, un, qp, qn);
     508  	  rn = un;
     509  	  MPN_NORMALIZE_NOT_ZERO (remp, rn);
     510  	}
     511      }
     512  
     513    TMP_FREE;
     514    return rn;
     515  }