/* mpz_n_pow_ui -- mpn raised to ulong.

   THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY.  THEY'RE ALMOST
   CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN
   FUTURE GNU MP RELEASES.

Copyright 2001, 2002, 2005, 2012, 2015 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */

#include "gmp-impl.h"
#include "longlong.h"


/* Change this to "#define TRACE(x) x" for some traces. */
#define TRACE(x)


/* Use this to test the mul_2 code on a CPU without a native version of that
   routine.  */
#if 0
#define mpn_mul_2  refmpn_mul_2
#define HAVE_NATIVE_mpn_mul_2  1
#endif
/* mpz_pow_ui and mpz_ui_pow_ui want to share almost all of this code.
   ui_pow_ui doesn't need the mpn_mul based powering loop or the tests on
   bsize==2 or >2, but separating that isn't easy because there's shared
   code both before and after (the size calculations and the powers of 2
   handling).

   Alternatives:

   It would work to just use the mpn_mul powering loop for 1 and 2 limb
   bases, but the current separate loop allows mul_1 and mul_2 to be done
   in-place, which might help cache locality a bit.  If mpn_mul were relaxed
   to allow source==dest when vn==1 or 2 then some pointer twiddling might
   let us get the same effect in one loop.

   The initial powering for bsize==1 into blimb or blimb:blimb_low doesn't
   form the biggest possible power of b that fits, only the biggest
   power-of-2 power, ie. b^(2^n).  It'd be possible to choose a bigger
   power, perhaps using mp_bases[b].big_base for small b, and thereby get
   better value from mpn_mul_1 or mpn_mul_2 in the bignum powering.  It's
   felt that doing so would be more complicated than it's worth, and could
   well end up being a slowdown for small e.  For big e on the other hand
   the algorithm is dominated by mpn_sqr, so there wouldn't be much of a
   saving.  The current code can be viewed as simply doing the first few
   steps of the powering in a single or double limb where possible.

   If r==b, and blow_twos==0, and r must be realloc'ed, then the temporary
   copy made of b is unnecessary.  We could just use the old alloc'ed block
   and free it at the end.  But arranging this seems like a lot more trouble
   than it's worth.  */
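
/* For reference, the powering loops below all do plain left-to-right binary
   exponentiation.  The following sketch is illustrative only and is kept out
   of the build like the refmpn_mul_2 stub above; ordinary unsigned long
   arithmetic stands in for the mpn calls.  It shows the shape of the loops:
   square once per remaining bit of e, and multiply in the base when that bit
   is set.  */
#if 0
static unsigned long
ref_pow_ul (unsigned long b, unsigned long e)
{
  unsigned long  r = b;
  unsigned long  t = e;
  int  i = -1;

  /* i = position of the bit below the highest 1 of e (i == -1 for e == 1) */
  while (t > 1)
    {
      t >>= 1;
      i++;
    }

  for ( ; i >= 0; i--)
    {
      r = r * r;                /* the MPN_SQR step */
      if ((e >> i) & 1)
        r = r * b;              /* the MPN_MUL / MPN_MUL_1 / MPN_MUL_2 step */
    }
  return r;                     /* b^e, truncated to an unsigned long */
}
#endif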


/* floor(sqrt(GMP_NUMB_MAX)), ie. the biggest value that can be squared in
   a limb without overflowing.
   FIXME: This formula is an underestimate when GMP_NUMB_BITS is odd.  */

#define GMP_NUMB_HALFMAX  (((mp_limb_t) 1 << GMP_NUMB_BITS/2) - 1)
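
/* For instance, with GMP_NUMB_BITS == 32 and no nails this is 0xFFFF:
   0xFFFF * 0xFFFF = 0xFFFE0001 still fits in one limb, whereas
   0x10000 * 0x10000 = 2^32 would not.  */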


/* The following are for convenience, they update the size and check the
   alloc.  */

#define MPN_SQR(dst, alloc, src, size)          \
  do {                                          \
    ASSERT (2*(size) <= (alloc));               \
    mpn_sqr (dst, src, size);                   \
    (size) *= 2;                                \
    (size) -= ((dst)[(size)-1] == 0);           \
  } while (0)

#define MPN_MUL(dst, alloc, src, size, src2, size2)     \
  do {                                                  \
    mp_limb_t  cy;                                      \
    ASSERT ((size) + (size2) <= (alloc));               \
    cy = mpn_mul (dst, src, size, src2, size2);         \
    (size) += (size2) - (cy == 0);                      \
  } while (0)

#define MPN_MUL_2(ptr, size, alloc, mult)       \
  do {                                          \
    mp_limb_t  cy;                              \
    ASSERT ((size)+2 <= (alloc));               \
    cy = mpn_mul_2 (ptr, ptr, size, mult);      \
    (size)++;                                   \
    (ptr)[(size)] = cy;                         \
    (size) += (cy != 0);                        \
  } while (0)

#define MPN_MUL_1(ptr, size, alloc, limb)       \
  do {                                          \
    mp_limb_t  cy;                              \
    ASSERT ((size)+1 <= (alloc));               \
    cy = mpn_mul_1 (ptr, ptr, size, limb);      \
    (ptr)[size] = cy;                           \
    (size) += (cy != 0);                        \
  } while (0)

#define MPN_LSHIFT(ptr, size, alloc, shift)     \
  do {                                          \
    mp_limb_t  cy;                              \
    ASSERT ((size)+1 <= (alloc));               \
    cy = mpn_lshift (ptr, ptr, size, shift);    \
    (ptr)[size] = cy;                           \
    (size) += (cy != 0);                        \
  } while (0)

#define MPN_RSHIFT_OR_COPY(dst, src, size, shift)       \
  do {                                                  \
    if ((shift) == 0)                                   \
      MPN_COPY (dst, src, size);                        \
    else                                                \
      {                                                 \
        mpn_rshift (dst, src, size, shift);             \
        (size) -= ((dst)[(size)-1] == 0);               \
      }                                                 \
  } while (0)
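
/* A minimal usage sketch of the macros above (illustrative only, not
   compiled): each macro takes the size variable by name and leaves it
   holding the normalized size of the result, which is why the powering
   loops below never need a separate MPN_NORMALIZE.  */
#if 0
static void
example_macro_use (void)
{
  mp_limb_t  x[4] = { 123, 0, 0, 0 };
  mp_size_t  xsize = 1;
  mp_size_t  xalloc = 4;
  MPN_MUL_1 (x, xsize, xalloc, CNST_LIMB (10));  /* {x,xsize} = 1230 */
  MPN_LSHIFT (x, xsize, xalloc, 1);              /* {x,xsize} = 2460 */
}
#endif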


/* ralloc and talloc are only wanted for ASSERTs, after the initial space
   allocations.  Avoid writing values to them in a normal build, to ensure
   the compiler lets them go dead.  gcc already figures this out itself
   actually.  */

#define SWAP_RP_TP                                      \
  do {                                                  \
    MP_PTR_SWAP (rp, tp);                               \
    ASSERT_CODE (MP_SIZE_T_SWAP (ralloc, talloc));      \
  } while (0)


void
mpz_n_pow_ui (mpz_ptr r, mp_srcptr bp, mp_size_t bsize, unsigned long int e)
{
  mp_ptr rp;
  mp_size_t rtwos_limbs, ralloc, rsize;
  int rneg, i, cnt, btwos, r_bp_overlap;
  mp_limb_t blimb, rl;
  mp_bitcnt_t rtwos_bits;
#if HAVE_NATIVE_mpn_mul_2
  mp_limb_t blimb_low, rl_high;
#else
  mp_limb_t b_twolimbs[2];
#endif
  TMP_DECL;

  TRACE (printf ("mpz_n_pow_ui rp=0x%lX bp=0x%lX bsize=%ld e=%lu (0x%lX)\n",
                 PTR(r), bp, bsize, e, e);
         mpn_trace ("b", bp, bsize));

  ASSERT (bsize == 0 || bp[ABS(bsize)-1] != 0);
  ASSERT (MPN_SAME_OR_SEPARATE2_P (PTR(r), ALLOC(r), bp, ABS(bsize)));

  /* b^0 == 1, including 0^0 == 1 */
  if (e == 0)
    {
      MPZ_NEWALLOC (r, 1)[0] = 1;
      SIZ(r) = 1;
      return;
    }

  /* 0^e == 0 apart from 0^0 above */
  if (bsize == 0)
    {
      SIZ(r) = 0;
      return;
    }

  /* Sign of the final result. */
  rneg = (bsize < 0 && (e & 1) != 0);
  bsize = ABS (bsize);
  TRACE (printf ("rneg %d\n", rneg));

  r_bp_overlap = (PTR(r) == bp);

  /* Strip low zero limbs from b. */
  rtwos_limbs = 0;
  for (blimb = *bp; blimb == 0; blimb = *++bp)
    {
      rtwos_limbs += e;
      bsize--; ASSERT (bsize >= 1);
    }
  TRACE (printf ("trailing zero rtwos_limbs=%ld\n", rtwos_limbs));

  /* Strip low zero bits from b. */
  count_trailing_zeros (btwos, blimb);
  blimb >>= btwos;
  rtwos_bits = e * btwos;
  rtwos_limbs += rtwos_bits / GMP_NUMB_BITS;
  rtwos_bits %= GMP_NUMB_BITS;
  TRACE (printf ("trailing zero btwos=%d rtwos_limbs=%ld rtwos_bits=%lu\n",
                 btwos, rtwos_limbs, rtwos_bits));

  TMP_MARK;

  rl = 1;
#if HAVE_NATIVE_mpn_mul_2
  rl_high = 0;
#endif

  if (bsize == 1)
    {
    bsize_1:
      /* Power up as far as possible within blimb.  We start here with e!=0,
         but if e is small then we might reach e==0 and the whole b^e in rl.
         Notice this code works when blimb==1 too, reaching e==0.  */

      while (blimb <= GMP_NUMB_HALFMAX)
        {
          TRACE (printf ("small e=0x%lX blimb=0x%lX rl=0x%lX\n",
                         e, blimb, rl));
          ASSERT (e != 0);
          if ((e & 1) != 0)
            rl *= blimb;
          e >>= 1;
          if (e == 0)
            goto got_rl;
          blimb *= blimb;
        }

#if HAVE_NATIVE_mpn_mul_2
      TRACE (printf ("single power, e=0x%lX b=0x%lX rl=0x%lX\n",
                     e, blimb, rl));

      /* Can power b once more into blimb:blimb_low */
      bsize = 2;
      ASSERT (e != 0);
      if ((e & 1) != 0)
        {
          umul_ppmm (rl_high, rl, rl, blimb << GMP_NAIL_BITS);
          rl >>= GMP_NAIL_BITS;
        }
      e >>= 1;
      umul_ppmm (blimb, blimb_low, blimb, blimb << GMP_NAIL_BITS);
      blimb_low >>= GMP_NAIL_BITS;

    got_rl:
      TRACE (printf ("double power e=0x%lX blimb=0x%lX:0x%lX rl=0x%lX:%lX\n",
                     e, blimb, blimb_low, rl_high, rl));

      /* Combine left-over rtwos_bits into rl_high:rl to be handled by the
         final mul_1 or mul_2 rather than a separate lshift.
         - rl_high:rl mustn't be 1 (since then there's no final mul)
         - rl_high mustn't overflow
         - rl_high mustn't change to non-zero, since mul_1+lshift is
           probably faster than mul_2 (FIXME: is this true?)  */

      if (rtwos_bits != 0
          && ! (rl_high == 0 && rl == 1)
          && (rl_high >> (GMP_NUMB_BITS-rtwos_bits)) == 0)
        {
          mp_limb_t new_rl_high = (rl_high << rtwos_bits)
            | (rl >> (GMP_NUMB_BITS-rtwos_bits));
          if (! (rl_high == 0 && new_rl_high != 0))
            {
              rl_high = new_rl_high;
              rl <<= rtwos_bits;
              rtwos_bits = 0;
              TRACE (printf ("merged rtwos_bits, rl=0x%lX:%lX\n",
                             rl_high, rl));
            }
        }
#else
    got_rl:
      TRACE (printf ("small power e=0x%lX blimb=0x%lX rl=0x%lX\n",
                     e, blimb, rl));

      /* Combine left-over rtwos_bits into rl to be handled by the final
         mul_1 rather than a separate lshift.
         - rl mustn't be 1 (since then there's no final mul)
         - rl mustn't overflow  */

      if (rtwos_bits != 0
          && rl != 1
          && (rl >> (GMP_NUMB_BITS-rtwos_bits)) == 0)
        {
          rl <<= rtwos_bits;
          rtwos_bits = 0;
          TRACE (printf ("merged rtwos_bits, rl=0x%lX\n", rl));
        }
#endif
    }
  else if (bsize == 2)
    {
      mp_limb_t bsecond = bp[1];
      if (btwos != 0)
        blimb |= (bsecond << (GMP_NUMB_BITS - btwos)) & GMP_NUMB_MASK;
      bsecond >>= btwos;
      if (bsecond == 0)
        {
          /* Two limbs became one after rshift. */
          bsize = 1;
          goto bsize_1;
        }

      TRACE (printf ("bsize==2 using b=0x%lX:%lX", bsecond, blimb));
#if HAVE_NATIVE_mpn_mul_2
      blimb_low = blimb;
#else
      bp = b_twolimbs;
      b_twolimbs[0] = blimb;
      b_twolimbs[1] = bsecond;
#endif
      blimb = bsecond;
    }
  else
    {
      if (r_bp_overlap || btwos != 0)
        {
          mp_ptr tp = TMP_ALLOC_LIMBS (bsize);
          MPN_RSHIFT_OR_COPY (tp, bp, bsize, btwos);
          bp = tp;
          TRACE (printf ("rshift or copy bp,bsize, new bsize=%ld\n", bsize));
        }
#if HAVE_NATIVE_mpn_mul_2
      /* in case 3 limbs rshift to 2 and hence use the mul_2 loop below */
      blimb_low = bp[0];
#endif
      blimb = bp[bsize-1];

      TRACE (printf ("big bsize=%ld  ", bsize);
             mpn_trace ("b", bp, bsize));
    }

  /* At this point blimb is the most significant limb of the base to use.

     Each factor of b takes (bsize*BPML-cnt) bits and there's e of them; +1
     limb to round up the division; +1 for multiplies all using an extra
     limb over the true size; +2 for rl at the end; +1 for lshift at the
     end.

     The size calculation here is reasonably accurate.  The base is at least
     half a limb, so in 32 bits the worst case is 2^16+1 treated as 17 bits
     when it will power up as just over 16, an overestimate of 17/16, ie.
     6.25%.  For a 64-bit limb it's half that.

     If e==0 then blimb won't be anything useful (though it will be
     non-zero), but that doesn't matter since we just end up with ralloc==5,
     and that's fine for 2 limbs of rl and 1 of lshift.  */
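
  /* As a worked example of the formula below (illustrative figures only):
     with GMP_NUMB_BITS==32, no nails, bsize==1 and blimb==0x10001 (so
     cnt==15), e==100 gives ralloc = 17*100/32 + 5 = 58 limbs, against a
     true result (2^16+1)^100 of 1601 bits, ie. 51 limbs.  */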

  ASSERT (blimb != 0);
  count_leading_zeros (cnt, blimb);
  ralloc = (bsize*GMP_NUMB_BITS - cnt + GMP_NAIL_BITS) * e / GMP_NUMB_BITS + 5;
  TRACE (printf ("ralloc %ld, from bsize=%ld blimb=0x%lX cnt=%d\n",
                 ralloc, bsize, blimb, cnt));
  rp = MPZ_NEWALLOC (r, ralloc + rtwos_limbs);

  /* Low zero limbs resulting from powers of 2. */
  MPN_ZERO (rp, rtwos_limbs);
  rp += rtwos_limbs;

  if (e == 0)
    {
      /* Any e==0 other than via bsize==1 or bsize==2 is covered at the
         start. */
      rp[0] = rl;
      rsize = 1;
#if HAVE_NATIVE_mpn_mul_2
      rp[1] = rl_high;
      rsize += (rl_high != 0);
#endif
      ASSERT (rp[rsize-1] != 0);
    }
  else
    {
      mp_ptr tp;
      mp_size_t talloc;

      /* In the mpn_mul_1 or mpn_mul_2 loops or in the mpn_mul loop when the
         low bit of e is zero, tp only has to hold the second last power
         step, which is half the size of the final result.  There's no need
         to round up the divide by 2, since ralloc includes a +2 for rl
         which is not needed by tp.  In the mpn_mul loop when the low bit of
         e is 1, tp must hold nearly the full result, so just size it the
         same as rp.  */
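
      /* Concretely (in the mul_1/mul_2 loops, and the mpn_mul loop when the
         low bit of e is 0): the final squaring reads its roughly half-size
         operand out of the temporary block and writes the full-size result
         into r's block, so the temporary never needs more than about
         ralloc/2 limbs.  */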

      talloc = ralloc;
#if HAVE_NATIVE_mpn_mul_2
      if (bsize <= 2 || (e & 1) == 0)
        talloc /= 2;
#else
      if (bsize <= 1 || (e & 1) == 0)
        talloc /= 2;
#endif
      TRACE (printf ("talloc %ld\n", talloc));
      tp = TMP_ALLOC_LIMBS (talloc);

      /* Go from high to low over the bits of e, starting with i pointing at
         the bit below the highest 1 (which will mean i==-1 if e==1). */
      count_leading_zeros (cnt, (mp_limb_t) e);
      i = GMP_LIMB_BITS - cnt - 2;
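      /* For example e==6 (binary 110) gives i==1, so two passes of the
         loops below; e==1 gives i==-1 and the squaring loops are skipped
         entirely.  */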

#if HAVE_NATIVE_mpn_mul_2
      if (bsize <= 2)
        {
          mp_limb_t mult[2];

          /* Any bsize==1 will have been powered above to be two limbs. */
          ASSERT (bsize == 2);
          ASSERT (blimb != 0);

          /* Arrange the final result ends up in r, not in the temp space */
          if ((i & 1) == 0)
            SWAP_RP_TP;

          rp[0] = blimb_low;
          rp[1] = blimb;
          rsize = 2;

          mult[0] = blimb_low;
          mult[1] = blimb;

          for ( ; i >= 0; i--)
            {
              TRACE (printf ("mul_2 loop i=%d e=0x%lX, rsize=%ld ralloc=%ld talloc=%ld\n",
                             i, e, rsize, ralloc, talloc);
                     mpn_trace ("r", rp, rsize));

              MPN_SQR (tp, talloc, rp, rsize);
              SWAP_RP_TP;
              if ((e & (1L << i)) != 0)
                MPN_MUL_2 (rp, rsize, ralloc, mult);
            }

          TRACE (mpn_trace ("mul_2 before rl, r", rp, rsize));
          if (rl_high != 0)
            {
              mult[0] = rl;
              mult[1] = rl_high;
              MPN_MUL_2 (rp, rsize, ralloc, mult);
            }
          else if (rl != 1)
            MPN_MUL_1 (rp, rsize, ralloc, rl);
        }
#else
      if (bsize == 1)
        {
          /* Arrange the final result ends up in r, not in the temp space */
          if ((i & 1) == 0)
            SWAP_RP_TP;

          rp[0] = blimb;
          rsize = 1;

          for ( ; i >= 0; i--)
            {
              TRACE (printf ("mul_1 loop i=%d e=0x%lX, rsize=%ld ralloc=%ld talloc=%ld\n",
                             i, e, rsize, ralloc, talloc);
                     mpn_trace ("r", rp, rsize));

              MPN_SQR (tp, talloc, rp, rsize);
              SWAP_RP_TP;
              if ((e & (1L << i)) != 0)
                MPN_MUL_1 (rp, rsize, ralloc, blimb);
            }

          TRACE (mpn_trace ("mul_1 before rl, r", rp, rsize));
          if (rl != 1)
            MPN_MUL_1 (rp, rsize, ralloc, rl);
        }
#endif
      else
        {
          int parity;

          /* Arrange the final result ends up in r, not in the temp space */
          ULONG_PARITY (parity, e);
          if (((parity ^ i) & 1) != 0)
            SWAP_RP_TP;

          MPN_COPY (rp, bp, bsize);
          rsize = bsize;

          for ( ; i >= 0; i--)
            {
              TRACE (printf ("mul loop i=%d e=0x%lX, rsize=%ld ralloc=%ld talloc=%ld\n",
                             i, e, rsize, ralloc, talloc);
                     mpn_trace ("r", rp, rsize));

              MPN_SQR (tp, talloc, rp, rsize);
              SWAP_RP_TP;
              if ((e & (1L << i)) != 0)
                {
                  MPN_MUL (tp, talloc, rp, rsize, bp, bsize);
                  SWAP_RP_TP;
                }
            }
        }
    }

  ASSERT (rp == PTR(r) + rtwos_limbs);
  TRACE (mpn_trace ("end loop r", rp, rsize));
  TMP_FREE;

  /* Apply any partial limb factors of 2. */
  if (rtwos_bits != 0)
    {
      MPN_LSHIFT (rp, rsize, ralloc, (unsigned) rtwos_bits);
      TRACE (mpn_trace ("lshift r", rp, rsize));
    }

  rsize += rtwos_limbs;
  SIZ(r) = (rneg ? -rsize : rsize);
}