(root)/
gmp-6.3.0/
mpn/
generic/
mu_divappr_q.c
       1  /* mpn_mu_divappr_q, mpn_preinv_mu_divappr_q.
       2  
       3     Compute Q = floor(N / D) + e.  N is nn limbs, D is dn limbs and must be
       4     normalized, and Q must be nn-dn limbs, 0 <= e <= 4.  The requirement that Q
       5     is nn-dn limbs (and not nn-dn+1 limbs) was put in place in order to allow us
       6     to let N be unmodified during the operation.
       7  
       8     Contributed to the GNU project by Torbjorn Granlund.
       9  
      10     THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
      11     SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
      12     GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.
      13  
      14  Copyright 2005-2007, 2009, 2010 Free Software Foundation, Inc.
      15  
      16  This file is part of the GNU MP Library.
      17  
      18  The GNU MP Library is free software; you can redistribute it and/or modify
      19  it under the terms of either:
      20  
      21    * the GNU Lesser General Public License as published by the Free
      22      Software Foundation; either version 3 of the License, or (at your
      23      option) any later version.
      24  
      25  or
      26  
      27    * the GNU General Public License as published by the Free Software
      28      Foundation; either version 2 of the License, or (at your option) any
      29      later version.
      30  
      31  or both in parallel, as here.
      32  
      33  The GNU MP Library is distributed in the hope that it will be useful, but
      34  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
      35  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
      36  for more details.
      37  
      38  You should have received copies of the GNU General Public License and the
      39  GNU Lesser General Public License along with the GNU MP Library.  If not,
      40  see https://www.gnu.org/licenses/.  */
      41  
      42  
      43  /*
      44     The idea of the algorithm used herein is to compute a smaller inverted value
      45     than used in the standard Barrett algorithm, and thus save time in the
      46     Newton iterations, and pay just a small price when using the inverted value
      47     for developing quotient bits.  This algorithm was presented at ICMS 2006.
      48  */
      49  
      50  /* CAUTION: This code and the code in mu_div_qr.c should be edited in sync.
      51  
      52   Things to work on:
      53  
      54    * The itch/scratch scheme isn't perhaps such a good idea as it once seemed,
      55      demonstrated by the fact that the mpn_invertappr function's scratch needs
      56      mean that we need to keep a large allocation long after it is needed.
      57      Things are worse as mpn_mul_fft does not accept any scratch parameter,
      58      which means we'll have a large memory hole while in mpn_mul_fft.  In
      59      general, a peak scratch need in the beginning of a function isn't
      60      well-handled by the itch/scratch scheme.
      61  */
      62  
      63  #ifdef STAT
      64  #undef STAT
      65  #define STAT(x) x
      66  #else
      67  #define STAT(x)
      68  #endif
      69  
      70  #include <stdlib.h>		/* for NULL */
      71  #include "gmp-impl.h"
      72  
      73  static mp_limb_t mpn_preinv_mu_divappr_q (mp_ptr, mp_srcptr, mp_size_t,
      74  			 mp_srcptr, mp_size_t, mp_srcptr, mp_size_t, mp_ptr);
      75  static mp_size_t mpn_mu_divappr_q_choose_in (mp_size_t, mp_size_t, int);
      76  
/* Main entry point: compute Q = floor(N/D) + e, 0 <= e <= 4, into
   {qp, nn-dn} (see the file header comment for the full contract).
   {np,nn} is the dividend, {dp,dn} the normalized divisor, dn > 1.
   Returns the high quotient limb qh.  `scratch' must provide
   mpn_mu_divappr_q_itch (nn, dn, 0) limbs.

   Strategy: possibly truncate the operands, compute an `in'-limb
   approximate inverse of the divisor's high limbs, then delegate the
   block-wise quotient development to mpn_preinv_mu_divappr_q.  */
mp_limb_t
mpn_mu_divappr_q (mp_ptr qp,
		  mp_srcptr np,
		  mp_size_t nn,
		  mp_srcptr dp,
		  mp_size_t dn,
		  mp_ptr scratch)
{
  mp_size_t qn, in;
  mp_limb_t cy, qh;
  mp_ptr ip, tp;		/* inverse area and temp area, both inside scratch */

  ASSERT (dn > 1);

  qn = nn - dn;

  /* If Q is smaller than D, truncate operands.  Only the high qn+1 divisor
     limbs (and the matching dividend limbs) are kept; the discarded low
     limbs can only affect the result within the allowed error e.  */
  if (qn + 1 < dn)
    {
      np += dn - (qn + 1);
      nn -= dn - (qn + 1);
      dp += dn - (qn + 1);
      dn = qn + 1;
    }

  /* Compute the inverse size.  */
  in = mpn_mu_divappr_q_choose_in (qn, dn, 0);
  ASSERT (in <= dn);

#if 1
  /* This alternative inverse computation method gets slightly more accurate
     results.  FIXMEs: (1) Temp allocation needs not analysed (2) itch function
     not adapted (3) mpn_invertappr scratch needs not met.  */
  ip = scratch;
  tp = scratch + in + 1;

  /* compute an approximate inverse on (in+1) limbs */
  if (dn == in)
    {
      /* Whole divisor fits: invert B*D + 1 (the extra low limb of 1 rounds
	 the inverted operand up, so the inverse errs on the low side), then
	 drop the low limb of the (in+1)-limb result.  */
      MPN_COPY (tp + 1, dp, in);
      tp[0] = 1;
      mpn_invertappr (ip, tp, in + 1, tp + in + 1);
      MPN_COPY_INCR (ip, ip + 1, in);
    }
  else
    {
      /* Invert the high in+1 divisor limbs plus 1 (round up as above).  A
	 carry out of the add means those limbs were all ones, in which case
	 the approximate inverse is zero apart from its implicit msb.  */
      cy = mpn_add_1 (tp, dp + dn - (in + 1), in + 1, 1);
      if (UNLIKELY (cy != 0))
	MPN_ZERO (ip, in);
      else
	{
	  mpn_invertappr (ip, tp, in + 1, tp + in + 1);
	  MPN_COPY_INCR (ip, ip + 1, in);
	}
    }
#else
  /* This older inverse computation method gets slightly worse results than the
     one above.  */
  ip = scratch;
  tp = scratch + in;

  /* Compute inverse of D to in+1 limbs, then round to 'in' limbs.  Ideally the
     inversion function should do this automatically.  */
  if (dn == in)
    {
      tp[in + 1] = 0;
      MPN_COPY (tp + in + 2, dp, in);
      mpn_invertappr (tp, tp + in + 1, in + 1, NULL);
    }
  else
    {
      mpn_invertappr (tp, dp + dn - (in + 1), in + 1, NULL);
    }
  cy = mpn_sub_1 (tp, tp, in + 1, GMP_NUMB_HIGHBIT);
  if (UNLIKELY (cy != 0))
    MPN_ZERO (tp + 1, in);
  MPN_COPY (ip, tp + 1, in);
#endif

  qh = mpn_preinv_mu_divappr_q (qp, np, nn, dp, dn, ip, in, scratch + in);

  return qh;
}
     160  
     161  static mp_limb_t
     162  mpn_preinv_mu_divappr_q (mp_ptr qp,
     163  			 mp_srcptr np,
     164  			 mp_size_t nn,
     165  			 mp_srcptr dp,
     166  			 mp_size_t dn,
     167  			 mp_srcptr ip,
     168  			 mp_size_t in,
     169  			 mp_ptr scratch)
     170  {
     171    mp_size_t qn;
     172    mp_limb_t cy, cx, qh;
     173    mp_limb_t r;
     174    mp_size_t tn, wn;
     175  
     176  #define rp           scratch
     177  #define tp           (scratch + dn)
     178  #define scratch_out  (scratch + dn + tn)
     179  
     180    qn = nn - dn;
     181  
     182    np += qn;
     183    qp += qn;
     184  
     185    qh = mpn_cmp (np, dp, dn) >= 0;
     186    if (qh != 0)
     187      mpn_sub_n (rp, np, dp, dn);
     188    else
     189      MPN_COPY (rp, np, dn);
     190  
     191    if (UNLIKELY (qn == 0))
     192      return qh;			/* Degenerate use.  Should we allow this? */
     193  
     194    for (;;) /* The exit condition (qn == 0) is verified in the loop. */
     195      {
     196        if (qn < in)
     197  	{
     198  	  ip += in - qn;
     199  	  in = qn;
     200  	}
     201        np -= in;
     202        qp -= in;
     203  
     204        /* Compute the next block of quotient limbs by multiplying the inverse I
     205  	 by the upper part of the partial remainder R.  */
     206        mpn_mul_n (tp, rp + dn - in, ip, in);		/* mulhi  */
     207        cy = mpn_add_n (qp, tp + in, rp + dn - in, in);	/* I's msb implicit */
     208        ASSERT_ALWAYS (cy == 0);
     209  
     210        qn -= in;
     211        if (qn == 0)
     212  	break;
     213  
     214        /* Compute the product of the quotient block and the divisor D, to be
     215  	 subtracted from the partial remainder combined with new limbs from the
     216  	 dividend N.  We only really need the low dn limbs.  */
     217  
     218        if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
     219  	mpn_mul (tp, dp, dn, qp, in);		/* dn+in limbs, high 'in' cancels */
     220        else
     221  	{
     222  	  tn = mpn_mulmod_bnm1_next_size (dn + 1);
     223  	  mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, in, scratch_out);
     224  	  wn = dn + in - tn;			/* number of wrapped limbs */
     225  	  if (wn > 0)
     226  	    {
     227  	      cy = mpn_sub_n (tp, tp, rp + dn - wn, wn);
     228  	      cy = mpn_sub_1 (tp + wn, tp + wn, tn - wn, cy);
     229  	      cx = mpn_cmp (rp + dn - in, tp + dn, tn - dn) < 0;
     230  	      ASSERT_ALWAYS (cx >= cy);
     231  	      mpn_incr_u (tp, cx - cy);
     232  	    }
     233  	}
     234  
     235        r = rp[dn - in] - tp[dn];
     236  
     237        /* Subtract the product from the partial remainder combined with new
     238  	 limbs from the dividend N, generating a new partial remainder R.  */
     239        if (dn != in)
     240  	{
     241  	  cy = mpn_sub_n (tp, np, tp, in);	/* get next 'in' limbs from N */
     242  	  cy = mpn_sub_nc (tp + in, rp, tp + in, dn - in, cy);
     243  	  MPN_COPY (rp, tp, dn);		/* FIXME: try to avoid this */
     244  	}
     245        else
     246  	{
     247  	  cy = mpn_sub_n (rp, np, tp, in);	/* get next 'in' limbs from N */
     248  	}
     249  
     250        STAT (int i; int err = 0;
     251  	    static int errarr[5]; static int err_rec; static int tot);
     252  
     253        /* Check the remainder R and adjust the quotient as needed.  */
     254        r -= cy;
     255        while (r != 0)
     256  	{
     257  	  /* We loop 0 times with about 69% probability, 1 time with about 31%
     258  	     probability, 2 times with about 0.6% probability, if inverse is
     259  	     computed as recommended.  */
     260  	  mpn_incr_u (qp, 1);
     261  	  cy = mpn_sub_n (rp, rp, dp, dn);
     262  	  r -= cy;
     263  	  STAT (err++);
     264  	}
     265        if (mpn_cmp (rp, dp, dn) >= 0)
     266  	{
     267  	  /* This is executed with about 76% probability.  */
     268  	  mpn_incr_u (qp, 1);
     269  	  cy = mpn_sub_n (rp, rp, dp, dn);
     270  	  STAT (err++);
     271  	}
     272  
     273        STAT (
     274  	    tot++;
     275  	    errarr[err]++;
     276  	    if (err > err_rec)
     277  	      err_rec = err;
     278  	    if (tot % 0x10000 == 0)
     279  	      {
     280  		for (i = 0; i <= err_rec; i++)
     281  		  printf ("  %d(%.1f%%)", errarr[i], 100.0*errarr[i]/tot);
     282  		printf ("\n");
     283  	      }
     284  	    );
     285      }
     286  
     287    /* FIXME: We should perhaps be somewhat more elegant in our rounding of the
     288       quotient.  For now, just make sure the returned quotient is >= the real
     289       quotient; add 3 with saturating arithmetic.  */
     290    qn = nn - dn;
     291    cy += mpn_add_1 (qp, qp, qn, 3);
     292    if (cy != 0)
     293      {
     294        if (qh != 0)
     295  	{
     296  	  /* Return a quotient of just 1-bits, with qh set.  */
     297  	  mp_size_t i;
     298  	  for (i = 0; i < qn; i++)
     299  	    qp[i] = GMP_NUMB_MAX;
     300  	}
     301        else
     302  	{
     303  	  /* Propagate carry into qh.  */
     304  	  qh = 1;
     305  	}
     306      }
     307  
     308    return qh;
     309  }
     310  
     311  /* In case k=0 (automatic choice), we distinguish 3 cases:
     312     (a) dn < qn:         in = ceil(qn / ceil(qn/dn))
     313     (b) dn/3 < qn <= dn: in = ceil(qn / 2)
     314     (c) qn < dn/3:       in = qn
     315     In all cases we have in <= dn.
     316   */
     317  static mp_size_t
     318  mpn_mu_divappr_q_choose_in (mp_size_t qn, mp_size_t dn, int k)
     319  {
     320    mp_size_t in;
     321  
     322    if (k == 0)
     323      {
     324        mp_size_t b;
     325        if (qn > dn)
     326  	{
     327  	  /* Compute an inverse size that is a nice partition of the quotient.  */
     328  	  b = (qn - 1) / dn + 1;	/* ceil(qn/dn), number of blocks */
     329  	  in = (qn - 1) / b + 1;	/* ceil(qn/b) = ceil(qn / ceil(qn/dn)) */
     330  	}
     331        else if (3 * qn > dn)
     332  	{
     333  	  in = (qn - 1) / 2 + 1;	/* b = 2 */
     334  	}
     335        else
     336  	{
     337  	  in = (qn - 1) / 1 + 1;	/* b = 1 */
     338  	}
     339      }
     340    else
     341      {
     342        mp_size_t xn;
     343        xn = MIN (dn, qn);
     344        in = (xn - 1) / k + 1;
     345      }
     346  
     347    return in;
     348  }
     349  
     350  mp_size_t
     351  mpn_mu_divappr_q_itch (mp_size_t nn, mp_size_t dn, int mua_k)
     352  {
     353    mp_size_t qn, in, itch_local, itch_out, itch_invapp;
     354  
     355    qn = nn - dn;
     356    if (qn + 1 < dn)
     357      {
     358        dn = qn + 1;
     359      }
     360    in = mpn_mu_divappr_q_choose_in (qn, dn, mua_k);
     361  
     362    itch_local = mpn_mulmod_bnm1_next_size (dn + 1);
     363    itch_out = mpn_mulmod_bnm1_itch (itch_local, dn, in);
     364    itch_invapp = mpn_invertappr_itch (in + 1) + in + 2; /* 3in + 4 */
     365  
     366    ASSERT (dn + itch_local + itch_out >= itch_invapp);
     367    return in + MAX (dn + itch_local + itch_out, itch_invapp);
     368  }