(root)/
gmp-6.3.0/
mpn/
generic/
divexact.c
       1  /* mpn_divexact(qp,np,nn,dp,dn,tp) -- Divide N = {np,nn} by D = {dp,dn} storing
       2     the result in Q = {qp,nn-dn+1} expecting no remainder.  Overlap allowed
       3     between Q and N; all other overlap disallowed.
       4  
       5     Contributed to the GNU project by Torbjorn Granlund.
       6  
       7     THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
       8     SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
       9     GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.
      10  
      11  Copyright 2006, 2007, 2009, 2017 Free Software Foundation, Inc.
      12  
      13  This file is part of the GNU MP Library.
      14  
      15  The GNU MP Library is free software; you can redistribute it and/or modify
      16  it under the terms of either:
      17  
      18    * the GNU Lesser General Public License as published by the Free
      19      Software Foundation; either version 3 of the License, or (at your
      20      option) any later version.
      21  
      22  or
      23  
      24    * the GNU General Public License as published by the Free Software
      25      Foundation; either version 2 of the License, or (at your option) any
      26      later version.
      27  
      28  or both in parallel, as here.
      29  
      30  The GNU MP Library is distributed in the hope that it will be useful, but
      31  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
      32  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
      33  for more details.
      34  
      35  You should have received copies of the GNU General Public License and the
      36  GNU Lesser General Public License along with the GNU MP Library.  If not,
      37  see https://www.gnu.org/licenses/.  */
      38  
      39  
      40  #include "gmp-impl.h"
      41  #include "longlong.h"
      42  
      43  #if 1
/* Exact division: compute Q = {qp, nn-dn+1} = N / D for N = {np,nn},
   D = {dp,dn}, under the precondition that the remainder is zero.  The
   work is done with 2-adic (Hensel / bdiv) division, which builds the
   quotient from the low end and needs no correction steps when the
   division is exact.  */
void
mpn_divexact (mp_ptr qp,
	      mp_srcptr np, mp_size_t nn,
	      mp_srcptr dp, mp_size_t dn)
{
  unsigned shift;
  mp_size_t qn;			/* quotient size, nn + 1 - dn */
  mp_ptr tp;
  TMP_DECL;

  ASSERT (dn > 0);
  ASSERT (nn >= dn);
  ASSERT (dp[dn-1] > 0);

  /* Strip low zero limbs from D.  Since the division is exact, N is
     divisible by the same power of B, so the corresponding low limbs of
     N must be zero too (checked by the ASSERT) and are stripped along
     with them.  The quotient is unchanged.  */
  while (dp[0] == 0)
    {
      ASSERT (np[0] == 0);
      dp++;
      np++;
      dn--;
      nn--;
    }

  /* Single-limb divisor: dedicated divide-by-limb code is faster.  */
  if (dn == 1)
    {
      MPN_DIVREM_OR_DIVEXACT_1 (qp, np, nn, dp[0]);
      return;
    }

  TMP_MARK;

  qn = nn + 1 - dn;
  count_trailing_zeros (shift, dp[0]);

  if (shift > 0)
    {
      /* Shift out the low zero bits common to N and D (again valid only
	 because the division is exact).  This makes the divisor's low
	 limb odd, i.e. invertible mod B, as the 2-adic division
	 requires.  */
      mp_ptr wp;
      mp_size_t ss;
      /* Only divisor limbs that can influence the low qn quotient limbs
	 need shifting; when dn > qn we shift qn + 1 limbs so that the
	 low qn shifted limbs (all that is used below) are complete.  */
      ss = (dn > qn) ? qn + 1 : dn;

      tp = TMP_ALLOC_LIMBS (ss);
      mpn_rshift (tp, dp, ss, shift);
      dp = tp;

      /* Since we have excluded dn == 1, we have nn > qn, and we need
	 to shift one limb beyond qn. */
      wp = TMP_ALLOC_LIMBS (qn + 1);
      mpn_rshift (wp, np, qn + 1, shift);
      np = wp;
    }

  /* The quotient is computed mod B^qn, so only the low qn limbs of the
     divisor take part in the computation.  */
  if (dn > qn)
    dn = qn;

  tp = TMP_ALLOC_LIMBS (mpn_bdiv_q_itch (qn, dn));
  mpn_bdiv_q (qp, np, qn, dp, dn, tp);
  TMP_FREE;

  /* Since bdiv_q computes -N/D (mod B^{qn}), we must negate now. */
  mpn_neg (qp, qp, qn);
}
     105  
     106  #else
     107  
      108  /* We use Jebelean's bidirectional exact division algorithm.  This is
     109     somewhat naively implemented, with equal quotient parts done by 2-adic
     110     division and truncating division.  Since 2-adic division is faster, it
     111     should be used for a larger chunk.
     112  
     113     This code is horrendously ugly, in all sorts of ways.
     114  
     115     * It was hacked without much care or thought, but with a testing program.
     116     * It handles scratch space frivolously, and furthermore the itch function
     117       is broken.
     118     * Doesn't provide any measures to deal with mu_divappr_q's +3 error.  We
     119       have yet to provoke an error due to this, though.
     120     * Algorithm selection leaves a lot to be desired.  In particular, the choice
     121       between DC and MU isn't a point, but we treat it like one.
     122     * It makes the msb part 1 or 2 limbs larger than the lsb part, in spite of
     123       that the latter is faster.  We should at least reverse this, but perhaps
     124       we should make the lsb part considerably larger.  (How do we tune this?)
     125  */
     126  
     127  mp_size_t
     128  mpn_divexact_itch (mp_size_t nn, mp_size_t dn)
     129  {
     130    return nn + dn;		/* FIXME this is not right */
     131  }
     132  
/* Jebelean bidirectional exact division (disabled variant, see the #if
   above).  The low half of the quotient is computed by exact 2-adic
   (bdiv) division from the low limbs of N, the high half by an
   approximate truncating division (divappr) from the high limbs; the
   two halves overlap in at least one quotient limb, and that overlap is
   used at the end to correct the approximate high part.  */
void
mpn_divexact (mp_ptr qp,
	      mp_srcptr np, mp_size_t nn,
	      mp_srcptr dp, mp_size_t dn,
	      mp_ptr scratch)
{
  mp_size_t qn;			/* total quotient size */
  mp_size_t nn0, qn0;		/* dividend/quotient sizes, low (bdiv) part */
  mp_size_t nn1, qn1;		/* dividend/divisor sizes, high (divappr) part */
  mp_ptr tp;
  mp_limb_t qml;		/* saved overlap quotient limb */
  mp_limb_t qh;
  int cnt;
  mp_ptr xdp;			/* normalized (left-shifted) divisor */
  mp_limb_t di;			/* -1/dp[0] mod B, for the bdiv functions */
  mp_limb_t cy;
  gmp_pi1_t dinv;
  TMP_DECL;

  TMP_MARK;

  qn = nn - dn + 1;

  /* For small divisors, and small quotients, don't use Jebelean's algorithm. */
  if (dn < DIVEXACT_JEB_THRESHOLD || qn < DIVEXACT_JEB_THRESHOLD)
    {
      tp = scratch;
      MPN_COPY (tp, np, qn);
      binvert_limb (di, dp[0]);  di = -di;
      dn = MIN (dn, qn);
      mpn_sbpi1_bdiv_q (qp, tp, qn, dp, dn, di);
      TMP_FREE;
      return;
    }

  qn0 = ((nn - dn) >> 1) + 1;	/* low quotient size */

  /* If quotient is much larger than the divisor, the bidirectional algorithm
     does not work as currently implemented.  Fall back to plain bdiv.  */
  if (qn0 > dn)
    {
      if (BELOW_THRESHOLD (dn, DC_BDIV_Q_THRESHOLD))
	{
	  tp = scratch;
	  MPN_COPY (tp, np, qn);
	  binvert_limb (di, dp[0]);  di = -di;
	  dn = MIN (dn, qn);
	  mpn_sbpi1_bdiv_q (qp, tp, qn, dp, dn, di);
	}
      else if (BELOW_THRESHOLD (dn, MU_BDIV_Q_THRESHOLD))
	{
	  tp = scratch;
	  MPN_COPY (tp, np, qn);
	  binvert_limb (di, dp[0]);  di = -di;
	  mpn_dcpi1_bdiv_q (qp, tp, qn, dp, dn, di);
	}
      else
	{
	  mpn_mu_bdiv_q (qp, np, qn, dp, dn, scratch);
	}
      TMP_FREE;
      return;
    }

  nn0 = qn0 + qn0;		/* dividend limbs feeding the low part */

  /* Size the high (divappr) part so that it overlaps the low part; the
     overlap grows by one limb in the qn0 != dn case, and by one more in
     the leading-limb-is-1 corner case below.  */
  nn1 = nn0 - 1 + ((nn-dn) & 1);
  qn1 = qn0;
  if (LIKELY (qn0 != dn))
    {
      nn1 = nn1 + 1;
      qn1 = qn1 + 1;
      if (UNLIKELY (dp[dn - 1] == 1 && qn1 != dn))
	{
	  /* If the leading divisor limb == 1, i.e. has just one bit, we have
	     to include an extra limb in order to get the needed overlap.  */
	  /* FIXME: Now with the mu_divappr_q function, we should really need
	     more overlap. That indicates one of two things: (1) The test code
	     is not good. (2) We actually overlap too much by default.  */
	  nn1 = nn1 + 1;
	  qn1 = qn1 + 1;
	}
    }

  tp = TMP_ALLOC_LIMBS (nn1 + 1);

  count_leading_zeros (cnt, dp[dn - 1]);

  /* Normalize divisor, store into tmp area.  */
  if (cnt != 0)
    {
      xdp = TMP_ALLOC_LIMBS (qn1);
      mpn_lshift (xdp, dp + dn - qn1, qn1, cnt);
    }
  else
    {
      /* Already normalized; cast away const — xdp is only read below.  */
      xdp = (mp_ptr) dp + dn - qn1;
    }

  /* Shift dividend according to the divisor normalization.  */
  /* FIXME: We compute too much here for XX_divappr_q, but these functions'
     interfaces want a pointer to the imaginative least significant limb, not
     to the least significant *used* limb.  Of course, we could leave nn1-qn1
     rubbish limbs in the low part, to save some time.  */
  if (cnt != 0)
    {
      cy = mpn_lshift (tp, np + nn - nn1, nn1, cnt);
      if (cy != 0)
	{
	  tp[nn1] = cy;
	  nn1++;
	}
    }
  else
    {
      /* FIXME: This copy is not needed for mpn_mu_divappr_q, except when the
	 mpn_sub_n right before is executed.  */
      MPN_COPY (tp, np + nn - nn1, nn1);
    }

  /* High quotient part, written starting at qp + qn0 - 1, by approximate
     truncating division of the dividend's high limbs.  */
  invert_pi1 (dinv, xdp[qn1 - 1], xdp[qn1 - 2]);
  if (BELOW_THRESHOLD (qn1, DC_DIVAPPR_Q_THRESHOLD))
    {
      qp[qn0 - 1 + nn1 - qn1] = mpn_sbpi1_divappr_q (qp + qn0 - 1, tp, nn1, xdp, qn1, dinv.inv32);
    }
  else if (BELOW_THRESHOLD (qn1, MU_DIVAPPR_Q_THRESHOLD))
    {
      qp[qn0 - 1 + nn1 - qn1] = mpn_dcpi1_divappr_q (qp + qn0 - 1, tp, nn1, xdp, qn1, &dinv);
    }
  else
    {
      /* FIXME: mpn_mu_divappr_q doesn't handle qh != 0.  Work around it with a
	 conditional subtraction here.  */
      qh = mpn_cmp (tp + nn1 - qn1, xdp, qn1) >= 0;
      if (qh)
	mpn_sub_n (tp + nn1 - qn1, tp + nn1 - qn1, xdp, qn1);
      mpn_mu_divappr_q (qp + qn0 - 1, tp, nn1, xdp, qn1, scratch);
      qp[qn0 - 1 + nn1 - qn1] = qh;
    }
  /* Save the divappr's overlap limb; the bdiv below overwrites
     qp[qn0 - 1] with the exact value.  */
  qml = qp[qn0 - 1];

  binvert_limb (di, dp[0]);  di = -di;

  /* Low quotient part, {qp, qn0}, by exact 2-adic division of the
     dividend's low qn0 limbs against the divisor's low qn0 limbs.  */
  if (BELOW_THRESHOLD (qn0, DC_BDIV_Q_THRESHOLD))
    {
      MPN_COPY (tp, np, qn0);
      mpn_sbpi1_bdiv_q (qp, tp, qn0, dp, qn0, di);
    }
  else if (BELOW_THRESHOLD (qn0, MU_BDIV_Q_THRESHOLD))
    {
      MPN_COPY (tp, np, qn0);
      mpn_dcpi1_bdiv_q (qp, tp, qn0, dp, qn0, di);
    }
  else
    {
      mpn_mu_bdiv_q (qp, np, qn0, dp, qn0, scratch);
    }

  /* Correction step: compare the approximate overlap limb against the
     exact one and propagate a borrow into the high part if they differ.
     NOTE(review): this presumes the divappr error is a small positive
     amount — see the "+3 error" caveat in the comment block above the
     function; confirm before re-enabling this code path.  */
  if (qml < qp[qn0 - 1])
    mpn_decr_u (qp + qn0, 1);

  TMP_FREE;
}
     296  #endif