1  /* mpn_divexact_by3c -- mpn exact division by 3.
       2  
       3  Copyright 2000-2003, 2008 Free Software Foundation, Inc.
       4  
       5  This file is part of the GNU MP Library.
       6  
       7  The GNU MP Library is free software; you can redistribute it and/or modify
       8  it under the terms of either:
       9  
      10    * the GNU Lesser General Public License as published by the Free
      11      Software Foundation; either version 3 of the License, or (at your
      12      option) any later version.
      13  
      14  or
      15  
      16    * the GNU General Public License as published by the Free Software
      17      Foundation; either version 2 of the License, or (at your option) any
      18      later version.
      19  
      20  or both in parallel, as here.
      21  
      22  The GNU MP Library is distributed in the hope that it will be useful, but
      23  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
      24  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
      25  for more details.
      26  
      27  You should have received copies of the GNU General Public License and the
      28  GNU Lesser General Public License along with the GNU MP Library.  If not,
      29  see https://www.gnu.org/licenses/.  */
      30  
      31  #include "gmp-impl.h"
      32  
      33  #if DIVEXACT_BY3_METHOD == 0
      34  
      35  mp_limb_t
      36  mpn_divexact_by3c (mp_ptr rp, mp_srcptr up, mp_size_t un, mp_limb_t c)
      37  {
      38    mp_limb_t r;
      39    r = mpn_bdiv_dbm1c (rp, up, un, GMP_NUMB_MASK / 3, GMP_NUMB_MASK / 3 * c);
      40  
      41    /* Possible bdiv_dbm1 return values are C * (GMP_NUMB_MASK / 3), 0 <= C < 3.
      42       We want to return C.  We compute the remainder mod 4 and notice that the
      43       inverse of (2^(2k)-1)/3 mod 4 is 1.  */
      44    return r & 3;
      45  }
      46  
      47  #endif
      48  
      49  #if DIVEXACT_BY3_METHOD == 1
      50  
      51  /* The algorithm here is basically the same as mpn_divexact_1, as described
      52     in the manual.  Namely at each step q = (src[i]-c)*inverse, and new c =
      53     borrow(src[i]-c) + high(divisor*q).  But because the divisor is just 3,
      54     high(divisor*q) can be determined with two comparisons instead of a
      55     multiply.
      56  
      57     The "c += ..."s add the high limb of 3*l to c.  That high limb will be 0,
      58     1 or 2.  Doing two separate "+="s seems to give better code on gcc (as of
      59     2.95.2 at least).
      60  
      61     It will be noted that the new c is formed by adding three values each 0
      62     or 1.  But the total is only 0, 1 or 2.  When the subtraction src[i]-c
      63     causes a borrow, that leaves a limb value of either 0xFF...FF or
      64     0xFF...FE.  The multiply by MODLIMB_INVERSE_3 gives 0x55...55 or
      65     0xAA...AA respectively, and in those cases high(3*q) is only 0 or 1
      66     respectively, hence a total of no more than 2.
      67  
      68     Alternatives:
      69  
      70     This implementation has each multiply on the dependent chain, due to
      71     "l=s-c".  See below for alternative code which avoids that.  */
      72  
      73  mp_limb_t
      74  mpn_divexact_by3c (mp_ptr restrict rp, mp_srcptr restrict up, mp_size_t un, mp_limb_t c)
      75  {
      76    mp_limb_t  l, q, s;
      77    mp_size_t  i;
      78  
      79    ASSERT (un >= 1);
      80    ASSERT (c == 0 || c == 1 || c == 2);
      81    ASSERT (MPN_SAME_OR_SEPARATE_P (rp, up, un));
      82  
      83    i = 0;
      84    do
      85      {
      86        s = up[i];
      87        SUBC_LIMB (c, l, s, c);
      88  
      89        q = (l * MODLIMB_INVERSE_3) & GMP_NUMB_MASK;
      90        rp[i] = q;
      91  
      92        c += (q >= GMP_NUMB_CEIL_MAX_DIV3);
      93        c += (q >= GMP_NUMB_CEIL_2MAX_DIV3);
      94      }
      95    while (++i < un);
      96  
      97    ASSERT (c == 0 || c == 1 || c == 2);
      98    return c;
      99  }
     100  
     101  
     102  #endif
     103  
     104  #if DIVEXACT_BY3_METHOD == 2
     105  
     106  /* The following alternative code re-arranges the quotient calculation from
     107     (src[i]-c)*inverse to instead
     108  
     109         q = src[i]*inverse - c*inverse
     110  
     111     thereby allowing src[i]*inverse to be scheduled back as far as desired,
     112     making full use of multiplier throughput and leaving just some carry
     113     handing on the dependent chain.
     114  
     115     The carry handling consists of determining the c for the next iteration.
     116     This is the same as described above, namely look for any borrow from
     117     src[i]-c, and at the high of 3*q.
     118  
     119     high(3*q) is done with two comparisons as above (in c2 and c3).  The
     120     borrow from src[i]-c is incorporated into those by noting that if there's
     121     a carry then then we have src[i]-c == 0xFF..FF or 0xFF..FE, in turn
     122     giving q = 0x55..55 or 0xAA..AA.  Adding 1 to either of those q values is
     123     enough to make high(3*q) come out 1 bigger, as required.
     124  
     125     l = -c*inverse is calculated at the same time as c, since for most chips
     126     it can be more conveniently derived from separate c1/c2/c3 values than
     127     from a combined c equal to 0, 1 or 2.
     128  
     129     The net effect is that with good pipelining this loop should be able to
     130     run at perhaps 4 cycles/limb, depending on available execute resources
     131     etc.
     132  
     133     Usage:
     134  
     135     This code is not used by default, since we really can't rely on the
     136     compiler generating a good software pipeline, nor on such an approach
     137     even being worthwhile on all CPUs.
     138  
     139     Itanium is one chip where this algorithm helps though, see
     140     mpn/ia64/diveby3.asm.  */
     141  
     142  mp_limb_t
     143  mpn_divexact_by3c (mp_ptr restrict rp, mp_srcptr restrict up, mp_size_t un, mp_limb_t cy)
     144  {
     145    mp_limb_t  s, sm, cl, q, qx, c2, c3;
     146    mp_size_t  i;
     147  
     148    ASSERT (un >= 1);
     149    ASSERT (cy == 0 || cy == 1 || cy == 2);
     150    ASSERT (MPN_SAME_OR_SEPARATE_P (rp, up, un));
     151  
     152    cl = cy == 0 ? 0 : cy == 1 ? -MODLIMB_INVERSE_3 : -2*MODLIMB_INVERSE_3;
     153  
     154    for (i = 0; i < un; i++)
     155      {
     156        s = up[i];
     157        sm = (s * MODLIMB_INVERSE_3) & GMP_NUMB_MASK;
     158  
     159        q = (cl + sm) & GMP_NUMB_MASK;
     160        rp[i] = q;
     161        qx = q + (s < cy);
     162  
     163        c2 = qx >= GMP_NUMB_CEIL_MAX_DIV3;
     164        c3 = qx >= GMP_NUMB_CEIL_2MAX_DIV3 ;
     165  
     166        cy = c2 + c3;
     167        cl = (-c2 & -MODLIMB_INVERSE_3) + (-c3 & -MODLIMB_INVERSE_3);
     168      }
     169  
     170    return cy;
     171  }
     172  
     173  #endif