(root)/
gmp-6.3.0/
mpn/
sparc64/
mod_1.c
       1  /* UltraSPARC 64 mpn_mod_1 -- mpn by limb remainder.
       2  
       3  Copyright 1991, 1993, 1994, 1999-2001, 2003, 2010 Free Software Foundation,
       4  Inc.
       5  
       6  This file is part of the GNU MP Library.
       7  
       8  The GNU MP Library is free software; you can redistribute it and/or modify
       9  it under the terms of either:
      10  
      11    * the GNU Lesser General Public License as published by the Free
      12      Software Foundation; either version 3 of the License, or (at your
      13      option) any later version.
      14  
      15  or
      16  
      17    * the GNU General Public License as published by the Free Software
      18      Foundation; either version 2 of the License, or (at your option) any
      19      later version.
      20  
      21  or both in parallel, as here.
      22  
      23  The GNU MP Library is distributed in the hope that it will be useful, but
      24  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
      25  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
      26  for more details.
      27  
      28  You should have received copies of the GNU General Public License and the
      29  GNU Lesser General Public License along with the GNU MP Library.  If not,
      30  see https://www.gnu.org/licenses/.  */
      31  
      32  #include "gmp-impl.h"
      33  #include "longlong.h"
      34  
      35  #include "mpn/sparc64/sparc64.h"
      36  
      37  
      38  /*                 64-bit divisor   32-bit divisor
      39                      cycles/limb      cycles/limb
      40                       (approx)         (approx)
      41     Ultrasparc 2i:      160               120
      42  */
      43  
      44  
      45  /* 32-bit divisors are treated in special case code.  This requires 4 mulx
      46     per limb instead of 8 in the general case.
      47  
      48     For big endian systems we need HALF_ENDIAN_ADJ included in the src[i]
      49     addressing, to get the two halves of each limb read in the correct order.
      50     This is kept in an adj variable.  Doing that measures about 6 c/l faster
      51     than just writing HALF_ENDIAN_ADJ(i) in the loop.  The latter shouldn't
      52     be 6 cycles worth of work, but perhaps it doesn't schedule well (on gcc
      53     3.2.1 at least).
      54  
      55     A simple udivx/umulx loop for the 32-bit case was attempted for small
      56     sizes, but at size==2 it was only about the same speed and at size==3 was
      57     slower.  */
      58  
/* Return the remainder of {src_limbptr, size_limbs} divided by d_limb.

   Handles any divisor, normalized or not (d_limb != 0 required).  A divisor
   fitting in 32 bits dispatches to a half-limb loop using
   udiv_qrnnd_half_preinv; wider divisors use the full-limb
   udiv_qrnnd_preinv loop.  size_limbs == 0 returns 0.  */
static mp_limb_t
mpn_mod_1_anynorm (mp_srcptr src_limbptr, mp_size_t size_limbs, mp_limb_t d_limb)
{
  int        norm, norm_rshift;
  mp_limb_t  src_high_limb;
  mp_size_t  i;

  ASSERT (size_limbs >= 0);
  ASSERT (d_limb != 0);

  if (UNLIKELY (size_limbs == 0))
    return 0;

  src_high_limb = src_limbptr[size_limbs-1];

  /* udivx is good for size==1, and no need to bother checking limb<divisor,
     since if that's likely the caller should check */
  if (UNLIKELY (size_limbs == 1))
    return src_high_limb % d_limb;

  if (d_limb <= CNST_LIMB(0xFFFFFFFF))
    {
      /* 32-bit divisor: treat the operand as 2*size_limbs halfwords.  */
      unsigned   *src, n1, n0, r, dummy_q, nshift, norm_rmask;
      mp_size_t  size, adj;
      mp_limb_t  dinv_limb;

      size = 2 * size_limbs;    /* halfwords */
      src = (unsigned *) src_limbptr;

      /* prospective initial remainder, if < d */
      r = src_high_limb >> 32;

      /* If the length of the source is uniformly distributed, then there's
         a 50% chance of the high 32-bits being zero, which we can skip.  */
      if (r == 0)
        {
          r = (unsigned) src_high_limb;
          size--;
          ASSERT (size > 0);  /* because always even */
        }

      /* Skip a division if high < divisor.  Having the test here before
         normalizing will still skip as often as possible.  */
      if (r < d_limb)
        {
          size--;
          ASSERT (size > 0);  /* because size==1 handled above */
        }
      else
        r = 0;

      /* Normalize the 32-bit divisor.  NOTE(review): the -32 suggests
         count_leading_zeros_32 counts relative to the full 64-bit limb
         (yielding 32..63 for a value fitting 32 bits) -- confirm against
         the definition in sparc64.h.  */
      count_leading_zeros_32 (norm, d_limb);
      norm -= 32;
      d_limb <<= norm;

      /* norm_rmask discards the shifted-in term when norm==0, since in that
         case n1 >> norm_rshift would be a full 32-bit-width shift.  */
      norm_rshift = 32 - norm;
      norm_rmask = (norm == 0 ? 0 : 0xFFFFFFFF);
      i = size-1;
      adj = HALF_ENDIAN_ADJ (i);
      n1 = src [i + adj];
      r = (r << norm) | ((n1 >> norm_rshift) & norm_rmask);

      invert_half_limb (dinv_limb, d_limb);
      adj = -adj;   /* HALF_ENDIAN_ADJ alternates sign between odd/even i
                       (see adj discussion in the file comment above) */

      for (i--; i >= 0; i--)
        {
          n0 = src [i + adj];
          adj = -adj;
          nshift = (n1 << norm) | ((n0 >> norm_rshift) & norm_rmask);
          udiv_qrnnd_half_preinv (dummy_q, r, r, nshift, d_limb, dinv_limb);
          n1 = n0;
        }

      /* same as loop, but without n0 */
      nshift = n1 << norm;
      udiv_qrnnd_half_preinv (dummy_q, r, r, nshift, d_limb, dinv_limb);

      /* unnormalize the remainder; its low norm bits must be zero */
      ASSERT ((r & ((1 << norm) - 1)) == 0);
      return r >> norm;
    }
  else
    {
      /* Divisor wider than 32 bits: full-limb loop.  */
      mp_srcptr  src;
      mp_size_t  size;
      mp_limb_t  n1, n0, r, dinv, dummy_q, nshift, norm_rmask;

      src = src_limbptr;
      size = size_limbs;
      r = src_high_limb;  /* initial remainder */

      /* Skip a division if high < divisor.  Having the test here before
         normalizing will still skip as often as possible.  */
      if (r < d_limb)
        {
          size--;
          ASSERT (size > 0);  /* because size==1 handled above */
        }
      else
        r = 0;

      count_leading_zeros (norm, d_limb);
      d_limb <<= norm;

      /* Here d_limb > 0xFFFFFFFF so norm <= 31, hence the bits brought in
         by n >> norm_rshift fit in 32 bits and a 32-bit norm_rmask is
         enough; when norm==0 the mask instead discards the full-width
         shift result entirely.  */
      norm_rshift = GMP_LIMB_BITS - norm;
      norm_rmask = (norm == 0 ? 0 : 0xFFFFFFFF);

      src += size;
      n1 = *--src;
      r = (r << norm) | ((n1 >> norm_rshift) & norm_rmask);

      invert_limb (dinv, d_limb);

      for (i = size-2; i >= 0; i--)
        {
          n0 = *--src;
          nshift = (n1 << norm) | ((n0 >> norm_rshift) & norm_rmask);
          udiv_qrnnd_preinv (dummy_q, r, r, nshift, d_limb, dinv);
          n1 = n0;
        }

      /* same as loop, but without n0 */
      nshift = n1 << norm;
      udiv_qrnnd_preinv (dummy_q, r, r, nshift, d_limb, dinv);

      /* unnormalize the remainder; its low norm bits must be zero */
      ASSERT ((r & ((CNST_LIMB(1) << norm) - 1)) == 0);
      return r >> norm;
    }
}
     188  
     189  mp_limb_t
     190  mpn_mod_1 (mp_srcptr ap, mp_size_t n, mp_limb_t b)
     191  {
     192    ASSERT (n >= 0);
     193    ASSERT (b != 0);
     194  
     195    /* Should this be handled at all?  Rely on callers?  Note un==0 is currently
     196       required by mpz/fdiv_r_ui.c and possibly other places.  */
     197    if (n == 0)
     198      return 0;
     199  
     200    if (UNLIKELY ((b & GMP_NUMB_HIGHBIT) != 0))
     201      {
     202        if (BELOW_THRESHOLD (n, MOD_1N_TO_MOD_1_1_THRESHOLD))
     203  	{
     204  	  return mpn_mod_1_anynorm (ap, n, b);
     205  	}
     206        else
     207  	{
     208  	  mp_limb_t pre[4];
     209  	  mpn_mod_1_1p_cps (pre, b);
     210  	  return mpn_mod_1_1p (ap, n, b, pre);
     211  	}
     212      }
     213    else
     214      {
     215        if (BELOW_THRESHOLD (n, MOD_1U_TO_MOD_1_1_THRESHOLD))
     216  	{
     217  	  return mpn_mod_1_anynorm (ap, n, b);
     218  	}
     219        else if (BELOW_THRESHOLD (n, MOD_1_1_TO_MOD_1_2_THRESHOLD))
     220  	{
     221  	  mp_limb_t pre[4];
     222  	  mpn_mod_1_1p_cps (pre, b);
     223  	  return mpn_mod_1_1p (ap, n, b << pre[1], pre);
     224  	}
     225        else if (BELOW_THRESHOLD (n, MOD_1_2_TO_MOD_1_4_THRESHOLD) || UNLIKELY (b > GMP_NUMB_MASK / 4))
     226  	{
     227  	  mp_limb_t pre[5];
     228  	  mpn_mod_1s_2p_cps (pre, b);
     229  	  return mpn_mod_1s_2p (ap, n, b << pre[1], pre);
     230  	}
     231        else
     232  	{
     233  	  mp_limb_t pre[7];
     234  	  mpn_mod_1s_4p_cps (pre, b);
     235  	  return mpn_mod_1s_4p (ap, n, b << pre[1], pre);
     236  	}
     237      }
     238  }