(root)/
gmp-6.3.0/
mpn/
generic/
hgcd2-div.h
       1  /* hgcd2-div.h
       2  
       3     THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
       4     SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
       5     GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
       6  
       7  Copyright 1996, 1998, 2000-2004, 2008, 2012, 2019, 2020 Free Software
       8  Foundation, Inc.
       9  
      10  This file is part of the GNU MP Library.
      11  
      12  The GNU MP Library is free software; you can redistribute it and/or modify
      13  it under the terms of either:
      14  
      15    * the GNU Lesser General Public License as published by the Free
      16      Software Foundation; either version 3 of the License, or (at your
      17      option) any later version.
      18  
      19  or
      20  
      21    * the GNU General Public License as published by the Free Software
      22      Foundation; either version 2 of the License, or (at your option) any
      23      later version.
      24  
      25  or both in parallel, as here.
      26  
      27  The GNU MP Library is distributed in the hope that it will be useful, but
      28  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
      29  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
      30  for more details.
      31  
      32  You should have received copies of the GNU General Public License and the
      33  GNU Lesser General Public License along with the GNU MP Library.  If not,
      34  see https://www.gnu.org/licenses/.  */
      35  
      36  #include "gmp-impl.h"
      37  #include "longlong.h"
      38  
      39  #ifndef HGCD2_DIV1_METHOD
      40  #define HGCD2_DIV1_METHOD 3
      41  #endif
      42  
      43  #ifndef HGCD2_DIV2_METHOD
      44  #define HGCD2_DIV2_METHOD 2
      45  #endif
      46  
      47  #if HAVE_NATIVE_mpn_div_11
      48  
      49  #define div1 mpn_div_11
      50  /* Single-limb division optimized for small quotients.
      51     Returned value holds d0 = r, d1 = q. */
      52  mp_double_limb_t div1 (mp_limb_t, mp_limb_t);
      53  
      54  #elif HGCD2_DIV1_METHOD == 1
      55  
      56  static inline mp_double_limb_t
      57  div1 (mp_limb_t n0, mp_limb_t d0)
      58  {
      59    mp_double_limb_t res;
      60    res.d1 = n0 / d0;
      61    res.d0 = n0 - res.d1 * d0;
      62  
      63    return res;
      64  }
      65  
      66  #elif HGCD2_DIV1_METHOD == 2
      67  
      68  static mp_double_limb_t
      69  div1 (mp_limb_t n0, mp_limb_t d0)
      70  {
      71    mp_double_limb_t res;
      72    int ncnt, dcnt, cnt;
      73    mp_limb_t q;
      74    mp_limb_t mask;
      75  
      76    ASSERT (n0 >= d0);
      77  
      78    count_leading_zeros (ncnt, n0);
      79    count_leading_zeros (dcnt, d0);
      80    cnt = dcnt - ncnt;
      81  
      82    d0 <<= cnt;
      83  
      84    q = -(mp_limb_t) (n0 >= d0);
      85    n0 -= d0 & q;
      86    d0 >>= 1;
      87    q = -q;
      88  
      89    while (--cnt >= 0)
      90      {
      91        mask = -(mp_limb_t) (n0 >= d0);
      92        n0 -= d0 & mask;
      93        d0 >>= 1;
      94        q = (q << 1) - mask;
      95      }
      96  
      97    res.d0 = n0;
      98    res.d1 = q;
      99    return res;
     100  }
     101  
     102  #elif HGCD2_DIV1_METHOD == 3
     103  
     104  static inline mp_double_limb_t
     105  div1 (mp_limb_t n0, mp_limb_t d0)
     106  {
     107    mp_double_limb_t res;
     108    if (UNLIKELY ((d0 >> (GMP_LIMB_BITS - 3)) != 0)
     109        || UNLIKELY (n0 >= (d0 << 3)))
     110      {
     111        res.d1 = n0 / d0;
     112        res.d0 = n0 - res.d1 * d0;
     113      }
     114    else
     115      {
     116        mp_limb_t q, mask;
     117  
     118        d0 <<= 2;
     119  
     120        mask = -(mp_limb_t) (n0 >= d0);
     121        n0 -= d0 & mask;
     122        q = 4 & mask;
     123  
     124        d0 >>= 1;
     125        mask = -(mp_limb_t) (n0 >= d0);
     126        n0 -= d0 & mask;
     127        q += 2 & mask;
     128  
     129        d0 >>= 1;
     130        mask = -(mp_limb_t) (n0 >= d0);
     131        n0 -= d0 & mask;
     132        q -= mask;
     133  
     134        res.d0 = n0;
     135        res.d1 = q;
     136      }
     137    return res;
     138  }
     139  
     140  #elif HGCD2_DIV1_METHOD == 4
     141  
     142  /* Table quotients.  We extract the NBITS most significant bits of the
     143     numerator limb, and the corresponding bits from the divisor limb, and use
     144     these to form an index into the table.  This method is probably only useful
     145     for short pipelines with slow multiplication.
     146  
     147     Possible improvements:
     148  
     149     * Perhaps extract the highest NBITS of the divisor instead of the same bits
     150       as from the numerator.  That would require another count_leading_zeros,
     151       and a post-multiply shift of the quotient.
     152  
     153     * Compress tables?  Their values are tiny, and there are lots of zero
     154       entries (which are never used).
     155  
     156     * Round the table entries more cleverly?
     157  */
     158  
     159  #ifndef NBITS
     160  #define NBITS 5
     161  #endif
     162  
     163  #if NBITS == 5
     164  /* This needs full division about 13.2% of the time. */
     165  static const unsigned char tab[512] = {
     166  17, 9, 5,4,3,2,2,2,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     167  18, 9, 6,4,3,2,2,2,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     168  19,10, 6,4,3,3,2,2,2,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,
     169  20,10, 6,5,3,3,2,2,2,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,
     170  21,11, 7,5,4,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,
     171  22,11, 7,5,4,3,3,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,
     172  23,12, 7,5,4,3,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,
     173  24,12, 8,6,4,3,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,
     174  25,13, 8,6,5,4,3,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,
     175  26,13, 8,6,5,4,3,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,
     176  27,14, 9,6,5,4,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,
     177  28,14, 9,7,5,4,3,3,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,
     178  29,15,10,7,5,4,4,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,
     179  30,15,10,7,6,5,4,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,
     180  31,16,10,7,6,5,4,3,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,
     181  32,16,11,8,6,5,4,3,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
     182  };
     183  #elif NBITS == 6
     184  /* This needs full division about 9.8% of the time. */
     185  static const unsigned char tab[2048] = {
     186  33,17,11, 8, 6, 5,4,4,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
     187   1, 0, 0, 0, 0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     188  34,17,11, 8, 6, 5,4,4,3,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
     189   1, 1, 0, 0, 0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     190  35,18,12, 9, 7, 5,5,4,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
     191   1, 1, 1, 0, 0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     192  36,18,12, 9, 7, 6,5,4,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
     193   1, 1, 1, 1, 0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     194  37,19,13, 9, 7, 6,5,4,4,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
     195   1, 1, 1, 1, 1, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     196  38,19,13, 9, 7, 6,5,4,4,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
     197   1, 1, 1, 1, 1, 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     198  39,20,13,10, 7, 6,5,4,4,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,
     199   1, 1, 1, 1, 1, 1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     200  40,20,14,10, 8, 6,5,5,4,3,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,
     201   1, 1, 1, 1, 1, 1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     202  41,21,14,10, 8, 6,5,5,4,4,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,
     203   1, 1, 1, 1, 1, 1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     204  42,21,14,10, 8, 7,6,5,4,4,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,
     205   1, 1, 1, 1, 1, 1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     206  43,22,15,11, 8, 7,6,5,4,4,3,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,
     207   1, 1, 1, 1, 1, 1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     208  44,22,15,11, 9, 7,6,5,4,4,3,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,
     209   1, 1, 1, 1, 1, 1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     210  45,23,15,11, 9, 7,6,5,5,4,4,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,
     211   1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     212  46,23,16,11, 9, 7,6,5,5,4,4,3,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,
     213   1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     214  47,24,16,12, 9, 7,6,5,5,4,4,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,
     215   1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     216  48,24,16,12, 9, 8,6,6,5,4,4,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,
     217   1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     218  49,25,17,12,10, 8,7,6,5,4,4,4,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,
     219   1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     220  50,25,17,13,10, 8,7,6,5,5,4,4,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,
     221   1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     222  51,26,18,13,10, 8,7,6,5,5,4,4,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,
     223   1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,
     224  52,26,18,13,10, 8,7,6,5,5,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,
     225   1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,
     226  53,27,18,13,10, 9,7,6,5,5,4,4,4,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1,1,
     227   1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,
     228  54,27,19,14,11, 9,7,6,6,5,4,4,4,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1,1,
     229   1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,
     230  55,28,19,14,11, 9,7,6,6,5,5,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1,
     231   1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,
     232  56,28,19,14,11, 9,8,7,6,5,5,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1,
     233   1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,
     234  57,29,20,14,11, 9,8,7,6,5,5,4,4,4,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1,1,1,
     235   1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,
     236  58,29,20,15,11, 9,8,7,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,
     237   1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,
     238  59,30,20,15,12,10,8,7,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1,1,
     239   1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,
     240  60,30,21,15,12,10,8,7,6,6,5,5,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1,1,
     241   1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,
     242  61,31,21,15,12,10,8,7,6,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1,
     243   1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,
     244  62,31,22,16,12,10,9,7,6,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1,
     245   1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,
     246  63,32,22,16,13,10,9,7,7,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,2,1,
     247   1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,
     248  64,32,22,16,13,10,9,8,7,6,5,5,4,4,4,3,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,
     249   1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
     250  };
     251  #else
     252  #error No table for provided NBITS
     253  #endif
     254  
     255  /* Doing tabp with a #define makes compiler warnings about pointing outside an
     256     object go away.  We used to define this as a variable.  It is not clear if
     257     e.g.  (vector[100] - 10) + 10 is well- defined as per the C standard;
     258     (vector[100] + 10) - 10 surely is and there is no sequence point so the
     259     expressions should be equivalent.  To make this safe, we might want to
     260     define tabp as a macro with the index as an argument.  Depending on the
     261     platform, relocs might allow for assembly-time or linker-time resolution to
     262     take place. */
     263  #define tabp (tab - (1 << (NBITS - 1) << NBITS))
     264  
     265  static inline mp_double_limb_t
     266  div1 (mp_limb_t n0, mp_limb_t d0)
     267  {
     268    int ncnt;
     269    size_t nbi, dbi;
     270    mp_limb_t q0;
     271    mp_limb_t r0;
     272    mp_limb_t mask;
     273    mp_double_limb_t res;
     274  
     275    ASSERT (n0 >= d0);		/* Actually only msb position is critical. */
     276  
     277    count_leading_zeros (ncnt, n0);
     278    nbi = n0 << ncnt >> (GMP_LIMB_BITS - NBITS);
     279    dbi = d0 << ncnt >> (GMP_LIMB_BITS - NBITS);
     280  
     281    q0 = tabp[(nbi << NBITS) + dbi];
     282    r0 = n0 - q0 * d0;
     283    mask = -(mp_limb_t) (r0 >= d0);
     284    q0 -= mask;
     285    r0 -= d0 & mask;
     286  
     287    if (UNLIKELY (r0 >= d0))
     288      {
     289        q0 = n0 / d0;
     290        r0 = n0 - q0 * d0;
     291      }
     292  
     293    res.d1 = q0;
     294    res.d0 = r0;
     295    return res;
     296  }
     297  
     298  #elif HGCD2_DIV1_METHOD == 5
     299  
     300  /* Table inverses of divisors.  We don't bother with suppressing the msb from
     301     the tables.  We index with the NBITS most significant divisor bits,
     302     including the always-set highest bit, but use addressing trickery via tabp
     303     to suppress it.
     304  
     305     Possible improvements:
     306  
     307     * Do first multiply using 32-bit operations on 64-bit computers.  At least
     308       on most Arm64 cores, that uses 3 times less resources.  It also saves on
     309       many x86-64 processors.
     310  */
     311  
     312  #ifndef NBITS
     313  #define NBITS 7
     314  #endif
     315  
     316  #if NBITS == 5
     317  /* This needs full division about 1.63% of the time. */
     318  static const unsigned char tab[16] = {
     319   63, 59, 55, 52, 50, 47, 45, 43, 41, 39, 38, 36, 35, 34, 33, 32
     320  };
     321  #elif NBITS == 6
     322  /* This needs full division about 0.93% of the time. */
     323  static const unsigned char tab[32] = {
     324  127,123,119,116,112,109,106,104,101, 98, 96, 94, 92, 90, 88, 86,
     325   84, 82, 80, 79, 77, 76, 74, 73, 72, 70, 69, 68, 67, 66, 65, 64
     326  };
     327  #elif NBITS == 7
     328  /* This needs full division about 0.49% of the time. */
     329  static const unsigned char tab[64] = {
     330  255,251,247,243,239,236,233,229,226,223,220,217,214,211,209,206,
     331  203,201,198,196,194,191,189,187,185,183,181,179,177,175,173,171,
     332  169,167,166,164,162,161,159,158,156,155,153,152,150,149,147,146,
     333  145,143,142,141,140,139,137,136,135,134,133,132,131,130,129,128
     334  };
     335  #elif NBITS == 8
     336  /* This needs full division about 0.26% of the time. */
     337  static const unsigned short tab[128] = {
     338  511,507,503,499,495,491,488,484,480,477,473,470,467,463,460,457,
     339  454,450,447,444,441,438,435,433,430,427,424,421,419,416,413,411,
     340  408,406,403,401,398,396,393,391,389,386,384,382,380,377,375,373,
     341  371,369,367,365,363,361,359,357,355,353,351,349,347,345,343,342,
     342  340,338,336,335,333,331,329,328,326,325,323,321,320,318,317,315,
     343  314,312,311,309,308,306,305,303,302,301,299,298,296,295,294,292,
     344  291,290,288,287,286,285,283,282,281,280,279,277,276,275,274,273,
     345  272,270,269,268,267,266,265,264,263,262,261,260,259,258,257,256
     346  };
     347  #else
     348  #error No table for provided NBITS
     349  #endif
     350  
     351  /* Doing tabp with a #define makes compiler warnings about pointing outside an
     352     object go away.  We used to define this as a variable.  It is not clear if
     353     e.g.  (vector[100] - 10) + 10 is well- defined as per the C standard;
     354     (vector[100] + 10) - 10 surely is and there is no sequence point so the
     355     expressions should be equivalent.  To make this safe, we might want to
     356     define tabp as a macro with the index as an argument.  Depending on the
     357     platform, relocs might allow for assembly-time or linker-time resolution to
     358     take place. */
     359  #define tabp (tab - (1 << (NBITS - 1)))
     360  
     361  static inline mp_double_limb_t
     362  div1 (mp_limb_t n0, mp_limb_t d0)
     363  {
     364    int ncnt, dcnt;
     365    size_t dbi;
     366    mp_limb_t inv;
     367    mp_limb_t q0;
     368    mp_limb_t r0;
     369    mp_limb_t mask;
     370    mp_double_limb_t res;
     371  
     372    count_leading_zeros (ncnt, n0);
     373    count_leading_zeros (dcnt, d0);
     374  
     375    dbi = d0 << dcnt >> (GMP_LIMB_BITS - NBITS);
     376    inv = tabp[dbi];
     377    q0 = ((n0 << ncnt) >> (NBITS + 1)) * inv >> (GMP_LIMB_BITS - 1 + ncnt - dcnt);
     378    r0 = n0 - q0 * d0;
     379    mask = -(mp_limb_t) (r0 >= d0);
     380    q0 -= mask;
     381    r0 -= d0 & mask;
     382  
     383    if (UNLIKELY (r0 >= d0))
     384      {
     385        q0 = n0 / d0;
     386        r0 = n0 - q0 * d0;
     387      }
     388  
     389    res.d1 = q0;
     390    res.d0 = r0;
     391    return res;
     392  }
     393  
     394  #else
     395  #error Unknown HGCD2_DIV1_METHOD
     396  #endif
     397  
     398  #if HAVE_NATIVE_mpn_div_22
     399  
     400  #define div2 mpn_div_22
     401  /* Two-limb division optimized for small quotients.  */
     402  mp_limb_t div2 (mp_ptr, mp_limb_t, mp_limb_t, mp_limb_t, mp_limb_t);
     403  
     404  #elif HGCD2_DIV2_METHOD == 1
     405  
     406  static mp_limb_t
     407  div2 (mp_ptr rp,
     408        mp_limb_t n1, mp_limb_t n0,
     409        mp_limb_t d1, mp_limb_t d0)
     410  {
     411    mp_double_limb_t rq = div1 (n1, d1);
     412    if (UNLIKELY (rq.d1 > d1))
     413      {
     414        mp_limb_t n2, q, t1, t0;
     415        int c;
     416  
     417        /* Normalize */
     418        count_leading_zeros (c, d1);
     419        ASSERT (c > 0);
     420  
     421        n2 = n1 >> (GMP_LIMB_BITS - c);
     422        n1 = (n1 << c) | (n0 >> (GMP_LIMB_BITS - c));
     423        n0 <<= c;
     424        d1 = (d1 << c) | (d0 >> (GMP_LIMB_BITS - c));
     425        d0 <<= c;
     426  
     427        udiv_qrnnd (q, n1, n2, n1, d1);
     428        umul_ppmm (t1, t0, q, d0);
     429        if (t1 > n1 || (t1 == n1 && t0 > n0))
     430  	{
     431  	  ASSERT (q > 0);
     432  	  q--;
     433  	  sub_ddmmss (t1, t0, t1, t0, d1, d0);
     434  	}
     435        sub_ddmmss (n1, n0, n1, n0, t1, t0);
     436  
     437        /* Undo normalization */
     438        rp[0] = (n0 >> c) | (n1 << (GMP_LIMB_BITS - c));
     439        rp[1] = n1 >> c;
     440  
     441        return q;
     442      }
     443    else
     444      {
     445        mp_limb_t q, t1, t0;
     446        n1 = rq.d0;
     447        q = rq.d1;
     448        umul_ppmm (t1, t0, q, d0);
     449        if (UNLIKELY (t1 >= n1) && (t1 > n1 || t0 > n0))
     450  	{
     451  	  ASSERT (q > 0);
     452  	  q--;
     453  	  sub_ddmmss (t1, t0, t1, t0, d1, d0);
     454  	}
     455        sub_ddmmss (rp[1], rp[0], n1, n0, t1, t0);
     456        return q;
     457      }
     458  }
     459  
     460  #elif HGCD2_DIV2_METHOD == 2
     461  
     462  /* Bit-wise div2. Relies on fast count_leading_zeros. */
     463  static mp_limb_t
     464  div2 (mp_ptr rp,
     465        mp_limb_t n1, mp_limb_t n0,
     466        mp_limb_t d1, mp_limb_t d0)
     467  {
     468    mp_limb_t q = 0;
     469    int ncnt;
     470    int dcnt;
     471  
     472    count_leading_zeros (ncnt, n1);
     473    count_leading_zeros (dcnt, d1);
     474    dcnt -= ncnt;
     475  
     476    d1 = (d1 << dcnt) + (d0 >> 1 >> (GMP_LIMB_BITS - 1 - dcnt));
     477    d0 <<= dcnt;
     478  
     479    do
     480      {
     481        mp_limb_t mask;
     482        q <<= 1;
     483        if (UNLIKELY (n1 == d1))
     484  	mask = -(n0 >= d0);
     485        else
     486  	mask = -(n1 > d1);
     487  
     488        q -= mask;
     489  
     490        sub_ddmmss (n1, n0, n1, n0, mask & d1, mask & d0);
     491  
     492        d0 = (d1 << (GMP_LIMB_BITS - 1)) | (d0 >> 1);
     493        d1 = d1 >> 1;
     494      }
     495    while (dcnt--);
     496  
     497    rp[0] = n0;
     498    rp[1] = n1;
     499  
     500    return q;
     501  }
     502  #else
     503  #error Unknown HGCD2_DIV2_METHOD
     504  #endif