1 /* longlong.h -- definitions for mixed size 32/64 bit arithmetic.
2
3 Copyright 1991-1994, 1996, 1997, 1999-2005, 2007-2009, 2011-2020 Free Software
4 Foundation, Inc.
5
6 This file is part of the GNU MP Library.
7
8 The GNU MP Library is free software; you can redistribute it and/or modify
9 it under the terms of either:
10
11 * the GNU Lesser General Public License as published by the Free
12 Software Foundation; either version 3 of the License, or (at your
13 option) any later version.
14
15 or
16
17 * the GNU General Public License as published by the Free Software
18 Foundation; either version 2 of the License, or (at your option) any
19 later version.
20
21 or both in parallel, as here.
22
23 The GNU MP Library is distributed in the hope that it will be useful, but
24 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
25 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
26 for more details.
27
28 You should have received copies of the GNU General Public License and the
29 GNU Lesser General Public License along with the GNU MP Library. If not,
30 see https://www.gnu.org/licenses/. */
31
32 /* You have to define the following before including this file:
33
34 UWtype -- An unsigned type, default type for operations (typically a "word")
35 UHWtype -- An unsigned type, at least half the size of UWtype
   UDWtype -- An unsigned type, at least twice as large as UWtype
37 W_TYPE_SIZE -- size in bits of UWtype
38
39 SItype, USItype -- Signed and unsigned 32 bit types
40 DItype, UDItype -- Signed and unsigned 64 bit types
41
42 On a 32 bit machine UWtype should typically be USItype;
43 on a 64 bit machine, UWtype should typically be UDItype.
44
45 Optionally, define:
46
47 LONGLONG_STANDALONE -- Avoid code that needs machine-dependent support files
48 NO_ASM -- Disable inline asm
49
50
51 CAUTION! Using this version of longlong.h outside of GMP is not safe. You
52 need to include gmp.h and gmp-impl.h, or certain things might not work as
53 expected.
54 */
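/* Purely as an illustration (this block is never compiled), one plausible
   set of the definitions described above, assuming a typical 64-bit GCC
   host.  Within GMP the real definitions come from gmp.h and gmp-impl.h.  */
#if 0
typedef int		SItype;		/* signed 32-bit */
typedef unsigned int	USItype;	/* unsigned 32-bit */
typedef long long	DItype;		/* signed 64-bit */
typedef unsigned long long UDItype;	/* unsigned 64-bit */
#define UWtype		UDItype		/* one full word */
#define UHWtype		USItype		/* at least half a word */
#define UDWtype		unsigned __int128  /* at least two words, GCC extension */
#define W_TYPE_SIZE	64
#endif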
55
56 #define __BITS4 (W_TYPE_SIZE / 4)
57 #define __ll_B ((UWtype) 1 << (W_TYPE_SIZE / 2))
58 #define __ll_lowpart(t) ((UWtype) (t) & (__ll_B - 1))
59 #define __ll_highpart(t) ((UWtype) (t) >> (W_TYPE_SIZE / 2))
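/* Note that for any UWtype value t,
   t == __ll_highpart (t) * __ll_B + __ll_lowpart (t).
   For instance with W_TYPE_SIZE == 32, __ll_B is 0x10000 and t == 0x12345678
   splits into high part 0x1234 and low part 0x5678.  */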
60
/* This is used to make sure that no undesirable sharing takes place between
   different libraries that use this file.  */
63 #ifndef __MPN
64 #define __MPN(x) __##x
65 #endif
66
67 /* Define auxiliary asm macros.
68
69 1) umul_ppmm(high_prod, low_prod, multiplier, multiplicand) multiplies two
70 UWtype integers MULTIPLIER and MULTIPLICAND, and generates a two UWtype
71 word product in HIGH_PROD and LOW_PROD.
72
73 2) __umulsidi3(a,b) multiplies two UWtype integers A and B, and returns a
74 UDWtype product. This is just a variant of umul_ppmm.
75
76 3) udiv_qrnnd(quotient, remainder, high_numerator, low_numerator,
77 denominator) divides a UDWtype, composed by the UWtype integers
78 HIGH_NUMERATOR and LOW_NUMERATOR, by DENOMINATOR and places the quotient
79 in QUOTIENT and the remainder in REMAINDER. HIGH_NUMERATOR must be less
   than DENOMINATOR for correct operation.  If, in addition, the most
   significant bit of DENOMINATOR is required to be 1 (i.e. DENOMINATOR must
   be normalized), then the pre-processor symbol UDIV_NEEDS_NORMALIZATION is
   defined to 1.
83
84 4) sdiv_qrnnd(quotient, remainder, high_numerator, low_numerator,
85 denominator). Like udiv_qrnnd but the numbers are signed. The quotient
86 is rounded towards 0.
87
88 5) count_leading_zeros(count, x) counts the number of zero-bits from the
89 msb to the first non-zero bit in the UWtype X. This is the number of
90 steps X needs to be shifted left to set the msb. Undefined for X == 0,
91 unless the symbol COUNT_LEADING_ZEROS_0 is defined to some value.
92
93 6) count_trailing_zeros(count, x) like count_leading_zeros, but counts
94 from the least significant end.
95
96 7) add_ssaaaa(high_sum, low_sum, high_addend_1, low_addend_1,
   high_addend_2, low_addend_2) adds two two-word UWtype integers, composed by
98 HIGH_ADDEND_1 and LOW_ADDEND_1, and HIGH_ADDEND_2 and LOW_ADDEND_2
99 respectively. The result is placed in HIGH_SUM and LOW_SUM. Overflow
100 (i.e. carry out) is not stored anywhere, and is lost.
101
102 8) sub_ddmmss(high_difference, low_difference, high_minuend, low_minuend,
103 high_subtrahend, low_subtrahend) subtracts two two-word UWtype integers,
   composed by HIGH_MINUEND and LOW_MINUEND, and HIGH_SUBTRAHEND and
   LOW_SUBTRAHEND respectively.  The result is placed in HIGH_DIFFERENCE
106 and LOW_DIFFERENCE. Overflow (i.e. carry out) is not stored anywhere,
107 and is lost.
108
109 If any of these macros are left undefined for a particular CPU,
110 C macros are used.
111
112
113 Notes:
114
115 For add_ssaaaa the two high and two low addends can both commute, but
116 unfortunately gcc only supports one "%" commutative in each asm block.
117 This has always been so but is only documented in recent versions
118 (eg. pre-release 3.3). Having two or more "%"s can cause an internal
119 compiler error in certain rare circumstances.
120
121 Apparently it was only the last "%" that was ever actually respected, so
122 the code has been updated to leave just that. Clearly there's a free
123 choice whether high or low should get it, if there's a reason to favour
124 one over the other. Also obviously when the constraints on the two
125 operands are identical there's no benefit to the reloader in any "%" at
126 all.
127
128 */
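/* Purely illustrative usage sketch (never compiled): the macros above
   combine in the obvious way.  The helper functions below are hypothetical;
   nothing like them is defined by this file.  */
#if 0
/* Double-word product of a and b plus a double-word addend; any carry out
   of the top word is lost, as for add_ssaaaa itself.  */
static void
example_mul_add (UWtype *hi, UWtype *lo,
		 UWtype a, UWtype b, UWtype add_hi, UWtype add_lo)
{
  UWtype ph, pl;
  umul_ppmm (ph, pl, a, b);
  add_ssaaaa (*hi, *lo, ph, pl, add_hi, add_lo);
}

/* Divide the three-word number {n2,n1,n0} by d, giving the two-word quotient
   {q1,q0} and a one-word remainder.  Requires n2 < d, and d normalized (msb
   set) when UDIV_NEEDS_NORMALIZATION is 1.  */
static void
example_div_3by1 (UWtype *q1, UWtype *q0, UWtype *r,
		  UWtype n2, UWtype n1, UWtype n0, UWtype d)
{
  UWtype r1;
  udiv_qrnnd (*q1, r1, n2, n1, d);
  udiv_qrnnd (*q0, *r, r1, n0, d);
}
#endif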
129
130 /* The CPUs come in alphabetical order below.
131
132 Please add support for more CPUs here, or improve the current support
133 for the CPUs below! */
134
135
136 /* count_leading_zeros_gcc_clz is count_leading_zeros implemented with gcc
137 3.4 __builtin_clzl or __builtin_clzll, according to our limb size.
138 Similarly count_trailing_zeros_gcc_ctz using __builtin_ctzl or
139 __builtin_ctzll.
140
   These builtins are only used where we have checked what code comes out;
   on some chips they're merely libgcc calls, in which case we instead want
   an inline (either asm or generic C).
144
145 These builtins are better than an asm block of the same insn, since an
146 asm block doesn't give gcc any information about scheduling or resource
147 usage. We keep an asm block for use on prior versions of gcc though.
148
149 For reference, __builtin_ffs existed in gcc prior to __builtin_clz, but
150 it's not used (for count_leading_zeros) because it generally gives extra
151 code to ensure the result is 0 when the input is 0, which we don't need
152 or want. */
153
154 #ifdef _LONG_LONG_LIMB
155 #define count_leading_zeros_gcc_clz(count,x) \
156 do { \
157 ASSERT ((x) != 0); \
158 (count) = __builtin_clzll (x); \
159 } while (0)
160 #else
161 #define count_leading_zeros_gcc_clz(count,x) \
162 do { \
163 ASSERT ((x) != 0); \
164 (count) = __builtin_clzl (x); \
165 } while (0)
166 #endif
167
168 #ifdef _LONG_LONG_LIMB
169 #define count_trailing_zeros_gcc_ctz(count,x) \
170 do { \
171 ASSERT ((x) != 0); \
172 (count) = __builtin_ctzll (x); \
173 } while (0)
174 #else
175 #define count_trailing_zeros_gcc_ctz(count,x) \
176 do { \
177 ASSERT ((x) != 0); \
178 (count) = __builtin_ctzl (x); \
179 } while (0)
180 #endif
181
182
183 /* FIXME: The macros using external routines like __MPN(count_leading_zeros)
184 don't need to be under !NO_ASM */
185 #if ! defined (NO_ASM)
186
187 #if defined (__alpha) && W_TYPE_SIZE == 64
188 /* Most alpha-based machines, except Cray systems. */
189 #if defined (__GNUC__)
190 #if __GMP_GNUC_PREREQ (3,3)
191 #define umul_ppmm(ph, pl, m0, m1) \
192 do { \
193 UDItype __m0 = (m0), __m1 = (m1); \
194 (ph) = __builtin_alpha_umulh (__m0, __m1); \
195 (pl) = __m0 * __m1; \
196 } while (0)
197 #else
198 #define umul_ppmm(ph, pl, m0, m1) \
199 do { \
200 UDItype __m0 = (m0), __m1 = (m1); \
201 __asm__ ("umulh %r1,%2,%0" \
202 : "=r" (ph) \
203 : "%rJ" (__m0), "rI" (__m1)); \
204 (pl) = __m0 * __m1; \
205 } while (0)
206 #endif
207 #else /* ! __GNUC__ */
208 #include <machine/builtins.h>
209 #define umul_ppmm(ph, pl, m0, m1) \
210 do { \
211 UDItype __m0 = (m0), __m1 = (m1); \
212 (ph) = __UMULH (__m0, __m1); \
213 (pl) = __m0 * __m1; \
214 } while (0)
215 #endif
216 #ifndef LONGLONG_STANDALONE
217 #define udiv_qrnnd(q, r, n1, n0, d) \
218 do { UWtype __di; \
219 __di = __MPN(invert_limb) (d); \
220 udiv_qrnnd_preinv (q, r, n1, n0, d, __di); \
221 } while (0)
222 #define UDIV_PREINV_ALWAYS 1
223 #define UDIV_NEEDS_NORMALIZATION 1
224 #endif /* LONGLONG_STANDALONE */
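/* udiv_qrnnd_preinv and __MPN(invert_limb) come from the rest of GMP, hence
   the LONGLONG_STANDALONE guard: the precomputed inverse __di lets the
   division be carried out with multiplications instead of a hardware divide
   instruction.  The same pattern recurs for several CPUs below.  */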
225
226 /* clz_tab is required in all configurations, since mpn/alpha/cntlz.asm
227 always goes into libgmp.so, even when not actually used. */
228 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
229
230 #if defined (__GNUC__) && HAVE_HOST_CPU_alpha_CIX
231 #define count_leading_zeros(COUNT,X) \
232 __asm__("ctlz %1,%0" : "=r"(COUNT) : "r"(X))
233 #define count_trailing_zeros(COUNT,X) \
234 __asm__("cttz %1,%0" : "=r"(COUNT) : "r"(X))
235 #endif /* clz/ctz using cix */
236
237 #if ! defined (count_leading_zeros) \
238 && defined (__GNUC__) && ! defined (LONGLONG_STANDALONE)
239 /* ALPHA_CMPBGE_0 gives "cmpbge $31,src,dst", ie. test src bytes == 0.
240 "$31" is written explicitly in the asm, since an "r" constraint won't
241 select reg 31. There seems no need to worry about "r31" syntax for cray,
242 since gcc itself (pre-release 3.4) emits just $31 in various places. */
243 #define ALPHA_CMPBGE_0(dst, src) \
244 do { asm ("cmpbge $31, %1, %0" : "=r" (dst) : "r" (src)); } while (0)
245 /* Zero bytes are turned into bits with cmpbge, a __clz_tab lookup counts
246 them, locating the highest non-zero byte. A second __clz_tab lookup
247 counts the leading zero bits in that byte, giving the result. */
248 #define count_leading_zeros(count, x) \
249 do { \
250 UWtype __clz__b, __clz__c, __clz__x = (x); \
251 ALPHA_CMPBGE_0 (__clz__b, __clz__x); /* zero bytes */ \
252 __clz__b = __clz_tab [(__clz__b >> 1) ^ 0x7F]; /* 8 to 1 byte */ \
253 __clz__b = __clz__b * 8 - 7; /* 57 to 1 shift */ \
254 __clz__x >>= __clz__b; \
255 __clz__c = __clz_tab [__clz__x]; /* 8 to 1 bit */ \
256 __clz__b = 65 - __clz__b; \
257 (count) = __clz__b - __clz__c; \
258 } while (0)
259 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
260 #endif /* clz using cmpbge */
261
262 #if ! defined (count_leading_zeros) && ! defined (LONGLONG_STANDALONE)
263 #if HAVE_ATTRIBUTE_CONST
264 long __MPN(count_leading_zeros) (UDItype) __attribute__ ((const));
265 #else
266 long __MPN(count_leading_zeros) (UDItype);
267 #endif
268 #define count_leading_zeros(count, x) \
269 ((count) = __MPN(count_leading_zeros) (x))
270 #endif /* clz using mpn */
271 #endif /* __alpha */
272
273 #if defined (__AVR) && W_TYPE_SIZE == 8
274 #define umul_ppmm(ph, pl, m0, m1) \
275 do { \
276 unsigned short __p = (unsigned short) (m0) * (m1); \
277 (ph) = __p >> 8; \
278 (pl) = __p; \
279 } while (0)
280 #endif /* AVR */
281
282 #if defined (_CRAY) && W_TYPE_SIZE == 64
283 #include <intrinsics.h>
284 #define UDIV_PREINV_ALWAYS 1
285 #define UDIV_NEEDS_NORMALIZATION 1
286 long __MPN(count_leading_zeros) (UDItype);
287 #define count_leading_zeros(count, x) \
288 ((count) = _leadz ((UWtype) (x)))
289 #if defined (_CRAYIEEE) /* I.e., Cray T90/ieee, T3D, and T3E */
290 #define umul_ppmm(ph, pl, m0, m1) \
291 do { \
292 UDItype __m0 = (m0), __m1 = (m1); \
293 (ph) = _int_mult_upper (__m0, __m1); \
294 (pl) = __m0 * __m1; \
295 } while (0)
296 #ifndef LONGLONG_STANDALONE
297 #define udiv_qrnnd(q, r, n1, n0, d) \
298 do { UWtype __di; \
299 __di = __MPN(invert_limb) (d); \
300 udiv_qrnnd_preinv (q, r, n1, n0, d, __di); \
301 } while (0)
302 #endif /* LONGLONG_STANDALONE */
303 #endif /* _CRAYIEEE */
304 #endif /* _CRAY */
305
306 #if defined (__ia64) && W_TYPE_SIZE == 64
307 /* This form encourages gcc (pre-release 3.4 at least) to emit predicated
308 "sub r=r,r" and "sub r=r,r,1", giving a 2 cycle latency. The generic
309 code using "al<bl" arithmetically comes out making an actual 0 or 1 in a
310 register, which takes an extra cycle. */
311 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
312 do { \
313 UWtype __x; \
314 __x = (al) - (bl); \
315 if ((al) < (bl)) \
316 (sh) = (ah) - (bh) - 1; \
317 else \
318 (sh) = (ah) - (bh); \
319 (sl) = __x; \
320 } while (0)
321 #if defined (__GNUC__) && ! defined (__INTEL_COMPILER)
322 /* Do both product parts in assembly, since that gives better code with
323 all gcc versions. Some callers will just use the upper part, and in
324 that situation we waste an instruction, but not any cycles. */
325 #define umul_ppmm(ph, pl, m0, m1) \
326 __asm__ ("xma.hu %0 = %2, %3, f0\n\txma.l %1 = %2, %3, f0" \
327 : "=&f" (ph), "=f" (pl) \
328 : "f" (m0), "f" (m1))
329 #define count_leading_zeros(count, x) \
330 do { \
331 UWtype _x = (x), _y, _a, _c; \
332 __asm__ ("mux1 %0 = %1, @rev" : "=r" (_y) : "r" (_x)); \
333 __asm__ ("czx1.l %0 = %1" : "=r" (_a) : "r" (-_y | _y)); \
334 _c = (_a - 1) << 3; \
335 _x >>= _c; \
336 if (_x >= 1 << 4) \
337 _x >>= 4, _c += 4; \
338 if (_x >= 1 << 2) \
339 _x >>= 2, _c += 2; \
340 _c += _x >> 1; \
341 (count) = W_TYPE_SIZE - 1 - _c; \
342 } while (0)
343 /* similar to what gcc does for __builtin_ffs, but 0 based rather than 1
344 based, and we don't need a special case for x==0 here */
345 #define count_trailing_zeros(count, x) \
346 do { \
347 UWtype __ctz_x = (x); \
348 __asm__ ("popcnt %0 = %1" \
349 : "=r" (count) \
350 : "r" ((__ctz_x-1) & ~__ctz_x)); \
351 } while (0)
352 #endif
353 #if defined (__INTEL_COMPILER)
354 #include <ia64intrin.h>
355 #define umul_ppmm(ph, pl, m0, m1) \
356 do { \
357 UWtype __m0 = (m0), __m1 = (m1); \
358 ph = _m64_xmahu (__m0, __m1, 0); \
359 pl = __m0 * __m1; \
360 } while (0)
361 #endif
362 #ifndef LONGLONG_STANDALONE
363 #define udiv_qrnnd(q, r, n1, n0, d) \
364 do { UWtype __di; \
365 __di = __MPN(invert_limb) (d); \
366 udiv_qrnnd_preinv (q, r, n1, n0, d, __di); \
367 } while (0)
368 #define UDIV_PREINV_ALWAYS 1
369 #define UDIV_NEEDS_NORMALIZATION 1
370 #endif
371 #endif
372
373
374 #if defined (__GNUC__)
375
376 /* We sometimes need to clobber "cc" with gcc2, but that would not be
377 understood by gcc1. Use cpp to avoid major code duplication. */
378 #if __GNUC__ < 2
379 #define __CLOBBER_CC
380 #define __AND_CLOBBER_CC
381 #else /* __GNUC__ >= 2 */
382 #define __CLOBBER_CC : "cc"
383 #define __AND_CLOBBER_CC , "cc"
384 #endif /* __GNUC__ < 2 */
385
386 #if (defined (__a29k__) || defined (_AM29K)) && W_TYPE_SIZE == 32
387 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
388 __asm__ ("add %1,%4,%5\n\taddc %0,%2,%3" \
389 : "=r" (sh), "=&r" (sl) \
390 : "r" (ah), "rI" (bh), "%r" (al), "rI" (bl))
391 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
392 __asm__ ("sub %1,%4,%5\n\tsubc %0,%2,%3" \
393 : "=r" (sh), "=&r" (sl) \
394 : "r" (ah), "rI" (bh), "r" (al), "rI" (bl))
395 #define umul_ppmm(xh, xl, m0, m1) \
396 do { \
397 USItype __m0 = (m0), __m1 = (m1); \
398 __asm__ ("multiplu %0,%1,%2" \
399 : "=r" (xl) \
400 : "r" (__m0), "r" (__m1)); \
401 __asm__ ("multmu %0,%1,%2" \
402 : "=r" (xh) \
403 : "r" (__m0), "r" (__m1)); \
404 } while (0)
405 #define udiv_qrnnd(q, r, n1, n0, d) \
406 __asm__ ("dividu %0,%3,%4" \
407 : "=r" (q), "=q" (r) \
408 : "1" (n1), "r" (n0), "r" (d))
409 #define count_leading_zeros(count, x) \
410 __asm__ ("clz %0,%1" \
411 : "=r" (count) \
412 : "r" (x))
413 #define COUNT_LEADING_ZEROS_0 32
414 #endif /* __a29k__ */
415
416 #if defined (__arc__)
417 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
418 __asm__ ("add.f\t%1, %4, %5\n\tadc\t%0, %2, %3" \
419 : "=r" (sh), \
420 "=&r" (sl) \
421 : "r" ((USItype) (ah)), \
422 "rICal" ((USItype) (bh)), \
423 "%r" ((USItype) (al)), \
424 "rICal" ((USItype) (bl)))
425 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
426 __asm__ ("sub.f\t%1, %4, %5\n\tsbc\t%0, %2, %3" \
427 : "=r" (sh), \
428 "=&r" (sl) \
429 : "r" ((USItype) (ah)), \
430 "rICal" ((USItype) (bh)), \
431 "r" ((USItype) (al)), \
432 "rICal" ((USItype) (bl)))
433 #endif
434
435 #if defined (__arm__) && (defined (__thumb2__) || !defined (__thumb__)) \
436 && W_TYPE_SIZE == 32
437 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
438 do { \
439 if (__builtin_constant_p (bl) && -(USItype)(bl) < (USItype)(bl)) \
440 __asm__ ("subs\t%1, %4, %5\n\tadc\t%0, %2, %3" \
441 : "=r" (sh), "=&r" (sl) \
442 : "r" (ah), "rI" (bh), \
443 "%r" (al), "rI" (-(USItype)(bl)) __CLOBBER_CC); \
444 else \
445 __asm__ ("adds\t%1, %4, %5\n\tadc\t%0, %2, %3" \
446 : "=r" (sh), "=&r" (sl) \
447 : "r" (ah), "rI" (bh), "%r" (al), "rI" (bl) __CLOBBER_CC); \
448 } while (0)
449 /* FIXME: Extend the immediate range for the low word by using both ADDS and
450 SUBS, since they set carry in the same way. We need separate definitions
451 for thumb and non-thumb since thumb lacks RSC. */
452 #if defined (__thumb__)
453 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
454 do { \
455 if (__builtin_constant_p (ah) && __builtin_constant_p (bh) \
456 && (ah) == (bh)) \
457 __asm__ ("subs\t%1, %2, %3\n\tsbc\t%0, %0, %0" \
458 : "=r" (sh), "=r" (sl) \
459 : "r" (al), "rI" (bl) __CLOBBER_CC); \
460 else if (__builtin_constant_p (al)) \
461 __asm__ ("rsbs\t%1, %5, %4\n\tsbc\t%0, %2, %3" \
462 : "=r" (sh), "=&r" (sl) \
463 : "r" (ah), "rI" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \
464 else \
465 __asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3" \
466 : "=r" (sh), "=&r" (sl) \
467 : "r" (ah), "rI" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
468 } while (0)
469 #else
470 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
471 do { \
472 if (__builtin_constant_p (ah) && __builtin_constant_p (bh) \
473 && (ah) == (bh)) \
474 __asm__ ("subs\t%1, %2, %3\n\tsbc\t%0, %0, %0" \
475 : "=r" (sh), "=r" (sl) \
476 : "r" (al), "rI" (bl) __CLOBBER_CC); \
477 else if (__builtin_constant_p (al)) \
478 { \
479 if (__builtin_constant_p (ah)) \
480 __asm__ ("rsbs\t%1, %5, %4\n\trsc\t%0, %3, %2" \
481 : "=r" (sh), "=&r" (sl) \
482 : "rI" (ah), "r" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \
483 else \
484 __asm__ ("rsbs\t%1, %5, %4\n\tsbc\t%0, %2, %3" \
485 : "=r" (sh), "=&r" (sl) \
486 : "r" (ah), "rI" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \
487 } \
488 else if (__builtin_constant_p (ah)) \
489 { \
490 if (__builtin_constant_p (bl)) \
491 __asm__ ("subs\t%1, %4, %5\n\trsc\t%0, %3, %2" \
492 : "=r" (sh), "=&r" (sl) \
493 : "rI" (ah), "r" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
494 else \
495 __asm__ ("rsbs\t%1, %5, %4\n\trsc\t%0, %3, %2" \
496 : "=r" (sh), "=&r" (sl) \
497 : "rI" (ah), "r" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \
498 } \
499 else \
500 __asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3" \
501 : "=r" (sh), "=&r" (sl) \
502 : "r" (ah), "rI" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
503 } while (0)
504 #endif
505 #if defined (__ARM_ARCH_2__) || defined (__ARM_ARCH_2A__) \
506 || defined (__ARM_ARCH_3__)
507 #define umul_ppmm(xh, xl, a, b) \
508 do { \
509 register USItype __t0, __t1, __t2; \
510 __asm__ ("%@ Inlined umul_ppmm\n" \
511 " mov %2, %5, lsr #16\n" \
512 " mov %0, %6, lsr #16\n" \
513 " bic %3, %5, %2, lsl #16\n" \
514 " bic %4, %6, %0, lsl #16\n" \
515 " mul %1, %3, %4\n" \
516 " mul %4, %2, %4\n" \
517 " mul %3, %0, %3\n" \
518 " mul %0, %2, %0\n" \
519 " adds %3, %4, %3\n" \
520 " addcs %0, %0, #65536\n" \
521 " adds %1, %1, %3, lsl #16\n" \
522 " adc %0, %0, %3, lsr #16" \
523 : "=&r" ((USItype) (xh)), "=r" ((USItype) (xl)), \
524 "=&r" (__t0), "=&r" (__t1), "=r" (__t2) \
525 : "r" ((USItype) (a)), "r" ((USItype) (b)) __CLOBBER_CC); \
526 } while (0)
527 #ifndef LONGLONG_STANDALONE
528 #define udiv_qrnnd(q, r, n1, n0, d) \
529 do { UWtype __r; \
530 (q) = __MPN(udiv_qrnnd) (&__r, (n1), (n0), (d)); \
531 (r) = __r; \
532 } while (0)
533 extern UWtype __MPN(udiv_qrnnd) (UWtype *, UWtype, UWtype, UWtype);
534 #endif /* LONGLONG_STANDALONE */
535 #else /* ARMv4 or newer */
536 #define umul_ppmm(xh, xl, a, b) \
537 __asm__ ("umull %0,%1,%2,%3" : "=&r" (xl), "=&r" (xh) : "r" (a), "r" (b))
538 #define smul_ppmm(xh, xl, a, b) \
539 __asm__ ("smull %0,%1,%2,%3" : "=&r" (xl), "=&r" (xh) : "r" (a), "r" (b))
540 #ifndef LONGLONG_STANDALONE
541 #define udiv_qrnnd(q, r, n1, n0, d) \
542 do { UWtype __di; \
543 __di = __MPN(invert_limb) (d); \
544 udiv_qrnnd_preinv (q, r, n1, n0, d, __di); \
545 } while (0)
546 #define UDIV_PREINV_ALWAYS 1
547 #define UDIV_NEEDS_NORMALIZATION 1
548 #endif /* LONGLONG_STANDALONE */
549 #endif /* defined(__ARM_ARCH_2__) ... */
550 #define count_leading_zeros(count, x) count_leading_zeros_gcc_clz(count, x)
551 #define count_trailing_zeros(count, x) count_trailing_zeros_gcc_ctz(count, x)
552 #endif /* __arm__ */
553
554 #if defined (__aarch64__) && W_TYPE_SIZE == 64
555 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
556 do { \
557 if (__builtin_constant_p (bl) && ~(UDItype)(bl) <= (UDItype)(bl)) \
558 __asm__ ("subs\t%1, %x4, %5\n\tadc\t%0, %x2, %x3" \
559 : "=r" (sh), "=&r" (sl) \
560 : "rZ" ((UDItype)(ah)), "rZ" ((UDItype)(bh)), \
561 "%r" ((UDItype)(al)), "rI" (-(UDItype)(bl)) __CLOBBER_CC);\
562 else \
563 __asm__ ("adds\t%1, %x4, %5\n\tadc\t%0, %x2, %x3" \
564 : "=r" (sh), "=&r" (sl) \
565 : "rZ" ((UDItype)(ah)), "rZ" ((UDItype)(bh)), \
566 "%r" ((UDItype)(al)), "rI" ((UDItype)(bl)) __CLOBBER_CC);\
567 } while (0)
568 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
569 do { \
570 if (__builtin_constant_p (bl) && ~(UDItype)(bl) <= (UDItype)(bl)) \
571 __asm__ ("adds\t%1, %x4, %5\n\tsbc\t%0, %x2, %x3" \
572 : "=r,r" (sh), "=&r,&r" (sl) \
573 : "rZ,rZ" ((UDItype)(ah)), "rZ,rZ" ((UDItype)(bh)), \
574 "r,Z" ((UDItype)(al)), "rI,r" (-(UDItype)(bl)) __CLOBBER_CC);\
575 else \
576 __asm__ ("subs\t%1, %x4, %5\n\tsbc\t%0, %x2, %x3" \
577 : "=r,r" (sh), "=&r,&r" (sl) \
578 : "rZ,rZ" ((UDItype)(ah)), "rZ,rZ" ((UDItype)(bh)), \
579 "r,Z" ((UDItype)(al)), "rI,r" ((UDItype)(bl)) __CLOBBER_CC);\
  } while (0)
581 #if __GMP_GNUC_PREREQ (4,9)
582 #define umul_ppmm(w1, w0, u, v) \
583 do { \
584 typedef unsigned int __ll_UTItype __attribute__((mode(TI))); \
585 __ll_UTItype __ll = (__ll_UTItype)(u) * (v); \
586 w1 = __ll >> 64; \
587 w0 = __ll; \
588 } while (0)
589 #endif
590 #if !defined (umul_ppmm)
591 #define umul_ppmm(ph, pl, m0, m1) \
592 do { \
593 UDItype __m0 = (m0), __m1 = (m1); \
594 __asm__ ("umulh\t%0, %1, %2" : "=r" (ph) : "r" (__m0), "r" (__m1)); \
595 (pl) = __m0 * __m1; \
596 } while (0)
597 #endif
598 #define count_leading_zeros(count, x) count_leading_zeros_gcc_clz(count, x)
599 #define count_trailing_zeros(count, x) count_trailing_zeros_gcc_ctz(count, x)
600 #endif /* __aarch64__ */
601
602 #if defined (__clipper__) && W_TYPE_SIZE == 32
603 #define umul_ppmm(w1, w0, u, v) \
604 ({union {UDItype __ll; \
605 struct {USItype __l, __h;} __i; \
606 } __x; \
607 __asm__ ("mulwux %2,%0" \
608 : "=r" (__x.__ll) \
609 : "%0" ((USItype)(u)), "r" ((USItype)(v))); \
610 (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
611 #define smul_ppmm(w1, w0, u, v) \
612 ({union {DItype __ll; \
613 struct {SItype __l, __h;} __i; \
614 } __x; \
615 __asm__ ("mulwx %2,%0" \
616 : "=r" (__x.__ll) \
617 : "%0" ((SItype)(u)), "r" ((SItype)(v))); \
618 (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
619 #define __umulsidi3(u, v) \
620 ({UDItype __w; \
621 __asm__ ("mulwux %2,%0" \
622 : "=r" (__w) : "%0" ((USItype)(u)), "r" ((USItype)(v))); \
623 __w; })
624 #endif /* __clipper__ */
625
626 /* Fujitsu vector computers. */
627 #if defined (__uxp__) && W_TYPE_SIZE == 32
628 #define umul_ppmm(ph, pl, u, v) \
629 do { \
630 union {UDItype __ll; \
631 struct {USItype __h, __l;} __i; \
632 } __x; \
633 __asm__ ("mult.lu %1,%2,%0" : "=r" (__x.__ll) : "%r" (u), "rK" (v));\
634 (ph) = __x.__i.__h; \
635 (pl) = __x.__i.__l; \
636 } while (0)
637 #define smul_ppmm(ph, pl, u, v) \
638 do { \
639 union {UDItype __ll; \
640 struct {USItype __h, __l;} __i; \
641 } __x; \
642 __asm__ ("mult.l %1,%2,%0" : "=r" (__x.__ll) : "%r" (u), "rK" (v)); \
643 (ph) = __x.__i.__h; \
644 (pl) = __x.__i.__l; \
645 } while (0)
646 #endif
647
648 #if defined (__gmicro__) && W_TYPE_SIZE == 32
649 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
650 __asm__ ("add.w %5,%1\n\taddx %3,%0" \
651 : "=g" (sh), "=&g" (sl) \
652 : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \
653 "%1" ((USItype)(al)), "g" ((USItype)(bl)))
654 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
655 __asm__ ("sub.w %5,%1\n\tsubx %3,%0" \
656 : "=g" (sh), "=&g" (sl) \
657 : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \
658 "1" ((USItype)(al)), "g" ((USItype)(bl)))
659 #define umul_ppmm(ph, pl, m0, m1) \
660 __asm__ ("mulx %3,%0,%1" \
661 : "=g" (ph), "=r" (pl) \
662 : "%0" ((USItype)(m0)), "g" ((USItype)(m1)))
663 #define udiv_qrnnd(q, r, nh, nl, d) \
664 __asm__ ("divx %4,%0,%1" \
665 : "=g" (q), "=r" (r) \
666 : "1" ((USItype)(nh)), "0" ((USItype)(nl)), "g" ((USItype)(d)))
667 #define count_leading_zeros(count, x) \
668 __asm__ ("bsch/1 %1,%0" \
669 : "=g" (count) : "g" ((USItype)(x)), "0" ((USItype)0))
670 #endif
671
672 #if defined (__hppa) && W_TYPE_SIZE == 32
673 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
674 __asm__ ("add%I5 %5,%r4,%1\n\taddc %r2,%r3,%0" \
675 : "=r" (sh), "=&r" (sl) \
676 : "rM" (ah), "rM" (bh), "%rM" (al), "rI" (bl))
677 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
678 __asm__ ("sub%I4 %4,%r5,%1\n\tsubb %r2,%r3,%0" \
679 : "=r" (sh), "=&r" (sl) \
680 : "rM" (ah), "rM" (bh), "rI" (al), "rM" (bl))
681 #if defined (_PA_RISC1_1)
682 #define umul_ppmm(wh, wl, u, v) \
683 do { \
684 union {UDItype __ll; \
685 struct {USItype __h, __l;} __i; \
686 } __x; \
687 __asm__ ("xmpyu %1,%2,%0" : "=*f" (__x.__ll) : "*f" (u), "*f" (v)); \
688 (wh) = __x.__i.__h; \
689 (wl) = __x.__i.__l; \
690 } while (0)
691 #endif
692 #define count_leading_zeros(count, x) \
693 do { \
694 USItype __tmp; \
695 __asm__ ( \
696 "ldi 1,%0\n" \
697 " extru,= %1,15,16,%%r0 ; Bits 31..16 zero?\n" \
698 " extru,tr %1,15,16,%1 ; No. Shift down, skip add.\n" \
699 " ldo 16(%0),%0 ; Yes. Perform add.\n" \
700 " extru,= %1,23,8,%%r0 ; Bits 15..8 zero?\n" \
701 " extru,tr %1,23,8,%1 ; No. Shift down, skip add.\n" \
702 " ldo 8(%0),%0 ; Yes. Perform add.\n" \
703 " extru,= %1,27,4,%%r0 ; Bits 7..4 zero?\n" \
704 " extru,tr %1,27,4,%1 ; No. Shift down, skip add.\n" \
705 " ldo 4(%0),%0 ; Yes. Perform add.\n" \
706 " extru,= %1,29,2,%%r0 ; Bits 3..2 zero?\n" \
707 " extru,tr %1,29,2,%1 ; No. Shift down, skip add.\n" \
708 " ldo 2(%0),%0 ; Yes. Perform add.\n" \
709 " extru %1,30,1,%1 ; Extract bit 1.\n" \
710 " sub %0,%1,%0 ; Subtract it.\n" \
711 : "=r" (count), "=r" (__tmp) : "1" (x)); \
712 } while (0)
713 #endif /* hppa */
714
715 /* These macros are for ABI=2.0w. In ABI=2.0n they can't be used, since GCC
   (3.2) puts a long long into two adjacent 32-bit registers.  Presumably this
717 is just a case of no direct support for 2.0n but treating it like 1.0. */
718 #if defined (__hppa) && W_TYPE_SIZE == 64 && ! defined (_LONG_LONG_LIMB)
719 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
720 __asm__ ("add%I5 %5,%r4,%1\n\tadd,dc %r2,%r3,%0" \
721 : "=r" (sh), "=&r" (sl) \
722 : "rM" (ah), "rM" (bh), "%rM" (al), "rI" (bl))
723 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
724 __asm__ ("sub%I4 %4,%r5,%1\n\tsub,db %r2,%r3,%0" \
725 : "=r" (sh), "=&r" (sl) \
726 : "rM" (ah), "rM" (bh), "rI" (al), "rM" (bl))
727 #endif /* hppa */
728
729 #if (defined (__i370__) || defined (__s390__) || defined (__mvs__)) && W_TYPE_SIZE == 32
730 #if defined (__zarch__) || defined (HAVE_HOST_CPU_s390_zarch)
731 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
732 do { \
733 /* if (__builtin_constant_p (bl)) \
734 __asm__ ("alfi\t%1,%o5\n\talcr\t%0,%3" \
735 : "=r" (sh), "=&r" (sl) \
736 : "0" (ah), "r" (bh), "%1" (al), "n" (bl) __CLOBBER_CC);\
737 else \
738 */ __asm__ ("alr\t%1,%5\n\talcr\t%0,%3" \
739 : "=r" (sh), "=&r" (sl) \
740 : "0" (ah), "r" (bh), "%1" (al), "r" (bl)__CLOBBER_CC); \
741 } while (0)
742 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
743 do { \
744 /* if (__builtin_constant_p (bl)) \
745 __asm__ ("slfi\t%1,%o5\n\tslbr\t%0,%3" \
746 : "=r" (sh), "=&r" (sl) \
747 : "0" (ah), "r" (bh), "1" (al), "n" (bl) __CLOBBER_CC); \
748 else \
749 */ __asm__ ("slr\t%1,%5\n\tslbr\t%0,%3" \
750 : "=r" (sh), "=&r" (sl) \
751 : "0" (ah), "r" (bh), "1" (al), "r" (bl) __CLOBBER_CC); \
752 } while (0)
753 #if __GMP_GNUC_PREREQ (4,5)
754 #define umul_ppmm(xh, xl, m0, m1) \
755 do { \
756 union {UDItype __ll; \
757 struct {USItype __h, __l;} __i; \
758 } __x; \
759 __x.__ll = (UDItype) (m0) * (UDItype) (m1); \
760 (xh) = __x.__i.__h; (xl) = __x.__i.__l; \
761 } while (0)
762 #else
763 #if 0
764 /* FIXME: this fails if gcc knows about the 64-bit registers. Use only
765 with a new enough processor pretending we have 32-bit registers. */
766 #define umul_ppmm(xh, xl, m0, m1) \
767 do { \
768 union {UDItype __ll; \
769 struct {USItype __h, __l;} __i; \
770 } __x; \
771 __asm__ ("mlr\t%0,%2" \
772 : "=r" (__x.__ll) \
773 : "%0" (m0), "r" (m1)); \
774 (xh) = __x.__i.__h; (xl) = __x.__i.__l; \
775 } while (0)
776 #else
777 #define umul_ppmm(xh, xl, m0, m1) \
778 do { \
779 /* When we have 64-bit regs and gcc is aware of that, we cannot simply use
780 DImode for the product, since that would be allocated to a single 64-bit
781 register, whereas mlr uses the low 32-bits of an even-odd register pair.
782 */ \
783 register USItype __r0 __asm__ ("0"); \
784 register USItype __r1 __asm__ ("1") = (m0); \
785 __asm__ ("mlr\t%0,%3" \
786 : "=r" (__r0), "=r" (__r1) \
787 : "r" (__r1), "r" (m1)); \
788 (xh) = __r0; (xl) = __r1; \
789 } while (0)
790 #endif /* if 0 */
791 #endif
792 #if 0
793 /* FIXME: this fails if gcc knows about the 64-bit registers. Use only
794 with a new enough processor pretending we have 32-bit registers. */
795 #define udiv_qrnnd(q, r, n1, n0, d) \
796 do { \
797 union {UDItype __ll; \
798 struct {USItype __h, __l;} __i; \
799 } __x; \
800 __x.__i.__h = n1; __x.__i.__l = n0; \
801 __asm__ ("dlr\t%0,%2" \
802 : "=r" (__x.__ll) \
803 : "0" (__x.__ll), "r" (d)); \
804 (q) = __x.__i.__l; (r) = __x.__i.__h; \
805 } while (0)
806 #else
807 #define udiv_qrnnd(q, r, n1, n0, d) \
808 do { \
809 register USItype __r0 __asm__ ("0") = (n1); \
810 register USItype __r1 __asm__ ("1") = (n0); \
811 __asm__ ("dlr\t%0,%4" \
812 : "=r" (__r0), "=r" (__r1) \
813 : "r" (__r0), "r" (__r1), "r" (d)); \
814 (q) = __r1; (r) = __r0; \
815 } while (0)
816 #endif /* if 0 */
817 #else /* if __zarch__ */
818 /* FIXME: this fails if gcc knows about the 64-bit registers. */
819 #define smul_ppmm(xh, xl, m0, m1) \
820 do { \
821 union {DItype __ll; \
822 struct {USItype __h, __l;} __i; \
823 } __x; \
824 __asm__ ("mr\t%0,%2" \
825 : "=r" (__x.__ll) \
826 : "%0" (m0), "r" (m1)); \
827 (xh) = __x.__i.__h; (xl) = __x.__i.__l; \
828 } while (0)
829 /* FIXME: this fails if gcc knows about the 64-bit registers. */
830 #define sdiv_qrnnd(q, r, n1, n0, d) \
831 do { \
832 union {DItype __ll; \
833 struct {USItype __h, __l;} __i; \
834 } __x; \
835 __x.__i.__h = n1; __x.__i.__l = n0; \
836 __asm__ ("dr\t%0,%2" \
837 : "=r" (__x.__ll) \
838 : "0" (__x.__ll), "r" (d)); \
839 (q) = __x.__i.__l; (r) = __x.__i.__h; \
840 } while (0)
841 #endif /* if __zarch__ */
842 #endif
843
844 #if defined (__s390x__) && W_TYPE_SIZE == 64
845 /* We need to cast operands with register constraints, otherwise their types
846 will be assumed to be SImode by gcc. For these machines, such operations
847 will insert a value into the low 32 bits, and leave the high 32 bits with
848 garbage. */
849 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
850 do { \
851 __asm__ ("algr\t%1,%5\n\talcgr\t%0,%3" \
852 : "=r" (sh), "=&r" (sl) \
853 : "0" ((UDItype)(ah)), "r" ((UDItype)(bh)), \
854 "%1" ((UDItype)(al)), "r" ((UDItype)(bl)) __CLOBBER_CC); \
855 } while (0)
856 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
857 do { \
858 __asm__ ("slgr\t%1,%5\n\tslbgr\t%0,%3" \
859 : "=r" (sh), "=&r" (sl) \
860 : "0" ((UDItype)(ah)), "r" ((UDItype)(bh)), \
861 "1" ((UDItype)(al)), "r" ((UDItype)(bl)) __CLOBBER_CC); \
862 } while (0)
863 #if !defined (__clang__)
864 #define umul_ppmm(xh, xl, m0, m1) \
865 do { \
866 union {unsigned int __attribute__ ((mode(TI))) __ll; \
867 struct {UDItype __h, __l;} __i; \
868 } __x; \
869 __asm__ ("mlgr\t%0,%2" \
870 : "=r" (__x.__ll) \
871 : "%0" ((UDItype)(m0)), "r" ((UDItype)(m1))); \
872 (xh) = __x.__i.__h; (xl) = __x.__i.__l; \
873 } while (0)
874 #define udiv_qrnnd(q, r, n1, n0, d) \
875 do { \
876 union {unsigned int __attribute__ ((mode(TI))) __ll; \
877 struct {UDItype __h, __l;} __i; \
878 } __x; \
879 __x.__i.__h = n1; __x.__i.__l = n0; \
880 __asm__ ("dlgr\t%0,%2" \
881 : "=r" (__x.__ll) \
882 : "0" (__x.__ll), "r" ((UDItype)(d))); \
883 (q) = __x.__i.__l; (r) = __x.__i.__h; \
884 } while (0)
885 #endif
886 #if 0 /* FIXME: Enable for z10 (?) */
887 #define count_leading_zeros(cnt, x) \
888 do { \
889 union {unsigned int __attribute__ ((mode(TI))) __ll; \
890 struct {UDItype __h, __l;} __i; \
891 } __clr_cnt; \
892 __asm__ ("flogr\t%0,%1" \
893 : "=r" (__clr_cnt.__ll) \
894 : "r" (x) __CLOBBER_CC); \
895 (cnt) = __clr_cnt.__i.__h; \
896 } while (0)
897 #endif
898 #endif
899
900 /* On x86 and x86_64, every asm implicitly clobbers "flags" and "fpsr",
901 so we don't need __CLOBBER_CC. */
902 #if (defined (__i386__) || defined (__i486__)) && W_TYPE_SIZE == 32
903 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
904 __asm__ ("addl %5,%k1\n\tadcl %3,%k0" \
905 : "=r" (sh), "=&r" (sl) \
906 : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \
907 "%1" ((USItype)(al)), "g" ((USItype)(bl)))
908 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
909 __asm__ ("subl %5,%k1\n\tsbbl %3,%k0" \
910 : "=r" (sh), "=&r" (sl) \
911 : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \
912 "1" ((USItype)(al)), "g" ((USItype)(bl)))
913 #define umul_ppmm(w1, w0, u, v) \
914 __asm__ ("mull %3" \
915 : "=a" (w0), "=d" (w1) \
916 : "%0" ((USItype)(u)), "rm" ((USItype)(v)))
917 #define udiv_qrnnd(q, r, n1, n0, dx) /* d renamed to dx avoiding "=d" */\
918 __asm__ ("divl %4" /* stringification in K&R C */ \
919 : "=a" (q), "=d" (r) \
920 : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "rm" ((USItype)(dx)))
921
922 #if HAVE_HOST_CPU_i586 || HAVE_HOST_CPU_pentium || HAVE_HOST_CPU_pentiummmx
/* Pentium bsrl takes between 10 and 72 cycles depending on where the most
   significant 1 bit is, hence the use of the following alternatives.  bsfl
   is slow too, between 18 and 42 cycles depending on where the least
   significant 1 bit is, so let the generic count_trailing_zeros below make
   use of the count_leading_zeros here too.  */
928
929 #if HAVE_HOST_CPU_pentiummmx && ! defined (LONGLONG_STANDALONE)
930 /* The following should be a fixed 14 or 15 cycles, but possibly plus an L1
931 cache miss reading from __clz_tab. For P55 it's favoured over the float
932 below so as to avoid mixing MMX and x87, since the penalty for switching
933 between the two is about 100 cycles.
934
935 The asm block sets __shift to -3 if the high 24 bits are clear, -2 for
936 16, -1 for 8, or 0 otherwise. This could be written equivalently as
937 follows, but as of gcc 2.95.2 it results in conditional jumps.
938
939 __shift = -(__n < 0x1000000);
940 __shift -= (__n < 0x10000);
941 __shift -= (__n < 0x100);
942
943 The middle two sbbl and cmpl's pair, and with luck something gcc
944 generates might pair with the first cmpl and the last sbbl. The "32+1"
945 constant could be folded into __clz_tab[], but it doesn't seem worth
946 making a different table just for that. */
947
948 #define count_leading_zeros(c,n) \
949 do { \
950 USItype __n = (n); \
951 USItype __shift; \
952 __asm__ ("cmpl $0x1000000, %1\n" \
953 "sbbl %0, %0\n" \
954 "cmpl $0x10000, %1\n" \
955 "sbbl $0, %0\n" \
956 "cmpl $0x100, %1\n" \
957 "sbbl $0, %0\n" \
958 : "=&r" (__shift) : "r" (__n)); \
959 __shift = __shift*8 + 24 + 1; \
960 (c) = 32 + 1 - __shift - __clz_tab[__n >> __shift]; \
961 } while (0)
962 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
963 #define COUNT_LEADING_ZEROS_0 31 /* n==0 indistinguishable from n==1 */
964
965 #else /* ! pentiummmx || LONGLONG_STANDALONE */
966 /* The following should be a fixed 14 cycles or so. Some scheduling
967 opportunities should be available between the float load/store too. This
968 sort of code is used in gcc 3 for __builtin_ffs (with "n&-n") and is
969 apparently suggested by the Intel optimizing manual (don't know exactly
   where).  gcc 2.95 or later is best for this, since it keeps the "double"
   correctly aligned on the stack.  */
972 #define count_leading_zeros(c,n) \
973 do { \
974 union { \
975 double d; \
976 unsigned a[2]; \
977 } __u; \
978 __u.d = (UWtype) (n); \
979 (c) = 0x3FF + 31 - (__u.a[1] >> 20); \
980 } while (0)
981 #define COUNT_LEADING_ZEROS_0 (0x3FF + 31)
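/* The conversion trick above relies on IEEE double format and x86's
   little-endian layout: for 1 <= n < 2^32 the biased exponent of (double) n
   is 0x3FF + floor(log2(n)), held in bits 20..30 of the high word __u.a[1],
   so 0x3FF + 31 - (__u.a[1] >> 20) is exactly the count of leading zero bits
   in n.  */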
982 #endif /* pentiummx */
983
984 #else /* ! pentium */
985
986 #if __GMP_GNUC_PREREQ (3,4) /* using bsrl */
987 #define count_leading_zeros(count,x) count_leading_zeros_gcc_clz(count,x)
988 #endif /* gcc clz */
989
990 /* On P6, gcc prior to 3.0 generates a partial register stall for
991 __cbtmp^31, due to using "xorb $31" instead of "xorl $31", the former
992 being 1 code byte smaller. "31-__cbtmp" is a workaround, probably at the
993 cost of one extra instruction. Do this for "i386" too, since that means
994 generic x86. */
995 #if ! defined (count_leading_zeros) && __GNUC__ < 3 \
996 && (HAVE_HOST_CPU_i386 \
997 || HAVE_HOST_CPU_i686 \
998 || HAVE_HOST_CPU_pentiumpro \
999 || HAVE_HOST_CPU_pentium2 \
1000 || HAVE_HOST_CPU_pentium3)
1001 #define count_leading_zeros(count, x) \
1002 do { \
1003 USItype __cbtmp; \
1004 ASSERT ((x) != 0); \
1005 __asm__ ("bsrl %1,%0" : "=r" (__cbtmp) : "rm" ((USItype)(x))); \
1006 (count) = 31 - __cbtmp; \
1007 } while (0)
1008 #endif /* gcc<3 asm bsrl */
1009
1010 #ifndef count_leading_zeros
1011 #define count_leading_zeros(count, x) \
1012 do { \
1013 USItype __cbtmp; \
1014 ASSERT ((x) != 0); \
1015 __asm__ ("bsrl %1,%0" : "=r" (__cbtmp) : "rm" ((USItype)(x))); \
1016 (count) = __cbtmp ^ 31; \
1017 } while (0)
1018 #endif /* asm bsrl */
1019
1020 #if __GMP_GNUC_PREREQ (3,4) /* using bsfl */
1021 #define count_trailing_zeros(count,x) count_trailing_zeros_gcc_ctz(count,x)
1022 #endif /* gcc ctz */
1023
1024 #ifndef count_trailing_zeros
1025 #define count_trailing_zeros(count, x) \
1026 do { \
1027 ASSERT ((x) != 0); \
1028 __asm__ ("bsfl %1,%k0" : "=r" (count) : "rm" ((USItype)(x))); \
1029 } while (0)
1030 #endif /* asm bsfl */
1031
1032 #endif /* ! pentium */
1033
1034 #endif /* 80x86 */
1035
1036 #if defined (__amd64__) && W_TYPE_SIZE == 64
1037 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1038 __asm__ ("addq %5,%q1\n\tadcq %3,%q0" \
1039 : "=r" (sh), "=&r" (sl) \
1040 : "0" ((UDItype)(ah)), "rme" ((UDItype)(bh)), \
1041 "%1" ((UDItype)(al)), "rme" ((UDItype)(bl)))
1042 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1043 __asm__ ("subq %5,%q1\n\tsbbq %3,%q0" \
1044 : "=r" (sh), "=&r" (sl) \
1045 : "0" ((UDItype)(ah)), "rme" ((UDItype)(bh)), \
1046 "1" ((UDItype)(al)), "rme" ((UDItype)(bl)))
1047 #if X86_ASM_MULX \
1048 && (HAVE_HOST_CPU_haswell || HAVE_HOST_CPU_broadwell \
1049 || HAVE_HOST_CPU_skylake || HAVE_HOST_CPU_bd4 || HAVE_HOST_CPU_zen)
1050 #define umul_ppmm(w1, w0, u, v) \
1051 __asm__ ("mulx\t%3, %q0, %q1" \
1052 : "=r" (w0), "=r" (w1) \
1053 : "%d" ((UDItype)(u)), "rm" ((UDItype)(v)))
1054 #else
1055 #define umul_ppmm(w1, w0, u, v) \
1056 __asm__ ("mulq\t%3" \
1057 : "=a" (w0), "=d" (w1) \
1058 : "%0" ((UDItype)(u)), "rm" ((UDItype)(v)))
1059 #endif
1060 #define udiv_qrnnd(q, r, n1, n0, dx) /* d renamed to dx avoiding "=d" */\
1061 __asm__ ("divq %4" /* stringification in K&R C */ \
1062 : "=a" (q), "=d" (r) \
1063 : "0" ((UDItype)(n0)), "1" ((UDItype)(n1)), "rm" ((UDItype)(dx)))
1064
1065 #if HAVE_HOST_CPU_haswell || HAVE_HOST_CPU_broadwell || HAVE_HOST_CPU_skylake \
1066 || HAVE_HOST_CPU_k10 || HAVE_HOST_CPU_bd1 || HAVE_HOST_CPU_bd2 \
1067 || HAVE_HOST_CPU_bd3 || HAVE_HOST_CPU_bd4 || HAVE_HOST_CPU_zen \
1068 || HAVE_HOST_CPU_bobcat || HAVE_HOST_CPU_jaguar
1069 #define count_leading_zeros(count, x) \
1070 do { \
1071 /* This is lzcnt, spelled for older assemblers. Destination and */ \
    /* source must be 64-bit registers, hence the cast and %q.      */ \
1073 __asm__ ("rep;bsr\t%1, %q0" : "=r" (count) : "rm" ((UDItype)(x))); \
1074 } while (0)
1075 #define COUNT_LEADING_ZEROS_0 64
1076 #else
1077 #define count_leading_zeros(count, x) \
1078 do { \
1079 UDItype __cbtmp; \
1080 ASSERT ((x) != 0); \
1081 __asm__ ("bsr\t%1,%0" : "=r" (__cbtmp) : "rm" ((UDItype)(x))); \
1082 (count) = __cbtmp ^ 63; \
1083 } while (0)
1084 #endif
1085
1086 #if HAVE_HOST_CPU_bd2 || HAVE_HOST_CPU_bd3 || HAVE_HOST_CPU_bd4 \
1087 || HAVE_HOST_CPU_zen || HAVE_HOST_CPU_jaguar
1088 #define count_trailing_zeros(count, x) \
1089 do { \
1090 /* This is tzcnt, spelled for older assemblers. Destination and */ \
    /* source must be 64-bit registers, hence the cast and %q.      */ \
1092 __asm__ ("rep;bsf\t%1, %q0" : "=r" (count) : "rm" ((UDItype)(x))); \
1093 } while (0)
1094 #define COUNT_TRAILING_ZEROS_0 64
1095 #else
1096 #define count_trailing_zeros(count, x) \
1097 do { \
1098 ASSERT ((x) != 0); \
1099 __asm__ ("bsf\t%1, %q0" : "=r" (count) : "rm" ((UDItype)(x))); \
1100 } while (0)
1101 #endif
1102 #endif /* __amd64__ */
1103
1104 #if defined (__i860__) && W_TYPE_SIZE == 32
1105 #define rshift_rhlc(r,h,l,c) \
1106 __asm__ ("shr %3,r0,r0\;shrd %1,%2,%0" \
1107 "=r" (r) : "r" (h), "r" (l), "rn" (c))
1108 #endif /* i860 */
1109
1110 #if defined (__i960__) && W_TYPE_SIZE == 32
1111 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1112 __asm__ ("cmpo 1,0\;addc %5,%4,%1\;addc %3,%2,%0" \
1113 : "=r" (sh), "=&r" (sl) \
1114 : "dI" (ah), "dI" (bh), "%dI" (al), "dI" (bl))
1115 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1116 __asm__ ("cmpo 0,0\;subc %5,%4,%1\;subc %3,%2,%0" \
1117 : "=r" (sh), "=&r" (sl) \
1118 : "dI" (ah), "dI" (bh), "dI" (al), "dI" (bl))
1119 #define umul_ppmm(w1, w0, u, v) \
1120 ({union {UDItype __ll; \
1121 struct {USItype __l, __h;} __i; \
1122 } __x; \
1123 __asm__ ("emul %2,%1,%0" \
1124 : "=d" (__x.__ll) : "%dI" (u), "dI" (v)); \
1125 (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
1126 #define __umulsidi3(u, v) \
1127 ({UDItype __w; \
1128 __asm__ ("emul %2,%1,%0" : "=d" (__w) : "%dI" (u), "dI" (v)); \
1129 __w; })
1130 #define udiv_qrnnd(q, r, nh, nl, d) \
1131 do { \
1132 union {UDItype __ll; \
1133 struct {USItype __l, __h;} __i; \
	  } __nn, __rq;						\
1135 __nn.__i.__h = (nh); __nn.__i.__l = (nl); \
1136 __asm__ ("ediv %d,%n,%0" \
1137 : "=d" (__rq.__ll) : "dI" (__nn.__ll), "dI" (d)); \
1138 (r) = __rq.__i.__l; (q) = __rq.__i.__h; \
1139 } while (0)
1140 #define count_leading_zeros(count, x) \
1141 do { \
1142 USItype __cbtmp; \
1143 __asm__ ("scanbit %1,%0" : "=r" (__cbtmp) : "r" (x)); \
1144 (count) = __cbtmp ^ 31; \
1145 } while (0)
1146 #define COUNT_LEADING_ZEROS_0 (-32) /* sic */
1147 #if defined (__i960mx) /* what is the proper symbol to test??? */
1148 #define rshift_rhlc(r,h,l,c) \
1149 do { \
1150 union {UDItype __ll; \
1151 struct {USItype __l, __h;} __i; \
1152 } __nn; \
1153 __nn.__i.__h = (h); __nn.__i.__l = (l); \
1154 __asm__ ("shre %2,%1,%0" : "=d" (r) : "dI" (__nn.__ll), "dI" (c)); \
  } while (0)
1156 #endif /* i960mx */
1157 #endif /* i960 */
1158
1159
1160 #if defined (__loongarch64) && W_TYPE_SIZE == 64
1161 #define umul_ppmm(w1, w0, u, v) \
1162 do { \
1163 UDItype __u = (u), __v = (v); \
1164 (w0) = __u * __v; \
    (w1) = (unsigned __int128) __u * __v >> 64;			\
1166 } while (0)
1167 #endif
1168
1169
1170 #if (defined (__mc68000__) || defined (__mc68020__) || defined(mc68020) \
1171 || defined (__m68k__) || defined (__mc5200__) || defined (__mc5206e__) \
1172 || defined (__mc5307__)) && W_TYPE_SIZE == 32
1173 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1174 __asm__ ("add%.l %5,%1\n\taddx%.l %3,%0" \
1175 : "=d" (sh), "=&d" (sl) \
1176 : "0" ((USItype)(ah)), "d" ((USItype)(bh)), \
1177 "%1" ((USItype)(al)), "g" ((USItype)(bl)))
1178 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1179 __asm__ ("sub%.l %5,%1\n\tsubx%.l %3,%0" \
1180 : "=d" (sh), "=&d" (sl) \
1181 : "0" ((USItype)(ah)), "d" ((USItype)(bh)), \
1182 "1" ((USItype)(al)), "g" ((USItype)(bl)))
1183 /* The '020, '030, '040 and CPU32 have 32x32->64 and 64/32->32q-32r. */
1184 #if defined (__mc68020__) || defined(mc68020) \
1185 || defined (__mc68030__) || defined (mc68030) \
1186 || defined (__mc68040__) || defined (mc68040) \
1187 || defined (__mcpu32__) || defined (mcpu32) \
1188 || defined (__NeXT__)
1189 #define umul_ppmm(w1, w0, u, v) \
1190 __asm__ ("mulu%.l %3,%1:%0" \
1191 : "=d" (w0), "=d" (w1) \
1192 : "%0" ((USItype)(u)), "dmi" ((USItype)(v)))
1193 #define udiv_qrnnd(q, r, n1, n0, d) \
1194 __asm__ ("divu%.l %4,%1:%0" \
1195 : "=d" (q), "=d" (r) \
1196 : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "dmi" ((USItype)(d)))
1197 #define sdiv_qrnnd(q, r, n1, n0, d) \
1198 __asm__ ("divs%.l %4,%1:%0" \
1199 : "=d" (q), "=d" (r) \
1200 : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "dmi" ((USItype)(d)))
1201 #else /* for other 68k family members use 16x16->32 multiplication */
1202 #define umul_ppmm(xh, xl, a, b) \
1203 do { USItype __umul_tmp1, __umul_tmp2; \
1204 __asm__ ("| Inlined umul_ppmm\n" \
1205 " move%.l %5,%3\n" \
1206 " move%.l %2,%0\n" \
1207 " move%.w %3,%1\n" \
1208 " swap %3\n" \
1209 " swap %0\n" \
1210 " mulu%.w %2,%1\n" \
1211 " mulu%.w %3,%0\n" \
1212 " mulu%.w %2,%3\n" \
1213 " swap %2\n" \
1214 " mulu%.w %5,%2\n" \
1215 " add%.l %3,%2\n" \
1216 " jcc 1f\n" \
1217 " add%.l %#0x10000,%0\n" \
1218 "1: move%.l %2,%3\n" \
1219 " clr%.w %2\n" \
1220 " swap %2\n" \
1221 " swap %3\n" \
1222 " clr%.w %3\n" \
1223 " add%.l %3,%1\n" \
1224 " addx%.l %2,%0\n" \
1225 " | End inlined umul_ppmm" \
1226 : "=&d" (xh), "=&d" (xl), \
1227 "=&d" (__umul_tmp1), "=&d" (__umul_tmp2) \
1228 : "%2" ((USItype)(a)), "d" ((USItype)(b))); \
1229 } while (0)
1230 #endif /* not mc68020 */
1231 /* The '020, '030, '040 and '060 have bitfield insns.
   GCC 3.4 defines __mc68020__ when in CPU32 mode; check for __mcpu32__ to
   exclude bfffo on that chip (bitfield insns are not available there).  */
1234 #if (defined (__mc68020__) || defined (mc68020) \
1235 || defined (__mc68030__) || defined (mc68030) \
1236 || defined (__mc68040__) || defined (mc68040) \
1237 || defined (__mc68060__) || defined (mc68060) \
1238 || defined (__NeXT__)) \
1239 && ! defined (__mcpu32__)
1240 #define count_leading_zeros(count, x) \
1241 __asm__ ("bfffo %1{%b2:%b2},%0" \
1242 : "=d" (count) \
1243 : "od" ((USItype) (x)), "n" (0))
1244 #define COUNT_LEADING_ZEROS_0 32
1245 #endif
1246 #endif /* mc68000 */
1247
1248 #if defined (__m88000__) && W_TYPE_SIZE == 32
1249 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1250 __asm__ ("addu.co %1,%r4,%r5\n\taddu.ci %0,%r2,%r3" \
1251 : "=r" (sh), "=&r" (sl) \
1252 : "rJ" (ah), "rJ" (bh), "%rJ" (al), "rJ" (bl))
1253 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1254 __asm__ ("subu.co %1,%r4,%r5\n\tsubu.ci %0,%r2,%r3" \
1255 : "=r" (sh), "=&r" (sl) \
1256 : "rJ" (ah), "rJ" (bh), "rJ" (al), "rJ" (bl))
1257 #define count_leading_zeros(count, x) \
1258 do { \
1259 USItype __cbtmp; \
1260 __asm__ ("ff1 %0,%1" : "=r" (__cbtmp) : "r" (x)); \
1261 (count) = __cbtmp ^ 31; \
1262 } while (0)
1263 #define COUNT_LEADING_ZEROS_0 63 /* sic */
1264 #if defined (__m88110__)
1265 #define umul_ppmm(wh, wl, u, v) \
1266 do { \
1267 union {UDItype __ll; \
1268 struct {USItype __h, __l;} __i; \
1269 } __x; \
1270 __asm__ ("mulu.d %0,%1,%2" : "=r" (__x.__ll) : "r" (u), "r" (v)); \
1271 (wh) = __x.__i.__h; \
1272 (wl) = __x.__i.__l; \
1273 } while (0)
1274 #define udiv_qrnnd(q, r, n1, n0, d) \
1275 ({union {UDItype __ll; \
1276 struct {USItype __h, __l;} __i; \
1277 } __x, __q; \
1278 __x.__i.__h = (n1); __x.__i.__l = (n0); \
1279 __asm__ ("divu.d %0,%1,%2" \
1280 : "=r" (__q.__ll) : "r" (__x.__ll), "r" (d)); \
  (r) = (n0) - __q.__i.__l * (d); (q) = __q.__i.__l; })
1282 #endif /* __m88110__ */
1283 #endif /* __m88000__ */
1284
1285 #if defined (__mips) && W_TYPE_SIZE == 32
1286 #if __GMP_GNUC_PREREQ (4,4)
1287 #define umul_ppmm(w1, w0, u, v) \
1288 do { \
1289 UDItype __ll = (UDItype)(u) * (v); \
1290 w1 = __ll >> 32; \
1291 w0 = __ll; \
1292 } while (0)
1293 #endif
1294 #if !defined (umul_ppmm) && __GMP_GNUC_PREREQ (2,7) && !defined (__clang__)
1295 #define umul_ppmm(w1, w0, u, v) \
1296 __asm__ ("multu %2,%3" : "=l" (w0), "=h" (w1) : "d" (u), "d" (v))
1297 #endif
1298 #if !defined (umul_ppmm)
1299 #define umul_ppmm(w1, w0, u, v) \
1300 __asm__ ("multu %2,%3\n\tmflo %0\n\tmfhi %1" \
1301 : "=d" (w0), "=d" (w1) : "d" (u), "d" (v))
1302 #endif
1303 #endif /* __mips */
1304
1305 #if (defined (__mips) && __mips >= 3) && W_TYPE_SIZE == 64
1306 #if defined (_MIPS_ARCH_MIPS64R6)
1307 #define umul_ppmm(w1, w0, u, v) \
1308 do { \
1309 UDItype __m0 = (u), __m1 = (v); \
1310 (w0) = __m0 * __m1; \
1311 __asm__ ("dmuhu\t%0, %1, %2" : "=d" (w1) : "d" (__m0), "d" (__m1)); \
1312 } while (0)
1313 #endif
1314 #if !defined (umul_ppmm) && __GMP_GNUC_PREREQ (4,4)
1315 #define umul_ppmm(w1, w0, u, v) \
1316 do { \
1317 typedef unsigned int __ll_UTItype __attribute__((mode(TI))); \
1318 __ll_UTItype __ll = (__ll_UTItype)(u) * (v); \
1319 w1 = __ll >> 64; \
1320 w0 = __ll; \
1321 } while (0)
1322 #endif
1323 #if !defined (umul_ppmm) && __GMP_GNUC_PREREQ (2,7) && !defined (__clang__)
1324 #define umul_ppmm(w1, w0, u, v) \
1325 __asm__ ("dmultu %2,%3" \
1326 : "=l" (w0), "=h" (w1) \
1327 : "d" ((UDItype)(u)), "d" ((UDItype)(v)))
1328 #endif
1329 #if !defined (umul_ppmm)
1330 #define umul_ppmm(w1, w0, u, v) \
1331 __asm__ ("dmultu %2,%3\n\tmflo %0\n\tmfhi %1" \
1332 : "=d" (w0), "=d" (w1) \
1333 : "d" ((UDItype)(u)), "d" ((UDItype)(v)))
1334 #endif
1335 #endif /* __mips */
1336
1337 #if defined (__mmix__) && W_TYPE_SIZE == 64
1338 #define umul_ppmm(w1, w0, u, v) \
1339 __asm__ ("MULU %0,%2,%3" : "=r" (w0), "=z" (w1) : "r" (u), "r" (v))
1340 #endif
1341
1342 #if defined (__ns32000__) && W_TYPE_SIZE == 32
1343 #define umul_ppmm(w1, w0, u, v) \
1344 ({union {UDItype __ll; \
1345 struct {USItype __l, __h;} __i; \
1346 } __x; \
1347 __asm__ ("meid %2,%0" \
1348 : "=g" (__x.__ll) \
1349 : "%0" ((USItype)(u)), "g" ((USItype)(v))); \
1350 (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
1351 #define __umulsidi3(u, v) \
1352 ({UDItype __w; \
1353 __asm__ ("meid %2,%0" \
1354 : "=g" (__w) \
1355 : "%0" ((USItype)(u)), "g" ((USItype)(v))); \
1356 __w; })
1357 #define udiv_qrnnd(q, r, n1, n0, d) \
1358 ({union {UDItype __ll; \
1359 struct {USItype __l, __h;} __i; \
1360 } __x; \
1361 __x.__i.__h = (n1); __x.__i.__l = (n0); \
1362 __asm__ ("deid %2,%0" \
1363 : "=g" (__x.__ll) \
1364 : "0" (__x.__ll), "g" ((USItype)(d))); \
1365 (r) = __x.__i.__l; (q) = __x.__i.__h; })
1366 #define count_trailing_zeros(count,x) \
1367 do { \
1368 __asm__ ("ffsd %2,%0" \
1369 : "=r" (count) \
1370 : "0" ((USItype) 0), "r" ((USItype) (x))); \
1371 } while (0)
1372 #endif /* __ns32000__ */
1373
1374 /* In the past we had a block of various #defines tested
1375 _ARCH_PPC - AIX
1376 _ARCH_PWR - AIX
1377 __powerpc__ - gcc
1378 __POWERPC__ - BEOS
1379 __ppc__ - Darwin
1380 PPC - old gcc, GNU/Linux, SysV
1381 The plain PPC test was not good for vxWorks, since PPC is defined on all
   CPUs there (eg. m68k too), as a constant that one is expected to compare
1383 CPU_FAMILY against.
1384
1385 At any rate, this was pretty unattractive and a bit fragile. The use of
1386 HAVE_HOST_CPU_FAMILY is designed to cut through it all and be sure of
1387 getting the desired effect.
1388
1389 ENHANCE-ME: We should test _IBMR2 here when we add assembly support for
1390 the system vendor compilers. (Is that vendor compilers with inline asm,
1391 or what?) */
1392
1393 #if (HAVE_HOST_CPU_FAMILY_power || HAVE_HOST_CPU_FAMILY_powerpc) \
1394 && W_TYPE_SIZE == 32
1395 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1396 do { \
1397 if (__builtin_constant_p (bh) && (bh) == 0) \
1398 __asm__ ("add%I4c %1,%3,%4\n\taddze %0,%2" \
1399 : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl) \
1400 __CLOBBER_CC); \
1401 else if (__builtin_constant_p (bh) && (bh) == ~(USItype) 0) \
1402 __asm__ ("add%I4c %1,%3,%4\n\taddme %0,%2" \
1403 : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl) \
1404 __CLOBBER_CC); \
1405 else \
1406 __asm__ ("add%I5c %1,%4,%5\n\tadde %0,%2,%3" \
1407 : "=r" (sh), "=&r" (sl) \
1408 : "r" (ah), "r" (bh), "%r" (al), "rI" (bl) \
1409 __CLOBBER_CC); \
1410 } while (0)
1411 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1412 do { \
1413 if (__builtin_constant_p (ah) && (ah) == 0) \
1414 __asm__ ("subf%I3c %1,%4,%3\n\tsubfze %0,%2" \
1415 : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl) \
1416 __CLOBBER_CC); \
1417 else if (__builtin_constant_p (ah) && (ah) == ~(USItype) 0) \
1418 __asm__ ("subf%I3c %1,%4,%3\n\tsubfme %0,%2" \
1419 : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl) \
1420 __CLOBBER_CC); \
1421 else if (__builtin_constant_p (bh) && (bh) == 0) \
1422 __asm__ ("subf%I3c %1,%4,%3\n\taddme %0,%2" \
1423 : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl) \
1424 __CLOBBER_CC); \
1425 else if (__builtin_constant_p (bh) && (bh) == ~(USItype) 0) \
1426 __asm__ ("subf%I3c %1,%4,%3\n\taddze %0,%2" \
1427 : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl) \
1428 __CLOBBER_CC); \
1429 else \
1430 __asm__ ("subf%I4c %1,%5,%4\n\tsubfe %0,%3,%2" \
1431 : "=r" (sh), "=&r" (sl) \
1432 : "r" (ah), "r" (bh), "rI" (al), "r" (bl) \
1433 __CLOBBER_CC); \
1434 } while (0)
1435 #define count_leading_zeros(count, x) \
1436 __asm__ ("cntlzw %0,%1" : "=r" (count) : "r" (x))
1437 #define COUNT_LEADING_ZEROS_0 32
1438 #if HAVE_HOST_CPU_FAMILY_powerpc
1439 #if __GMP_GNUC_PREREQ (4,4)
1440 #define umul_ppmm(w1, w0, u, v) \
1441 do { \
1442 UDItype __ll = (UDItype)(u) * (v); \
1443 w1 = __ll >> 32; \
1444 w0 = __ll; \
1445 } while (0)
1446 #endif
1447 #if !defined (umul_ppmm)
1448 #define umul_ppmm(ph, pl, m0, m1) \
1449 do { \
1450 USItype __m0 = (m0), __m1 = (m1); \
1451 __asm__ ("mulhwu %0,%1,%2" : "=r" (ph) : "%r" (m0), "r" (m1)); \
1452 (pl) = __m0 * __m1; \
1453 } while (0)
1454 #endif
1455 #define smul_ppmm(ph, pl, m0, m1) \
1456 do { \
1457 SItype __m0 = (m0), __m1 = (m1); \
1458 __asm__ ("mulhw %0,%1,%2" : "=r" (ph) : "%r" (m0), "r" (m1)); \
1459 (pl) = __m0 * __m1; \
1460 } while (0)
1461 #else
1462 #define smul_ppmm(xh, xl, m0, m1) \
1463 __asm__ ("mul %0,%2,%3" : "=r" (xh), "=q" (xl) : "r" (m0), "r" (m1))
1464 #define sdiv_qrnnd(q, r, nh, nl, d) \
1465 __asm__ ("div %0,%2,%4" : "=r" (q), "=q" (r) : "r" (nh), "1" (nl), "r" (d))
1466 #endif
1467 #endif /* 32-bit POWER architecture variants. */
1468
1469 /* We should test _IBMR2 here when we add assembly support for the system
1470 vendor compilers. */
1471 #if HAVE_HOST_CPU_FAMILY_powerpc && W_TYPE_SIZE == 64
1472 #if !defined (_LONG_LONG_LIMB)
1473 /* _LONG_LONG_LIMB is ABI=mode32 where adde operates on 32-bit values. So
1474 use adde etc only when not _LONG_LONG_LIMB. */
1475 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1476 do { \
1477 if (__builtin_constant_p (bh) && (bh) == 0) \
1478 __asm__ ("add%I4c %1,%3,%4\n\taddze %0,%2" \
1479 : "=r" (sh), "=&r" (sl) \
1480 : "r" ((UDItype)(ah)), \
1481 "%r" ((UDItype)(al)), "rI" ((UDItype)(bl)) \
1482 __CLOBBER_CC); \
1483 else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0) \
1484 __asm__ ("add%I4c %1,%3,%4\n\taddme %0,%2" \
1485 : "=r" (sh), "=&r" (sl) \
1486 : "r" ((UDItype)(ah)), \
1487 "%r" ((UDItype)(al)), "rI" ((UDItype)(bl)) \
1488 __CLOBBER_CC); \
1489 else \
1490 __asm__ ("add%I5c %1,%4,%5\n\tadde %0,%2,%3" \
1491 : "=r" (sh), "=&r" (sl) \
1492 : "r" ((UDItype)(ah)), "r" ((UDItype)(bh)), \
1493 "%r" ((UDItype)(al)), "rI" ((UDItype)(bl)) \
1494 __CLOBBER_CC); \
1495 } while (0)
1496 /* We use "*rI" for the constant operand here, since with just "I", gcc barfs.
1497 This might seem strange, but gcc folds away the dead code late. */
1498 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1499 do { \
1500 if (__builtin_constant_p (bl) \
1501 && (bl) > -0x8000 && (bl) <= 0x8000 && (bl) != 0) { \
1502 if (__builtin_constant_p (ah) && (ah) == 0) \
1503 __asm__ ("addic %1,%3,%4\n\tsubfze %0,%2" \
1504 : "=r" (sh), "=&r" (sl) \
1505 : "r" ((UDItype)(bh)), \
1506 "r" ((UDItype)(al)), "*rI" (-((UDItype)(bl))) \
1507 __CLOBBER_CC); \
1508 else if (__builtin_constant_p (ah) && (ah) == ~(UDItype) 0) \
1509 __asm__ ("addic %1,%3,%4\n\tsubfme %0,%2" \
1510 : "=r" (sh), "=&r" (sl) \
1511 : "r" ((UDItype)(bh)), \
1512 "r" ((UDItype)(al)), "*rI" (-((UDItype)(bl))) \
1513 __CLOBBER_CC); \
1514 else if (__builtin_constant_p (bh) && (bh) == 0) \
1515 __asm__ ("addic %1,%3,%4\n\taddme %0,%2" \
1516 : "=r" (sh), "=&r" (sl) \
1517 : "r" ((UDItype)(ah)), \
1518 "r" ((UDItype)(al)), "*rI" (-((UDItype)(bl))) \
1519 __CLOBBER_CC); \
1520 else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0) \
1521 __asm__ ("addic %1,%3,%4\n\taddze %0,%2" \
1522 : "=r" (sh), "=&r" (sl) \
1523 : "r" ((UDItype)(ah)), \
1524 "r" ((UDItype)(al)), "*rI" (-((UDItype)(bl))) \
1525 __CLOBBER_CC); \
1526 else \
1527 __asm__ ("addic %1,%4,%5\n\tsubfe %0,%3,%2" \
1528 : "=r" (sh), "=&r" (sl) \
1529 : "r" ((UDItype)(ah)), "r" ((UDItype)(bh)), \
1530 "r" ((UDItype)(al)), "*rI" (-((UDItype)(bl))) \
1531 __CLOBBER_CC); \
1532 } else { \
1533 if (__builtin_constant_p (ah) && (ah) == 0) \
1534 __asm__ ("subf%I3c %1,%4,%3\n\tsubfze %0,%2" \
1535 : "=r" (sh), "=&r" (sl) \
1536 : "r" ((UDItype)(bh)), \
1537 "rI" ((UDItype)(al)), "r" ((UDItype)(bl)) \
1538 __CLOBBER_CC); \
1539 else if (__builtin_constant_p (ah) && (ah) == ~(UDItype) 0) \
1540 __asm__ ("subf%I3c %1,%4,%3\n\tsubfme %0,%2" \
1541 : "=r" (sh), "=&r" (sl) \
1542 : "r" ((UDItype)(bh)), \
1543 "rI" ((UDItype)(al)), "r" ((UDItype)(bl)) \
1544 __CLOBBER_CC); \
1545 else if (__builtin_constant_p (bh) && (bh) == 0) \
1546 __asm__ ("subf%I3c %1,%4,%3\n\taddme %0,%2" \
1547 : "=r" (sh), "=&r" (sl) \
1548 : "r" ((UDItype)(ah)), \
1549 "rI" ((UDItype)(al)), "r" ((UDItype)(bl)) \
1550 __CLOBBER_CC); \
1551 else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0) \
1552 __asm__ ("subf%I3c %1,%4,%3\n\taddze %0,%2" \
1553 : "=r" (sh), "=&r" (sl) \
1554 : "r" ((UDItype)(ah)), \
1555 "rI" ((UDItype)(al)), "r" ((UDItype)(bl)) \
1556 __CLOBBER_CC); \
1557 else \
1558 __asm__ ("subf%I4c %1,%5,%4\n\tsubfe %0,%3,%2" \
1559 : "=r" (sh), "=&r" (sl) \
1560 : "r" ((UDItype)(ah)), "r" ((UDItype)(bh)), \
1561 "rI" ((UDItype)(al)), "r" ((UDItype)(bl)) \
1562 __CLOBBER_CC); \
1563 } \
1564 } while (0)
1565 #endif /* ! _LONG_LONG_LIMB */
1566 #define count_leading_zeros(count, x) \
1567 __asm__ ("cntlzd %0,%1" : "=r" (count) : "r" (x))
1568 #define COUNT_LEADING_ZEROS_0 64
1569 #if __GMP_GNUC_PREREQ (4,8)
1570 #define umul_ppmm(w1, w0, u, v) \
1571 do { \
1572 typedef unsigned int __ll_UTItype __attribute__((mode(TI))); \
1573 __ll_UTItype __ll = (__ll_UTItype)(u) * (v); \
1574 w1 = __ll >> 64; \
1575 w0 = __ll; \
1576 } while (0)
1577 #endif
1578 #if !defined (umul_ppmm)
1579 #define umul_ppmm(ph, pl, m0, m1) \
1580 do { \
1581 UDItype __m0 = (m0), __m1 = (m1); \
1582 __asm__ ("mulhdu %0,%1,%2" : "=r" (ph) : "%r" (__m0), "r" (__m1)); \
1583 (pl) = __m0 * __m1; \
1584 } while (0)
1585 #endif
1586 #define smul_ppmm(ph, pl, m0, m1) \
1587 do { \
1588 DItype __m0 = (m0), __m1 = (m1); \
1589 __asm__ ("mulhd %0,%1,%2" : "=r" (ph) : "%r" (__m0), "r" (__m1)); \
1590 (pl) = __m0 * __m1; \
1591 } while (0)
1592 #endif /* 64-bit PowerPC. */
1593
1594 #if defined (__pyr__) && W_TYPE_SIZE == 32
1595 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1596 __asm__ ("addw %5,%1\n\taddwc %3,%0" \
1597 : "=r" (sh), "=&r" (sl) \
1598 : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \
1599 "%1" ((USItype)(al)), "g" ((USItype)(bl)))
1600 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1601 __asm__ ("subw %5,%1\n\tsubwb %3,%0" \
1602 : "=r" (sh), "=&r" (sl) \
1603 : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \
1604 "1" ((USItype)(al)), "g" ((USItype)(bl)))
1605 /* This insn works on Pyramids with AP, XP, or MI CPUs, but not with SP. */
1606 #define umul_ppmm(w1, w0, u, v) \
1607 ({union {UDItype __ll; \
1608 struct {USItype __h, __l;} __i; \
1609 } __x; \
1610 __asm__ ("movw %1,%R0\n\tuemul %2,%0" \
1611 : "=&r" (__x.__ll) \
1612 : "g" ((USItype) (u)), "g" ((USItype)(v))); \
1613 (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
1614 #endif /* __pyr__ */
1615
#if defined (__ibm032__) && W_TYPE_SIZE == 32  /* RT/ROMP */
1617 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1618 __asm__ ("a %1,%5\n\tae %0,%3" \
1619 : "=r" (sh), "=&r" (sl) \
1620 : "0" ((USItype)(ah)), "r" ((USItype)(bh)), \
1621 "%1" ((USItype)(al)), "r" ((USItype)(bl)))
1622 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1623 __asm__ ("s %1,%5\n\tse %0,%3" \
1624 : "=r" (sh), "=&r" (sl) \
1625 : "0" ((USItype)(ah)), "r" ((USItype)(bh)), \
1626 "1" ((USItype)(al)), "r" ((USItype)(bl)))
1627 #define smul_ppmm(ph, pl, m0, m1) \
1628 __asm__ ( \
1629 "s r2,r2\n" \
1630 " mts r10,%2\n" \
1631 " m r2,%3\n" \
1632 " m r2,%3\n" \
1633 " m r2,%3\n" \
1634 " m r2,%3\n" \
1635 " m r2,%3\n" \
1636 " m r2,%3\n" \
1637 " m r2,%3\n" \
1638 " m r2,%3\n" \
1639 " m r2,%3\n" \
1640 " m r2,%3\n" \
1641 " m r2,%3\n" \
1642 " m r2,%3\n" \
1643 " m r2,%3\n" \
1644 " m r2,%3\n" \
1645 " m r2,%3\n" \
1646 " m r2,%3\n" \
1647 " cas %0,r2,r0\n" \
1648 " mfs r10,%1" \
1649 : "=r" (ph), "=r" (pl) \
1650 : "%r" ((USItype)(m0)), "r" ((USItype)(m1)) \
1651 : "r2")
1652 #define count_leading_zeros(count, x) \
1653 do { \
1654 if ((x) >= 0x10000) \
1655 __asm__ ("clz %0,%1" \
1656 : "=r" (count) : "r" ((USItype)(x) >> 16)); \
1657 else \
1658 { \
1659 __asm__ ("clz %0,%1" \
1660 : "=r" (count) : "r" ((USItype)(x))); \
1661 (count) += 16; \
1662 } \
1663 } while (0)
1664 #endif /* RT/ROMP */
1665
1666 #if defined (__riscv) && defined (__riscv_mul) && W_TYPE_SIZE == 64
1667 #define umul_ppmm(ph, pl, u, v) \
1668 do { \
1669 UDItype __u = (u), __v = (v); \
1670 (pl) = __u * __v; \
1671 __asm__ ("mulhu\t%0, %1, %2" : "=r" (ph) : "%r" (__u), "r" (__v)); \
1672 } while (0)
1673 #endif
1674
1675 #if (defined (__SH2__) || defined (__SH3__) || defined (__SH4__)) && W_TYPE_SIZE == 32
1676 #define umul_ppmm(w1, w0, u, v) \
1677 __asm__ ("dmulu.l %2,%3\n\tsts macl,%1\n\tsts mach,%0" \
1678 : "=r" (w1), "=r" (w0) : "r" (u), "r" (v) : "macl", "mach")
1679 #endif
1680
1681 #if defined (__sparc__) && W_TYPE_SIZE == 32
1682 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1683 __asm__ ("addcc %r4,%5,%1\n\taddx %r2,%3,%0" \
1684 : "=r" (sh), "=&r" (sl) \
1685 : "rJ" (ah), "rI" (bh),"%rJ" (al), "rI" (bl) \
1686 __CLOBBER_CC)
1687 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1688 __asm__ ("subcc %r4,%5,%1\n\tsubx %r2,%3,%0" \
1689 : "=r" (sh), "=&r" (sl) \
1690 : "rJ" (ah), "rI" (bh), "rJ" (al), "rI" (bl) \
1691 __CLOBBER_CC)
1692 /* FIXME: When gcc -mcpu=v9 is used on solaris, gcc/config/sol2-sld-64.h
1693 doesn't define anything to indicate that to us, it only sets __sparcv8. */
1694 #if defined (__sparc_v9__) || defined (__sparcv9)
1695 /* Perhaps we should use floating-point operations here? */
1696 #if 0
1697 /* Triggers a bug making mpz/tests/t-gcd.c fail.
   Perhaps we simply need to explicitly zero-extend the inputs?  */
1699 #define umul_ppmm(w1, w0, u, v) \
1700 __asm__ ("mulx %2,%3,%%g1; srl %%g1,0,%1; srlx %%g1,32,%0" : \
1701 "=r" (w1), "=r" (w0) : "r" (u), "r" (v) : "g1")
1702 #else
1703 /* Use v8 umul until above bug is fixed. */
1704 #define umul_ppmm(w1, w0, u, v) \
1705 __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))
1706 #endif
1707 /* Use a plain v8 divide for v9. */
1708 #define udiv_qrnnd(q, r, n1, n0, d) \
1709 do { \
1710 USItype __q; \
1711 __asm__ ("mov %1,%%y;nop;nop;nop;udiv %2,%3,%0" \
1712 : "=r" (__q) : "r" (n1), "r" (n0), "r" (d)); \
1713 (r) = (n0) - __q * (d); \
1714 (q) = __q; \
1715 } while (0)
1716 #else
1717 #if defined (__sparc_v8__) /* gcc normal */ \
1718 || defined (__sparcv8) /* gcc solaris */ \
1719 || HAVE_HOST_CPU_supersparc
/* Don't match an immediate range because (1) it is not often useful, and
   (2) the 'I' constraint treats the range as a 13-bit signed interval,
   while we want to match a 13-bit interval, sign extended to 32 bits
   but INTERPRETED AS UNSIGNED.  */
1724 #define umul_ppmm(w1, w0, u, v) \
1725 __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))
1726
#if ! HAVE_HOST_CPU_supersparc
1729 /* Don't use this on SuperSPARC because its udiv only handles 53 bit
1730 dividends and will trap to the kernel for the rest. */
1731 #define udiv_qrnnd(q, r, n1, n0, d) \
1732 do { \
1733 USItype __q; \
1734 __asm__ ("mov %1,%%y;nop;nop;nop;udiv %2,%3,%0" \
1735 : "=r" (__q) : "r" (n1), "r" (n0), "r" (d)); \
1736 (r) = (n0) - __q * (d); \
1737 (q) = __q; \
1738 } while (0)
#endif /* ! HAVE_HOST_CPU_supersparc */
1740
1741 #else /* ! __sparc_v8__ */
1742 #if defined (__sparclite__)
/* This has hardware multiply but not divide.  It also has two additional
   instructions, scan (ffs from the high bit) and divscc.  */
1745 #define umul_ppmm(w1, w0, u, v) \
1746 __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))
1747 #define udiv_qrnnd(q, r, n1, n0, d) \
1748 __asm__ ("! Inlined udiv_qrnnd\n" \
1749 " wr %%g0,%2,%%y ! Not a delayed write for sparclite\n" \
1750 " tst %%g0\n" \
1751 " divscc %3,%4,%%g1\n" \
1752 " divscc %%g1,%4,%%g1\n" \
1753 " divscc %%g1,%4,%%g1\n" \
1754 " divscc %%g1,%4,%%g1\n" \
1755 " divscc %%g1,%4,%%g1\n" \
1756 " divscc %%g1,%4,%%g1\n" \
1757 " divscc %%g1,%4,%%g1\n" \
1758 " divscc %%g1,%4,%%g1\n" \
1759 " divscc %%g1,%4,%%g1\n" \
1760 " divscc %%g1,%4,%%g1\n" \
1761 " divscc %%g1,%4,%%g1\n" \
1762 " divscc %%g1,%4,%%g1\n" \
1763 " divscc %%g1,%4,%%g1\n" \
1764 " divscc %%g1,%4,%%g1\n" \
1765 " divscc %%g1,%4,%%g1\n" \
1766 " divscc %%g1,%4,%%g1\n" \
1767 " divscc %%g1,%4,%%g1\n" \
1768 " divscc %%g1,%4,%%g1\n" \
1769 " divscc %%g1,%4,%%g1\n" \
1770 " divscc %%g1,%4,%%g1\n" \
1771 " divscc %%g1,%4,%%g1\n" \
1772 " divscc %%g1,%4,%%g1\n" \
1773 " divscc %%g1,%4,%%g1\n" \
1774 " divscc %%g1,%4,%%g1\n" \
1775 " divscc %%g1,%4,%%g1\n" \
1776 " divscc %%g1,%4,%%g1\n" \
1777 " divscc %%g1,%4,%%g1\n" \
1778 " divscc %%g1,%4,%%g1\n" \
1779 " divscc %%g1,%4,%%g1\n" \
1780 " divscc %%g1,%4,%%g1\n" \
1781 " divscc %%g1,%4,%%g1\n" \
1782 " divscc %%g1,%4,%0\n" \
1783 " rd %%y,%1\n" \
1784 " bl,a 1f\n" \
1785 " add %1,%4,%1\n" \
1786 "1: ! End of inline udiv_qrnnd" \
1787 : "=r" (q), "=r" (r) : "r" (n1), "r" (n0), "rI" (d) \
1788 : "%g1" __AND_CLOBBER_CC)
1789 #define count_leading_zeros(count, x) \
1790 __asm__ ("scan %1,1,%0" : "=r" (count) : "r" (x))
/* Early sparclites return 63 for an argument of 0, but the documentation
   warns that future implementations might change this.  Therefore, leave
   COUNT_LEADING_ZEROS_0 undefined.  */
1794 #endif /* __sparclite__ */
1795 #endif /* __sparc_v8__ */
1796 #endif /* __sparc_v9__ */
1797 /* Default to sparc v7 versions of umul_ppmm and udiv_qrnnd. */
1798 #ifndef umul_ppmm
1799 #define umul_ppmm(w1, w0, u, v) \
1800 __asm__ ("! Inlined umul_ppmm\n" \
1801 " wr %%g0,%2,%%y ! SPARC has 0-3 delay insn after a wr\n" \
1802 " sra %3,31,%%g2 ! Don't move this insn\n" \
1803 " and %2,%%g2,%%g2 ! Don't move this insn\n" \
1804 " andcc %%g0,0,%%g1 ! Don't move this insn\n" \
1805 " mulscc %%g1,%3,%%g1\n" \
1806 " mulscc %%g1,%3,%%g1\n" \
1807 " mulscc %%g1,%3,%%g1\n" \
1808 " mulscc %%g1,%3,%%g1\n" \
1809 " mulscc %%g1,%3,%%g1\n" \
1810 " mulscc %%g1,%3,%%g1\n" \
1811 " mulscc %%g1,%3,%%g1\n" \
1812 " mulscc %%g1,%3,%%g1\n" \
1813 " mulscc %%g1,%3,%%g1\n" \
1814 " mulscc %%g1,%3,%%g1\n" \
1815 " mulscc %%g1,%3,%%g1\n" \
1816 " mulscc %%g1,%3,%%g1\n" \
1817 " mulscc %%g1,%3,%%g1\n" \
1818 " mulscc %%g1,%3,%%g1\n" \
1819 " mulscc %%g1,%3,%%g1\n" \
1820 " mulscc %%g1,%3,%%g1\n" \
1821 " mulscc %%g1,%3,%%g1\n" \
1822 " mulscc %%g1,%3,%%g1\n" \
1823 " mulscc %%g1,%3,%%g1\n" \
1824 " mulscc %%g1,%3,%%g1\n" \
1825 " mulscc %%g1,%3,%%g1\n" \
1826 " mulscc %%g1,%3,%%g1\n" \
1827 " mulscc %%g1,%3,%%g1\n" \
1828 " mulscc %%g1,%3,%%g1\n" \
1829 " mulscc %%g1,%3,%%g1\n" \
1830 " mulscc %%g1,%3,%%g1\n" \
1831 " mulscc %%g1,%3,%%g1\n" \
1832 " mulscc %%g1,%3,%%g1\n" \
1833 " mulscc %%g1,%3,%%g1\n" \
1834 " mulscc %%g1,%3,%%g1\n" \
1835 " mulscc %%g1,%3,%%g1\n" \
1836 " mulscc %%g1,%3,%%g1\n" \
1837 " mulscc %%g1,0,%%g1\n" \
1838 " add %%g1,%%g2,%0\n" \
1839 " rd %%y,%1" \
1840 : "=r" (w1), "=r" (w0) : "%rI" (u), "r" (v) \
1841 : "%g1", "%g2" __AND_CLOBBER_CC)
1842 #endif
1843 #ifndef udiv_qrnnd
1844 #ifndef LONGLONG_STANDALONE
1845 #define udiv_qrnnd(q, r, n1, n0, d) \
1846 do { UWtype __r; \
1847 (q) = __MPN(udiv_qrnnd) (&__r, (n1), (n0), (d)); \
1848 (r) = __r; \
1849 } while (0)
1850 extern UWtype __MPN(udiv_qrnnd) (UWtype *, UWtype, UWtype, UWtype);
1851 #endif /* LONGLONG_STANDALONE */
1852 #endif /* udiv_qrnnd */
1853 #endif /* __sparc__ */
1854
1855 #if defined (__sparc__) && W_TYPE_SIZE == 64
1856 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1857 __asm__ ( \
1858 "addcc %r4,%5,%1\n" \
1859 " addccc %r6,%7,%%g0\n" \
1860 " addc %r2,%3,%0" \
1861 : "=r" (sh), "=&r" (sl) \
1862 : "rJ" ((UDItype)(ah)), "rI" ((UDItype)(bh)), \
1863 "%rJ" ((UDItype)(al)), "rI" ((UDItype)(bl)), \
1864 "%rJ" ((UDItype)(al) >> 32), "rI" ((UDItype)(bl) >> 32) \
1865 __CLOBBER_CC)
1866 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1867 __asm__ ( \
1868 "subcc %r4,%5,%1\n" \
1869 " subccc %r6,%7,%%g0\n" \
1870 " subc %r2,%3,%0" \
1871 : "=r" (sh), "=&r" (sl) \
1872 : "rJ" ((UDItype)(ah)), "rI" ((UDItype)(bh)), \
1873 "rJ" ((UDItype)(al)), "rI" ((UDItype)(bl)), \
1874 "rJ" ((UDItype)(al) >> 32), "rI" ((UDItype)(bl) >> 32) \
1875 __CLOBBER_CC)
1876 #if __VIS__ >= 0x300
1877 #undef add_ssaaaa
1878 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1879 __asm__ ( \
1880 "addcc %r4, %5, %1\n" \
1881 " addxc %r2, %r3, %0" \
1882 : "=r" (sh), "=&r" (sl) \
1883 : "rJ" ((UDItype)(ah)), "rJ" ((UDItype)(bh)), \
1884 "%rJ" ((UDItype)(al)), "rI" ((UDItype)(bl)) __CLOBBER_CC)
1885 #define umul_ppmm(ph, pl, m0, m1) \
1886 do { \
1887 UDItype __m0 = (m0), __m1 = (m1); \
1888 (pl) = __m0 * __m1; \
1889 __asm__ ("umulxhi\t%2, %1, %0" \
1890 : "=r" (ph) \
1891 : "%r" (__m0), "r" (__m1)); \
1892 } while (0)
1893 #define count_leading_zeros(count, x) \
1894 __asm__ ("lzd\t%1,%0" : "=r" (count) : "r" (x))
1895 /* Needed by count_leading_zeros_32 in sparc64.h. */
1896 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
1897 #endif
1898 #endif
1899
1900 #if (defined (__vax) || defined (__vax__)) && W_TYPE_SIZE == 32
1901 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1902 __asm__ ("addl2 %5,%1\n\tadwc %3,%0" \
1903 : "=g" (sh), "=&g" (sl) \
1904 : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \
1905 "%1" ((USItype)(al)), "g" ((USItype)(bl)))
1906 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1907 __asm__ ("subl2 %5,%1\n\tsbwc %3,%0" \
1908 : "=g" (sh), "=&g" (sl) \
1909 : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \
1910 "1" ((USItype)(al)), "g" ((USItype)(bl)))
1911 #define smul_ppmm(xh, xl, m0, m1) \
1912 do { \
1913 union {UDItype __ll; \
1914 struct {USItype __l, __h;} __i; \
1915 } __x; \
1916 USItype __m0 = (m0), __m1 = (m1); \
1917 __asm__ ("emul %1,%2,$0,%0" \
1918 : "=g" (__x.__ll) : "g" (__m0), "g" (__m1)); \
1919 (xh) = __x.__i.__h; (xl) = __x.__i.__l; \
1920 } while (0)
1921 #define sdiv_qrnnd(q, r, n1, n0, d) \
1922 do { \
1923 union {DItype __ll; \
1924 struct {SItype __l, __h;} __i; \
1925 } __x; \
1926 __x.__i.__h = n1; __x.__i.__l = n0; \
1927 __asm__ ("ediv %3,%2,%0,%1" \
1928 : "=g" (q), "=g" (r) : "g" (__x.__ll), "g" (d)); \
1929 } while (0)
1930 #if 0
1931 /* FIXME: This instruction appears to be unimplemented on some systems (vax
1932 8800 maybe). */
1933 #define count_trailing_zeros(count,x) \
1934 do { \
1935 __asm__ ("ffs 0, 31, %1, %0" \
1936 : "=g" (count) \
1937 : "g" ((USItype) (x))); \
1938 } while (0)
1939 #endif
1940 #endif /* vax */
1941
1942 #if defined (__z8000__) && W_TYPE_SIZE == 16
1943 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1944 __asm__ ("add %H1,%H5\n\tadc %H0,%H3" \
1945 : "=r" (sh), "=&r" (sl) \
1946 : "0" ((unsigned int)(ah)), "r" ((unsigned int)(bh)), \
1947 "%1" ((unsigned int)(al)), "rQR" ((unsigned int)(bl)))
1948 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1949 __asm__ ("sub %H1,%H5\n\tsbc %H0,%H3" \
1950 : "=r" (sh), "=&r" (sl) \
1951 : "0" ((unsigned int)(ah)), "r" ((unsigned int)(bh)), \
1952 "1" ((unsigned int)(al)), "rQR" ((unsigned int)(bl)))
1953 #define umul_ppmm(xh, xl, m0, m1) \
1954 do { \
1955 union {long int __ll; \
1956 struct {unsigned int __h, __l;} __i; \
1957 } __x; \
1958 unsigned int __m0 = (m0), __m1 = (m1); \
1959 __asm__ ("mult %S0,%H3" \
1960 : "=r" (__x.__i.__h), "=r" (__x.__i.__l) \
1961 : "%1" (m0), "rQR" (m1)); \
1962 (xh) = __x.__i.__h; (xl) = __x.__i.__l; \
1963 (xh) += ((((signed int) __m0 >> 15) & __m1) \
1964 + (((signed int) __m1 >> 15) & __m0)); \
1965 } while (0)
1966 #endif /* __z8000__ */
1967
1968 #endif /* __GNUC__ */
1969
1970 #endif /* NO_ASM */
1971
1972
1973 /* FIXME: "sidi" here is highly doubtful, should sometimes be "diti". */
1974 #if !defined (umul_ppmm) && defined (__umulsidi3)
1975 #define umul_ppmm(ph, pl, m0, m1) \
1976 do { \
1977 UDWtype __ll = __umulsidi3 (m0, m1); \
1978 ph = (UWtype) (__ll >> W_TYPE_SIZE); \
1979 pl = (UWtype) __ll; \
1980 } while (0)
1981 #endif
1982
1983 #if !defined (__umulsidi3)
1984 #define __umulsidi3(u, v) \
1985 ({UWtype __hi, __lo; \
1986 umul_ppmm (__hi, __lo, u, v); \
1987 ((UDWtype) __hi << W_TYPE_SIZE) | __lo; })
1988 #endif
1989
1990
1991 #if defined (__cplusplus)
1992 #define __longlong_h_C "C"
1993 #else
1994 #define __longlong_h_C
1995 #endif
1996
1997 /* Use mpn_umul_ppmm or mpn_udiv_qrnnd functions, if they exist. The "_r"
1998 forms have "reversed" arguments, meaning the pointer is last, which
1999 sometimes allows better parameter passing, in particular on 64-bit
2000 hppa. */
2001
2002 #define mpn_umul_ppmm __MPN(umul_ppmm)
2003 extern __longlong_h_C UWtype mpn_umul_ppmm (UWtype *, UWtype, UWtype);
2004
2005 #if ! defined (umul_ppmm) && HAVE_NATIVE_mpn_umul_ppmm \
2006 && ! defined (LONGLONG_STANDALONE)
2007 #define umul_ppmm(wh, wl, u, v) \
2008 do { \
2009 UWtype __umul_ppmm__p0; \
2010 (wh) = mpn_umul_ppmm (&__umul_ppmm__p0, (UWtype) (u), (UWtype) (v));\
2011 (wl) = __umul_ppmm__p0; \
2012 } while (0)
2013 #endif
2014
2015 #define mpn_umul_ppmm_r __MPN(umul_ppmm_r)
2016 extern __longlong_h_C UWtype mpn_umul_ppmm_r (UWtype, UWtype, UWtype *);
2017
2018 #if ! defined (umul_ppmm) && HAVE_NATIVE_mpn_umul_ppmm_r \
2019 && ! defined (LONGLONG_STANDALONE)
2020 #define umul_ppmm(wh, wl, u, v) \
2021 do { \
2022 UWtype __umul_p0; \
2023 (wh) = mpn_umul_ppmm_r ((UWtype) (u), (UWtype) (v), &__umul_p0); \
2024 (wl) = __umul_p0; \
2025 } while (0)
2026 #endif
2027
2028 #define mpn_udiv_qrnnd __MPN(udiv_qrnnd)
2029 extern __longlong_h_C UWtype mpn_udiv_qrnnd (UWtype *, UWtype, UWtype, UWtype);
2030
2031 #if ! defined (udiv_qrnnd) && HAVE_NATIVE_mpn_udiv_qrnnd \
2032 && ! defined (LONGLONG_STANDALONE)
2033 #define udiv_qrnnd(q, r, n1, n0, d) \
2034 do { \
2035 UWtype __udiv_qrnnd_r; \
2036 (q) = mpn_udiv_qrnnd (&__udiv_qrnnd_r, \
			  (UWtype) (n1), (UWtype) (n0), (UWtype) (d));	\
2038 (r) = __udiv_qrnnd_r; \
2039 } while (0)
2040 #endif
2041
2042 #define mpn_udiv_qrnnd_r __MPN(udiv_qrnnd_r)
2043 extern __longlong_h_C UWtype mpn_udiv_qrnnd_r (UWtype, UWtype, UWtype, UWtype *);
2044
2045 #if ! defined (udiv_qrnnd) && HAVE_NATIVE_mpn_udiv_qrnnd_r \
2046 && ! defined (LONGLONG_STANDALONE)
2047 #define udiv_qrnnd(q, r, n1, n0, d) \
2048 do { \
2049 UWtype __udiv_qrnnd_r; \
    (q) = mpn_udiv_qrnnd_r ((UWtype) (n1), (UWtype) (n0), (UWtype) (d),	\
2051 &__udiv_qrnnd_r); \
2052 (r) = __udiv_qrnnd_r; \
2053 } while (0)
2054 #endif
2055
2056
2057 /* If this machine has no inline assembler, use C macros. */
2058
2059 #if !defined (add_ssaaaa)
2060 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
2061 do { \
2062 UWtype __x; \
2063 UWtype __al = (al); \
2064 UWtype __bl = (bl); \
2065 __x = __al + __bl; \
2066 (sh) = (ah) + (bh) + (__x < __al); \
2067 (sl) = __x; \
2068 } while (0)
2069 #endif
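
/* An editorial worked example of the carry recovery above: __x = __al + __bl
   wraps around precisely when the true sum does not fit a word, so the sum
   is then smaller than either addend and (__x < __al) yields the carry bit.
   With 8-bit words, 0xf0 + 0x20 wraps to 0x10, and 0x10 < 0xf0 flags the
   carry.  The sub_ddmmss fallback below uses the symmetric borrow test
   (__al < __bl).  */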
2070
2071 #if !defined (sub_ddmmss)
2072 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
2073 do { \
2074 UWtype __x; \
2075 UWtype __al = (al); \
2076 UWtype __bl = (bl); \
2077 __x = __al - __bl; \
2078 (sh) = (ah) - (bh) - (__al < __bl); \
2079 (sl) = __x; \
2080 } while (0)
2081 #endif
2082
2083 /* If we lack umul_ppmm but have smul_ppmm, define umul_ppmm in terms of
2084 smul_ppmm. */
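/* Editorial sketch of why the correction below works, assuming two's
   complement W-bit words (W = W_TYPE_SIZE).  Writing uu/us for the unsigned
   and signed values of u, and uh for its top bit, we have uu = us + 2^W*uh,
   and likewise for v, so

     uu*vu = us*vs + 2^W*(uh*vu + vh*uu) - 2^(2W)*uh*vh.

   Modulo 2^(2W) the high words therefore differ by uh*vu + vh*uu (mod 2^W),
   and -(x >> (W - 1)) & y is an all-ones-mask trick that evaluates to y when
   the top bit of x is set and to 0 otherwise, i.e. exactly those two
   correction terms.  */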
2085 #if !defined (umul_ppmm) && defined (smul_ppmm)
2086 #define umul_ppmm(w1, w0, u, v) \
2087 do { \
2088 UWtype __w1; \
2089 UWtype __xm0 = (u), __xm1 = (v); \
2090 smul_ppmm (__w1, w0, __xm0, __xm1); \
2091 (w1) = __w1 + (-(__xm0 >> (W_TYPE_SIZE - 1)) & __xm1) \
2092 + (-(__xm1 >> (W_TYPE_SIZE - 1)) & __xm0); \
2093 } while (0)
2094 #endif
2095
2096 /* If we still don't have umul_ppmm, define it using plain C.
2097
   For reference, when this code is used for squaring (i.e. u and v are
   identical expressions), gcc recognises that __x1 and __x2 are the same
   and generates 3 multiplies, not 4.  The subsequent additions could be
   optimized a bit,
2101 but the only place GMP currently uses such a square is mpn_sqr_basecase,
2102 and chips obliged to use this generic C umul will have plenty of worse
2103 performance problems than a couple of extra instructions on the diagonal
2104 of sqr_basecase. */
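/* Editorial note on the split below, with B = 2^(W_TYPE_SIZE/2),
   u = __uh*B + __ul and v = __vh*B + __vl:

     u*v = __x3*B^2 + (__x1 + __x2)*B + __x0,

   where __x0 = __ul*__vl, __x1 = __ul*__vh, __x2 = __uh*__vl and
   __x3 = __uh*__vh.  Each partial product fits in a word, but folding __x2
   (and the high part of __x0) into __x1 can wrap; a wrap loses 2^W in a term
   of weight B, i.e. __ll_B in the high result word, which the (__x1 < __x2)
   test restores by adding __ll_B to __x3.  */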
2105
2106 #if !defined (umul_ppmm)
2107 #define umul_ppmm(w1, w0, u, v) \
2108 do { \
2109 UWtype __x0, __x1, __x2, __x3; \
2110 UHWtype __ul, __vl, __uh, __vh; \
2111 UWtype __u = (u), __v = (v); \
2112 \
2113 __ul = __ll_lowpart (__u); \
2114 __uh = __ll_highpart (__u); \
2115 __vl = __ll_lowpart (__v); \
2116 __vh = __ll_highpart (__v); \
2117 \
2118 __x0 = (UWtype) __ul * __vl; \
2119 __x1 = (UWtype) __ul * __vh; \
2120 __x2 = (UWtype) __uh * __vl; \
2121 __x3 = (UWtype) __uh * __vh; \
2122 \
2123 __x1 += __ll_highpart (__x0);/* this can't give carry */ \
2124 __x1 += __x2; /* but this indeed can */ \
2125 if (__x1 < __x2) /* did we get it? */ \
2126 __x3 += __ll_B; /* yes, add it in the proper pos. */ \
2127 \
2128 (w1) = __x3 + __ll_highpart (__x1); \
2129 (w0) = (__x1 << W_TYPE_SIZE/2) + __ll_lowpart (__x0); \
2130 } while (0)
2131 #endif
2132
/* If we don't have smul_ppmm, define it using umul_ppmm (which surely will
   exist in one form or another).  */
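/* Editorial note: this is the same identity sketched above for deriving
   umul_ppmm from smul_ppmm, solved for the signed high word instead, hence
   the two mask-and correction terms are subtracted rather than added.  */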
2135 #if !defined (smul_ppmm)
2136 #define smul_ppmm(w1, w0, u, v) \
2137 do { \
2138 UWtype __w1; \
2139 UWtype __xm0 = (u), __xm1 = (v); \
2140 umul_ppmm (__w1, w0, __xm0, __xm1); \
2141 (w1) = __w1 - (-(__xm0 >> (W_TYPE_SIZE - 1)) & __xm1) \
2142 - (-(__xm1 >> (W_TYPE_SIZE - 1)) & __xm0); \
2143 } while (0)
2144 #endif
2145
2146 /* Define this unconditionally, so it can be used for debugging. */
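/* Editorial note on the method: the divisor is split as d = __d1*__ll_B + __d0
   and the quotient is produced as two half-word digits.  Each digit is first
   estimated from a division by __d1 alone and then adjusted downwards at most
   twice by the "if (__r1 < __m)" / "if (__r0 < __m)" corrections; the usual
   normalization requirement (high bit of d set, see UDIV_NEEDS_NORMALIZATION
   below) is what keeps those estimates close enough.  */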
2147 #define __udiv_qrnnd_c(q, r, n1, n0, d) \
2148 do { \
2149 UWtype __d1, __d0, __q1, __q0, __r1, __r0, __m; \
2150 \
2151 ASSERT ((d) != 0); \
2152 ASSERT ((n1) < (d)); \
2153 \
2154 __d1 = __ll_highpart (d); \
2155 __d0 = __ll_lowpart (d); \
2156 \
2157 __q1 = (n1) / __d1; \
2158 __r1 = (n1) - __q1 * __d1; \
2159 __m = __q1 * __d0; \
2160 __r1 = __r1 * __ll_B | __ll_highpart (n0); \
2161 if (__r1 < __m) \
2162 { \
2163 __q1--, __r1 += (d); \
2164 if (__r1 >= (d)) /* i.e. we didn't get carry when adding to __r1 */\
2165 if (__r1 < __m) \
2166 __q1--, __r1 += (d); \
2167 } \
2168 __r1 -= __m; \
2169 \
2170 __q0 = __r1 / __d1; \
2171 __r0 = __r1 - __q0 * __d1; \
2172 __m = __q0 * __d0; \
2173 __r0 = __r0 * __ll_B | __ll_lowpart (n0); \
2174 if (__r0 < __m) \
2175 { \
2176 __q0--, __r0 += (d); \
2177 if (__r0 >= (d)) \
2178 if (__r0 < __m) \
2179 __q0--, __r0 += (d); \
2180 } \
2181 __r0 -= __m; \
2182 \
2183 (q) = __q1 * __ll_B | __q0; \
2184 (r) = __r0; \
2185 } while (0)
2186
2187 /* If the processor has no udiv_qrnnd but sdiv_qrnnd, go through
2188 __udiv_w_sdiv (defined in libgcc or elsewhere). */
2189 #if !defined (udiv_qrnnd) && defined (sdiv_qrnnd) \
2190 && ! defined (LONGLONG_STANDALONE)
2191 #define udiv_qrnnd(q, r, nh, nl, d) \
2192 do { \
2193 UWtype __r; \
    (q) = __MPN(udiv_w_sdiv) (&__r, (nh), (nl), (d));			\
2195 (r) = __r; \
2196 } while (0)
2197 __GMP_DECLSPEC UWtype __MPN(udiv_w_sdiv) (UWtype *, UWtype, UWtype, UWtype);
2198 #endif
2199
2200 /* If udiv_qrnnd was not defined for this processor, use __udiv_qrnnd_c. */
2201 #if !defined (udiv_qrnnd)
2202 #define UDIV_NEEDS_NORMALIZATION 1
2203 #define udiv_qrnnd __udiv_qrnnd_c
2204 #endif
2205
2206 #if !defined (count_leading_zeros)
2207 #define count_leading_zeros(count, x) \
2208 do { \
2209 UWtype __xr = (x); \
2210 UWtype __a; \
2211 \
2212 if (W_TYPE_SIZE == 32) \
2213 { \
2214 __a = __xr < ((UWtype) 1 << 2*__BITS4) \
2215 ? (__xr < ((UWtype) 1 << __BITS4) ? 1 : __BITS4 + 1) \
2216 : (__xr < ((UWtype) 1 << 3*__BITS4) ? 2*__BITS4 + 1 \
2217 : 3*__BITS4 + 1); \
2218 } \
2219 else \
2220 { \
2221 for (__a = W_TYPE_SIZE - 8; __a > 0; __a -= 8) \
2222 if (((__xr >> __a) & 0xff) != 0) \
2223 break; \
2224 ++__a; \
2225 } \
2226 \
2227 (count) = W_TYPE_SIZE + 1 - __a - __clz_tab[__xr >> __a]; \
2228 } while (0)
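/* Editorial note on the final line above: __a is chosen so that
   0 <= (__xr >> __a) <= 128, and __clz_tab[k] (the 129-entry table declared
   further below) is floor(log2(k)) + 2 for k >= 1 and 1 for k = 0.  If the
   most significant set bit of __xr is bit p, the lookup therefore yields
   p - __a + 2 and the count comes out as W_TYPE_SIZE - 1 - p, the number of
   leading zeros.  */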
2229 /* This version gives a well-defined value for zero. */
2230 #define COUNT_LEADING_ZEROS_0 (W_TYPE_SIZE - 1)
2231 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
2232 #define COUNT_LEADING_ZEROS_SLOW
2233 #endif
2234
2235 /* clz_tab needed by mpn/x86/pentium/mod_1.asm in a fat binary */
2236 #if HAVE_HOST_CPU_FAMILY_x86 && WANT_FAT_BINARY
2237 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
2238 #endif
2239
2240 #ifdef COUNT_LEADING_ZEROS_NEED_CLZ_TAB
2241 extern const unsigned char __GMP_DECLSPEC __clz_tab[129];
2242 #endif
2243
2244 #if !defined (count_trailing_zeros)
2245 #if !defined (COUNT_LEADING_ZEROS_SLOW)
2246 /* Define count_trailing_zeros using an asm count_leading_zeros. */
2247 #define count_trailing_zeros(count, x) \
2248 do { \
2249 UWtype __ctz_x = (x); \
2250 UWtype __ctz_c; \
2251 ASSERT (__ctz_x != 0); \
2252 count_leading_zeros (__ctz_c, __ctz_x & -__ctz_x); \
2253 (count) = W_TYPE_SIZE - 1 - __ctz_c; \
2254 } while (0)
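/* Editorial note: in two's complement, __ctz_x & -__ctz_x isolates the lowest
   set bit.  If that bit is bit t, count_leading_zeros on it returns
   W_TYPE_SIZE - 1 - t, so subtracting from W_TYPE_SIZE - 1 recovers t, the
   trailing zero count.  The plain C variant below relies on the same
   bit-isolation trick but indexes __clz_tab directly.  */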
2255 #else
2256 /* Define count_trailing_zeros in plain C, assuming small counts are common.
2257 We use clz_tab without ado, since the C count_leading_zeros above will have
2258 pulled it in. */
2259 #define count_trailing_zeros(count, x) \
2260 do { \
2261 UWtype __ctz_x = (x); \
2262 int __ctz_c; \
2263 \
2264 if (LIKELY ((__ctz_x & 0xff) != 0)) \
2265 (count) = __clz_tab[__ctz_x & -__ctz_x] - 2; \
2266 else \
2267 { \
2268 for (__ctz_c = 8 - 2; __ctz_c < W_TYPE_SIZE - 2; __ctz_c += 8) \
2269 { \
2270 __ctz_x >>= 8; \
2271 if (LIKELY ((__ctz_x & 0xff) != 0)) \
2272 break; \
2273 } \
2274 \
2275 (count) = __ctz_c + __clz_tab[__ctz_x & -__ctz_x]; \
2276 } \
2277 } while (0)
2278 #endif
2279 #endif
2280
2281 #ifndef UDIV_NEEDS_NORMALIZATION
2282 #define UDIV_NEEDS_NORMALIZATION 0
2283 #endif
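
/* Illustrative sketch (deliberately not compiled in): one way a caller might
   drive udiv_qrnnd when UDIV_NEEDS_NORMALIZATION is 1, by shifting the
   divisor up until its high bit is set and shifting the remainder back down
   afterwards.  The wrapper function itself is hypothetical; only the macros
   it uses come from this file (ASSERT is from gmp-impl.h).  */
#if 0
static void
example_udiv (UWtype *qp, UWtype *rp, UWtype n1, UWtype n0, UWtype d)
{
  UWtype q, r;
  int cnt = 0;

  if (UDIV_NEEDS_NORMALIZATION)
    {
      count_leading_zeros (cnt, d);
      if (cnt != 0)
        {
          d <<= cnt;
          n1 = (n1 << cnt) | (n0 >> (W_TYPE_SIZE - cnt));
          n0 <<= cnt;
        }
    }
  ASSERT (n1 < d);      /* required by udiv_qrnnd in any case */
  udiv_qrnnd (q, r, n1, n0, d);
  *qp = q;              /* the quotient needs no adjustment */
  *rp = r >> cnt;       /* undo the scaling of the remainder */
}
#endif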
2284
/* Whether udiv_qrnnd is actually implemented with udiv_qrnnd_preinv, and
   hence whether the latter should always be used.  */
2287 #ifndef UDIV_PREINV_ALWAYS
2288 #define UDIV_PREINV_ALWAYS 0
2289 #endif