1 /* longlong.h -- definitions for mixed size 32/64 bit arithmetic.
2
3 Copyright 1991-1994, 1996, 1997, 1999-2005, 2007-2009, 2011-2020 Free Software
4 Foundation, Inc.
5
6 This file is part of the GNU MP Library.
7
8 The GNU MP Library is free software; you can redistribute it and/or modify
9 it under the terms of either:
10
11 * the GNU Lesser General Public License as published by the Free
12 Software Foundation; either version 3 of the License, or (at your
13 option) any later version.
14
15 or
16
17 * the GNU General Public License as published by the Free Software
18 Foundation; either version 2 of the License, or (at your option) any
19 later version.
20
21 or both in parallel, as here.
22
23 The GNU MP Library is distributed in the hope that it will be useful, but
24 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
25 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
26 for more details.
27
28 You should have received copies of the GNU General Public License and the
29 GNU Lesser General Public License along with the GNU MP Library. If not,
30 see https://www.gnu.org/licenses/. */
31
32 /* You have to define the following before including this file:
33
34 UWtype -- An unsigned type, default type for operations (typically a "word")
35 UHWtype -- An unsigned type, at least half the size of UWtype
   UDWtype -- An unsigned type, at least twice as large as UWtype
37 W_TYPE_SIZE -- size in bits of UWtype
38
39 SItype, USItype -- Signed and unsigned 32 bit types
40 DItype, UDItype -- Signed and unsigned 64 bit types
41
42 On a 32 bit machine UWtype should typically be USItype;
43 on a 64 bit machine, UWtype should typically be UDItype.
44
45 Optionally, define:
46
47 LONGLONG_STANDALONE -- Avoid code that needs machine-dependent support files
48 NO_ASM -- Disable inline asm
49
50
51 CAUTION! Using this version of longlong.h outside of GMP is not safe. You
52 need to include gmp.h and gmp-impl.h, or certain things might not work as
53 expected.
54 */
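/* Purely as an illustration (this block is never compiled), one plausible
   set of the definitions described above, assuming a typical 64-bit GCC
   host.  Within GMP the real definitions come from gmp.h and gmp-impl.h.  */
#if 0
typedef int		SItype;		/* signed 32-bit */
typedef unsigned int	USItype;	/* unsigned 32-bit */
typedef long long	DItype;		/* signed 64-bit */
typedef unsigned long long UDItype;	/* unsigned 64-bit */
#define UWtype		UDItype		/* one full word */
#define UHWtype		USItype		/* at least half a word */
#define UDWtype		unsigned __int128  /* at least two words, GCC extension */
#define W_TYPE_SIZE	64
#endif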
55
56 #define __BITS4 (W_TYPE_SIZE / 4)
57 #define __ll_B ((UWtype) 1 << (W_TYPE_SIZE / 2))
58 #define __ll_lowpart(t) ((UWtype) (t) & (__ll_B - 1))
59 #define __ll_highpart(t) ((UWtype) (t) >> (W_TYPE_SIZE / 2))
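/* Note that for any UWtype value t,
   t == __ll_highpart (t) * __ll_B + __ll_lowpart (t).
   For instance with W_TYPE_SIZE == 32, __ll_B is 0x10000 and t == 0x12345678
   splits into high part 0x1234 and low part 0x5678.  */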
60
/* This is used to make sure that no undesirable sharing takes place between
   different libraries that use this file.  */
63 #ifndef __MPN
64 #define __MPN(x) __##x
65 #endif
66
67 /* Define auxiliary asm macros.
68
69 1) umul_ppmm(high_prod, low_prod, multiplier, multiplicand) multiplies two
70 UWtype integers MULTIPLIER and MULTIPLICAND, and generates a two UWtype
71 word product in HIGH_PROD and LOW_PROD.
72
73 2) __umulsidi3(a,b) multiplies two UWtype integers A and B, and returns a
74 UDWtype product. This is just a variant of umul_ppmm.
75
76 3) udiv_qrnnd(quotient, remainder, high_numerator, low_numerator,
77 denominator) divides a UDWtype, composed by the UWtype integers
78 HIGH_NUMERATOR and LOW_NUMERATOR, by DENOMINATOR and places the quotient
79 in QUOTIENT and the remainder in REMAINDER. HIGH_NUMERATOR must be less
   than DENOMINATOR for correct operation.  If, in addition, the most
   significant bit of DENOMINATOR is required to be 1 (i.e. DENOMINATOR must
   be normalized), then the pre-processor symbol UDIV_NEEDS_NORMALIZATION is
   defined to 1.
83
84 4) sdiv_qrnnd(quotient, remainder, high_numerator, low_numerator,
85 denominator). Like udiv_qrnnd but the numbers are signed. The quotient
86 is rounded towards 0.
87
88 5) count_leading_zeros(count, x) counts the number of zero-bits from the
89 msb to the first non-zero bit in the UWtype X. This is the number of
90 steps X needs to be shifted left to set the msb. Undefined for X == 0,
91 unless the symbol COUNT_LEADING_ZEROS_0 is defined to some value.
92
93 6) count_trailing_zeros(count, x) like count_leading_zeros, but counts
94 from the least significant end.
95
96 7) add_ssaaaa(high_sum, low_sum, high_addend_1, low_addend_1,
   high_addend_2, low_addend_2) adds two two-word UWtype integers, composed by
98 HIGH_ADDEND_1 and LOW_ADDEND_1, and HIGH_ADDEND_2 and LOW_ADDEND_2
99 respectively. The result is placed in HIGH_SUM and LOW_SUM. Overflow
100 (i.e. carry out) is not stored anywhere, and is lost.
101
102 8) sub_ddmmss(high_difference, low_difference, high_minuend, low_minuend,
103 high_subtrahend, low_subtrahend) subtracts two two-word UWtype integers,
   composed by HIGH_MINUEND and LOW_MINUEND, and HIGH_SUBTRAHEND and
   LOW_SUBTRAHEND respectively.  The result is placed in HIGH_DIFFERENCE
106 and LOW_DIFFERENCE. Overflow (i.e. carry out) is not stored anywhere,
107 and is lost.
108
109 If any of these macros are left undefined for a particular CPU,
110 C macros are used.
111
112
113 Notes:
114
115 For add_ssaaaa the two high and two low addends can both commute, but
116 unfortunately gcc only supports one "%" commutative in each asm block.
117 This has always been so but is only documented in recent versions
118 (eg. pre-release 3.3). Having two or more "%"s can cause an internal
119 compiler error in certain rare circumstances.
120
121 Apparently it was only the last "%" that was ever actually respected, so
122 the code has been updated to leave just that. Clearly there's a free
123 choice whether high or low should get it, if there's a reason to favour
124 one over the other. Also obviously when the constraints on the two
125 operands are identical there's no benefit to the reloader in any "%" at
126 all.
127
128 */
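/* Purely illustrative usage sketch (never compiled): the macros above
   combine in the obvious way.  The helper functions below are hypothetical;
   nothing like them is defined by this file.  */
#if 0
/* Double-word product of a and b plus a double-word addend; any carry out
   of the top word is lost, as for add_ssaaaa itself.  */
static void
example_mul_add (UWtype *hi, UWtype *lo,
		 UWtype a, UWtype b, UWtype add_hi, UWtype add_lo)
{
  UWtype ph, pl;
  umul_ppmm (ph, pl, a, b);
  add_ssaaaa (*hi, *lo, ph, pl, add_hi, add_lo);
}

/* Divide the three-word number {n2,n1,n0} by d, giving the two-word quotient
   {q1,q0} and a one-word remainder.  Requires n2 < d, and d normalized (msb
   set) when UDIV_NEEDS_NORMALIZATION is 1.  */
static void
example_div_3by1 (UWtype *q1, UWtype *q0, UWtype *r,
		  UWtype n2, UWtype n1, UWtype n0, UWtype d)
{
  UWtype r1;
  udiv_qrnnd (*q1, r1, n2, n1, d);
  udiv_qrnnd (*q0, *r, r1, n0, d);
}
#endif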
129
130 /* The CPUs come in alphabetical order below.
131
132 Please add support for more CPUs here, or improve the current support
133 for the CPUs below! */
134
135
136 /* count_leading_zeros_gcc_clz is count_leading_zeros implemented with gcc
137 3.4 __builtin_clzl or __builtin_clzll, according to our limb size.
138 Similarly count_trailing_zeros_gcc_ctz using __builtin_ctzl or
139 __builtin_ctzll.
140
   These builtins are only used where we have checked what code comes out;
   on some chips they're merely libgcc calls, in which case we instead want
   an inline (either asm or generic C).
144
145 These builtins are better than an asm block of the same insn, since an
146 asm block doesn't give gcc any information about scheduling or resource
147 usage. We keep an asm block for use on prior versions of gcc though.
148
149 For reference, __builtin_ffs existed in gcc prior to __builtin_clz, but
150 it's not used (for count_leading_zeros) because it generally gives extra
151 code to ensure the result is 0 when the input is 0, which we don't need
152 or want. */
153
154 #ifdef _LONG_LONG_LIMB
155 #define count_leading_zeros_gcc_clz(count,x) \
156 do { \
157 ASSERT ((x) != 0); \
158 (count) = __builtin_clzll (x); \
159 } while (0)
160 #else
161 #define count_leading_zeros_gcc_clz(count,x) \
162 do { \
163 ASSERT ((x) != 0); \
164 (count) = __builtin_clzl (x); \
165 } while (0)
166 #endif
167
168 #ifdef _LONG_LONG_LIMB
169 #define count_trailing_zeros_gcc_ctz(count,x) \
170 do { \
171 ASSERT ((x) != 0); \
172 (count) = __builtin_ctzll (x); \
173 } while (0)
174 #else
175 #define count_trailing_zeros_gcc_ctz(count,x) \
176 do { \
177 ASSERT ((x) != 0); \
178 (count) = __builtin_ctzl (x); \
179 } while (0)
180 #endif
181
182
183 /* FIXME: The macros using external routines like __MPN(count_leading_zeros)
184 don't need to be under !NO_ASM */
185 #if ! defined (NO_ASM)
186
187 #if defined (__alpha) && W_TYPE_SIZE == 64
188 /* Most alpha-based machines, except Cray systems. */
189 #if defined (__GNUC__)
190 #if __GMP_GNUC_PREREQ (3,3)
191 #define umul_ppmm(ph, pl, m0, m1) \
192 do { \
193 UDItype __m0 = (m0), __m1 = (m1); \
194 (ph) = __builtin_alpha_umulh (__m0, __m1); \
195 (pl) = __m0 * __m1; \
196 } while (0)
197 #else
198 #define umul_ppmm(ph, pl, m0, m1) \
199 do { \
200 UDItype __m0 = (m0), __m1 = (m1); \
201 __asm__ ("umulh %r1,%2,%0" \
202 : "=r" (ph) \
203 : "%rJ" (__m0), "rI" (__m1)); \
204 (pl) = __m0 * __m1; \
205 } while (0)
206 #endif
207 #else /* ! __GNUC__ */
208 #include <machine/builtins.h>
209 #define umul_ppmm(ph, pl, m0, m1) \
210 do { \
211 UDItype __m0 = (m0), __m1 = (m1); \
212 (ph) = __UMULH (__m0, __m1); \
213 (pl) = __m0 * __m1; \
214 } while (0)
215 #endif
216 #ifndef LONGLONG_STANDALONE
217 #define udiv_qrnnd(q, r, n1, n0, d) \
218 do { UWtype __di; \
219 __di = __MPN(invert_limb) (d); \
220 udiv_qrnnd_preinv (q, r, n1, n0, d, __di); \
221 } while (0)
222 #define UDIV_PREINV_ALWAYS 1
223 #define UDIV_NEEDS_NORMALIZATION 1
224 #endif /* LONGLONG_STANDALONE */
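/* udiv_qrnnd_preinv and __MPN(invert_limb) come from the rest of GMP, hence
   the LONGLONG_STANDALONE guard: the precomputed inverse __di lets the
   division be carried out with multiplications instead of a hardware divide
   instruction.  The same pattern recurs for several CPUs below.  */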
225
226 /* clz_tab is required in all configurations, since mpn/alpha/cntlz.asm
227 always goes into libgmp.so, even when not actually used. */
228 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
229
230 #if defined (__GNUC__) && HAVE_HOST_CPU_alpha_CIX
231 #define count_leading_zeros(COUNT,X) \
232 __asm__("ctlz %1,%0" : "=r"(COUNT) : "r"(X))
233 #define count_trailing_zeros(COUNT,X) \
234 __asm__("cttz %1,%0" : "=r"(COUNT) : "r"(X))
235 #endif /* clz/ctz using cix */
236
237 #if ! defined (count_leading_zeros) \
238 && defined (__GNUC__) && ! defined (LONGLONG_STANDALONE)
239 /* ALPHA_CMPBGE_0 gives "cmpbge $31,src,dst", ie. test src bytes == 0.
240 "$31" is written explicitly in the asm, since an "r" constraint won't
241 select reg 31. There seems no need to worry about "r31" syntax for cray,
242 since gcc itself (pre-release 3.4) emits just $31 in various places. */
243 #define ALPHA_CMPBGE_0(dst, src) \
244 do { asm ("cmpbge $31, %1, %0" : "=r" (dst) : "r" (src)); } while (0)
245 /* Zero bytes are turned into bits with cmpbge, a __clz_tab lookup counts
246 them, locating the highest non-zero byte. A second __clz_tab lookup
247 counts the leading zero bits in that byte, giving the result. */
248 #define count_leading_zeros(count, x) \
249 do { \
250 UWtype __clz__b, __clz__c, __clz__x = (x); \
251 ALPHA_CMPBGE_0 (__clz__b, __clz__x); /* zero bytes */ \
252 __clz__b = __clz_tab [(__clz__b >> 1) ^ 0x7F]; /* 8 to 1 byte */ \
253 __clz__b = __clz__b * 8 - 7; /* 57 to 1 shift */ \
254 __clz__x >>= __clz__b; \
255 __clz__c = __clz_tab [__clz__x]; /* 8 to 1 bit */ \
256 __clz__b = 65 - __clz__b; \
257 (count) = __clz__b - __clz__c; \
258 } while (0)
259 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
260 #endif /* clz using cmpbge */
261
262 #if ! defined (count_leading_zeros) && ! defined (LONGLONG_STANDALONE)
263 #if HAVE_ATTRIBUTE_CONST
264 long __MPN(count_leading_zeros) (UDItype) __attribute__ ((const));
265 #else
266 long __MPN(count_leading_zeros) (UDItype);
267 #endif
268 #define count_leading_zeros(count, x) \
269 ((count) = __MPN(count_leading_zeros) (x))
270 #endif /* clz using mpn */
271 #endif /* __alpha */
272
273 #if defined (__AVR) && W_TYPE_SIZE == 8
274 #define umul_ppmm(ph, pl, m0, m1) \
275 do { \
276 unsigned short __p = (unsigned short) (m0) * (m1); \
277 (ph) = __p >> 8; \
278 (pl) = __p; \
279 } while (0)
280 #endif /* AVR */
281
282 #if defined (_CRAY) && W_TYPE_SIZE == 64
283 #include <intrinsics.h>
284 #define UDIV_PREINV_ALWAYS 1
285 #define UDIV_NEEDS_NORMALIZATION 1
286 long __MPN(count_leading_zeros) (UDItype);
287 #define count_leading_zeros(count, x) \
288 ((count) = _leadz ((UWtype) (x)))
289 #if defined (_CRAYIEEE) /* I.e., Cray T90/ieee, T3D, and T3E */
290 #define umul_ppmm(ph, pl, m0, m1) \
291 do { \
292 UDItype __m0 = (m0), __m1 = (m1); \
293 (ph) = _int_mult_upper (__m0, __m1); \
294 (pl) = __m0 * __m1; \
295 } while (0)
296 #ifndef LONGLONG_STANDALONE
297 #define udiv_qrnnd(q, r, n1, n0, d) \
298 do { UWtype __di; \
299 __di = __MPN(invert_limb) (d); \
300 udiv_qrnnd_preinv (q, r, n1, n0, d, __di); \
301 } while (0)
302 #endif /* LONGLONG_STANDALONE */
303 #endif /* _CRAYIEEE */
304 #endif /* _CRAY */
305
306 #if defined (__ia64) && W_TYPE_SIZE == 64
307 /* This form encourages gcc (pre-release 3.4 at least) to emit predicated
308 "sub r=r,r" and "sub r=r,r,1", giving a 2 cycle latency. The generic
309 code using "al<bl" arithmetically comes out making an actual 0 or 1 in a
310 register, which takes an extra cycle. */
311 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
312 do { \
313 UWtype __x; \
314 __x = (al) - (bl); \
315 if ((al) < (bl)) \
316 (sh) = (ah) - (bh) - 1; \
317 else \
318 (sh) = (ah) - (bh); \
319 (sl) = __x; \
320 } while (0)
321 #if defined (__GNUC__) && ! defined (__INTEL_COMPILER)
322 /* Do both product parts in assembly, since that gives better code with
323 all gcc versions. Some callers will just use the upper part, and in
324 that situation we waste an instruction, but not any cycles. */
325 #define umul_ppmm(ph, pl, m0, m1) \
326 __asm__ ("xma.hu %0 = %2, %3, f0\n\txma.l %1 = %2, %3, f0" \
327 : "=&f" (ph), "=f" (pl) \
328 : "f" (m0), "f" (m1))
329 #define count_leading_zeros(count, x) \
330 do { \
331 UWtype _x = (x), _y, _a, _c; \
332 __asm__ ("mux1 %0 = %1, @rev" : "=r" (_y) : "r" (_x)); \
333 __asm__ ("czx1.l %0 = %1" : "=r" (_a) : "r" (-_y | _y)); \
334 _c = (_a - 1) << 3; \
335 _x >>= _c; \
336 if (_x >= 1 << 4) \
337 _x >>= 4, _c += 4; \
338 if (_x >= 1 << 2) \
339 _x >>= 2, _c += 2; \
340 _c += _x >> 1; \
341 (count) = W_TYPE_SIZE - 1 - _c; \
342 } while (0)
343 /* similar to what gcc does for __builtin_ffs, but 0 based rather than 1
344 based, and we don't need a special case for x==0 here */
345 #define count_trailing_zeros(count, x) \
346 do { \
347 UWtype __ctz_x = (x); \
348 __asm__ ("popcnt %0 = %1" \
349 : "=r" (count) \
350 : "r" ((__ctz_x-1) & ~__ctz_x)); \
351 } while (0)
352 #endif
353 #if defined (__INTEL_COMPILER)
354 #include <ia64intrin.h>
355 #define umul_ppmm(ph, pl, m0, m1) \
356 do { \
357 UWtype __m0 = (m0), __m1 = (m1); \
358 ph = _m64_xmahu (__m0, __m1, 0); \
359 pl = __m0 * __m1; \
360 } while (0)
361 #endif
362 #ifndef LONGLONG_STANDALONE
363 #define udiv_qrnnd(q, r, n1, n0, d) \
364 do { UWtype __di; \
365 __di = __MPN(invert_limb) (d); \
366 udiv_qrnnd_preinv (q, r, n1, n0, d, __di); \
367 } while (0)
368 #define UDIV_PREINV_ALWAYS 1
369 #define UDIV_NEEDS_NORMALIZATION 1
370 #endif
371 #endif
372
373
374 #if defined (__GNUC__)
375
376 /* We sometimes need to clobber "cc" with gcc2, but that would not be
377 understood by gcc1. Use cpp to avoid major code duplication. */
378 #if __GNUC__ < 2
379 #define __CLOBBER_CC
380 #define __AND_CLOBBER_CC
381 #else /* __GNUC__ >= 2 */
382 #define __CLOBBER_CC : "cc"
383 #define __AND_CLOBBER_CC , "cc"
384 #endif /* __GNUC__ < 2 */
385
386 #if (defined (__a29k__) || defined (_AM29K)) && W_TYPE_SIZE == 32
387 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
388 __asm__ ("add %1,%4,%5\n\taddc %0,%2,%3" \
389 : "=r" (sh), "=&r" (sl) \
390 : "r" (ah), "rI" (bh), "%r" (al), "rI" (bl))
391 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
392 __asm__ ("sub %1,%4,%5\n\tsubc %0,%2,%3" \
393 : "=r" (sh), "=&r" (sl) \
394 : "r" (ah), "rI" (bh), "r" (al), "rI" (bl))
395 #define umul_ppmm(xh, xl, m0, m1) \
396 do { \
397 USItype __m0 = (m0), __m1 = (m1); \
398 __asm__ ("multiplu %0,%1,%2" \
399 : "=r" (xl) \
400 : "r" (__m0), "r" (__m1)); \
401 __asm__ ("multmu %0,%1,%2" \
402 : "=r" (xh) \
403 : "r" (__m0), "r" (__m1)); \
404 } while (0)
405 #define udiv_qrnnd(q, r, n1, n0, d) \
406 __asm__ ("dividu %0,%3,%4" \
407 : "=r" (q), "=q" (r) \
408 : "1" (n1), "r" (n0), "r" (d))
409 #define count_leading_zeros(count, x) \
410 __asm__ ("clz %0,%1" \
411 : "=r" (count) \
412 : "r" (x))
413 #define COUNT_LEADING_ZEROS_0 32
414 #endif /* __a29k__ */
415
416 #if defined (__arc__)
417 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
418 __asm__ ("add.f\t%1, %4, %5\n\tadc\t%0, %2, %3" \
419 : "=r" (sh), \
420 "=&r" (sl) \
421 : "r" ((USItype) (ah)), \
422 "rICal" ((USItype) (bh)), \
423 "%r" ((USItype) (al)), \
424 "rICal" ((USItype) (bl)))
425 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
426 __asm__ ("sub.f\t%1, %4, %5\n\tsbc\t%0, %2, %3" \
427 : "=r" (sh), \
428 "=&r" (sl) \
429 : "r" ((USItype) (ah)), \
430 "rICal" ((USItype) (bh)), \
431 "r" ((USItype) (al)), \
432 "rICal" ((USItype) (bl)))
433 #endif
434
435 #if defined (__arm__) && (defined (__thumb2__) || !defined (__thumb__)) \
436 && W_TYPE_SIZE == 32
437 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
438 do { \
439 if (__builtin_constant_p (bl) && -(USItype)(bl) < (USItype)(bl)) \
440 __asm__ ("subs\t%1, %4, %5\n\tadc\t%0, %2, %3" \
441 : "=r" (sh), "=&r" (sl) \
442 : "r" (ah), "rI" (bh), \
443 "%r" (al), "rI" (-(USItype)(bl)) __CLOBBER_CC); \
444 else \
445 __asm__ ("adds\t%1, %4, %5\n\tadc\t%0, %2, %3" \
446 : "=r" (sh), "=&r" (sl) \
447 : "r" (ah), "rI" (bh), "%r" (al), "rI" (bl) __CLOBBER_CC); \
448 } while (0)
449 /* FIXME: Extend the immediate range for the low word by using both ADDS and
450 SUBS, since they set carry in the same way. We need separate definitions
451 for thumb and non-thumb since thumb lacks RSC. */
452 #if defined (__thumb__)
453 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
454 do { \
455 if (__builtin_constant_p (ah) && __builtin_constant_p (bh) \
456 && (ah) == (bh)) \
457 __asm__ ("subs\t%1, %2, %3\n\tsbc\t%0, %0, %0" \
458 : "=r" (sh), "=r" (sl) \
459 : "r" (al), "rI" (bl) __CLOBBER_CC); \
460 else if (__builtin_constant_p (al)) \
461 __asm__ ("rsbs\t%1, %5, %4\n\tsbc\t%0, %2, %3" \
462 : "=r" (sh), "=&r" (sl) \
463 : "r" (ah), "rI" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \
464 else \
465 __asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3" \
466 : "=r" (sh), "=&r" (sl) \
467 : "r" (ah), "rI" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
468 } while (0)
469 #else
470 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
471 do { \
472 if (__builtin_constant_p (ah) && __builtin_constant_p (bh) \
473 && (ah) == (bh)) \
474 __asm__ ("subs\t%1, %2, %3\n\tsbc\t%0, %0, %0" \
475 : "=r" (sh), "=r" (sl) \
476 : "r" (al), "rI" (bl) __CLOBBER_CC); \
477 else if (__builtin_constant_p (al)) \
478 { \
479 if (__builtin_constant_p (ah)) \
480 __asm__ ("rsbs\t%1, %5, %4\n\trsc\t%0, %3, %2" \
481 : "=r" (sh), "=&r" (sl) \
482 : "rI" (ah), "r" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \
483 else \
484 __asm__ ("rsbs\t%1, %5, %4\n\tsbc\t%0, %2, %3" \
485 : "=r" (sh), "=&r" (sl) \
486 : "r" (ah), "rI" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \
487 } \
488 else if (__builtin_constant_p (ah)) \
489 { \
490 if (__builtin_constant_p (bl)) \
491 __asm__ ("subs\t%1, %4, %5\n\trsc\t%0, %3, %2" \
492 : "=r" (sh), "=&r" (sl) \
493 : "rI" (ah), "r" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
494 else \
495 __asm__ ("rsbs\t%1, %5, %4\n\trsc\t%0, %3, %2" \
496 : "=r" (sh), "=&r" (sl) \
497 : "rI" (ah), "r" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \
498 } \
499 else \
500 __asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3" \
501 : "=r" (sh), "=&r" (sl) \
502 : "r" (ah), "rI" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
503 } while (0)
504 #endif
505 #if defined (__ARM_ARCH_2__) || defined (__ARM_ARCH_2A__) \
506 || defined (__ARM_ARCH_3__)
507 #define umul_ppmm(xh, xl, a, b) \
508 do { \
509 register USItype __t0, __t1, __t2; \
510 __asm__ ("%@ Inlined umul_ppmm\n" \
511 " mov %2, %5, lsr #16\n" \
512 " mov %0, %6, lsr #16\n" \
513 " bic %3, %5, %2, lsl #16\n" \
514 " bic %4, %6, %0, lsl #16\n" \
515 " mul %1, %3, %4\n" \
516 " mul %4, %2, %4\n" \
517 " mul %3, %0, %3\n" \
518 " mul %0, %2, %0\n" \
519 " adds %3, %4, %3\n" \
520 " addcs %0, %0, #65536\n" \
521 " adds %1, %1, %3, lsl #16\n" \
522 " adc %0, %0, %3, lsr #16" \
523 : "=&r" ((USItype) (xh)), "=r" ((USItype) (xl)), \
524 "=&r" (__t0), "=&r" (__t1), "=r" (__t2) \
525 : "r" ((USItype) (a)), "r" ((USItype) (b)) __CLOBBER_CC); \
526 } while (0)
527 #ifndef LONGLONG_STANDALONE
528 #define udiv_qrnnd(q, r, n1, n0, d) \
529 do { UWtype __r; \
530 (q) = __MPN(udiv_qrnnd) (&__r, (n1), (n0), (d)); \
531 (r) = __r; \
532 } while (0)
533 extern UWtype __MPN(udiv_qrnnd) (UWtype *, UWtype, UWtype, UWtype);
534 #endif /* LONGLONG_STANDALONE */
535 #else /* ARMv4 or newer */
536 #define umul_ppmm(xh, xl, a, b) \
537 __asm__ ("umull %0,%1,%2,%3" : "=&r" (xl), "=&r" (xh) : "r" (a), "r" (b))
538 #define smul_ppmm(xh, xl, a, b) \
539 __asm__ ("smull %0,%1,%2,%3" : "=&r" (xl), "=&r" (xh) : "r" (a), "r" (b))
540 #ifndef LONGLONG_STANDALONE
541 #define udiv_qrnnd(q, r, n1, n0, d) \
542 do { UWtype __di; \
543 __di = __MPN(invert_limb) (d); \
544 udiv_qrnnd_preinv (q, r, n1, n0, d, __di); \
545 } while (0)
546 #define UDIV_PREINV_ALWAYS 1
547 #define UDIV_NEEDS_NORMALIZATION 1
548 #endif /* LONGLONG_STANDALONE */
549 #endif /* defined(__ARM_ARCH_2__) ... */
550 #define count_leading_zeros(count, x) count_leading_zeros_gcc_clz(count, x)
551 #define count_trailing_zeros(count, x) count_trailing_zeros_gcc_ctz(count, x)
552 #endif /* __arm__ */
553
554 #if defined (__aarch64__) && W_TYPE_SIZE == 64
555 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
556 do { \
557 if (__builtin_constant_p (bl) && ~(UDItype)(bl) <= (UDItype)(bl)) \
558 __asm__ ("subs\t%1, %x4, %5\n\tadc\t%0, %x2, %x3" \
559 : "=r" (sh), "=&r" (sl) \
560 : "rZ" ((UDItype)(ah)), "rZ" ((UDItype)(bh)), \
561 "%r" ((UDItype)(al)), "rI" (-(UDItype)(bl)) __CLOBBER_CC);\
562 else \
563 __asm__ ("adds\t%1, %x4, %5\n\tadc\t%0, %x2, %x3" \
564 : "=r" (sh), "=&r" (sl) \
565 : "rZ" ((UDItype)(ah)), "rZ" ((UDItype)(bh)), \
566 "%r" ((UDItype)(al)), "rI" ((UDItype)(bl)) __CLOBBER_CC);\
567 } while (0)
568 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
569 do { \
570 if (__builtin_constant_p (bl) && ~(UDItype)(bl) <= (UDItype)(bl)) \
571 __asm__ ("adds\t%1, %x4, %5\n\tsbc\t%0, %x2, %x3" \
572 : "=r,r" (sh), "=&r,&r" (sl) \
573 : "rZ,rZ" ((UDItype)(ah)), "rZ,rZ" ((UDItype)(bh)), \
574 "r,Z" ((UDItype)(al)), "rI,r" (-(UDItype)(bl)) __CLOBBER_CC);\
575 else \
576 __asm__ ("subs\t%1, %x4, %5\n\tsbc\t%0, %x2, %x3" \
577 : "=r,r" (sh), "=&r,&r" (sl) \
578 : "rZ,rZ" ((UDItype)(ah)), "rZ,rZ" ((UDItype)(bh)), \
579 "r,Z" ((UDItype)(al)), "rI,r" ((UDItype)(bl)) __CLOBBER_CC);\
  } while (0)
581 #if __GMP_GNUC_PREREQ (4,9)
582 #define umul_ppmm(w1, w0, u, v) \
583 do { \
584 typedef unsigned int __ll_UTItype __attribute__((mode(TI))); \
585 __ll_UTItype __ll = (__ll_UTItype)(u) * (v); \
586 w1 = __ll >> 64; \
587 w0 = __ll; \
588 } while (0)
589 #endif
590 #if !defined (umul_ppmm)
591 #define umul_ppmm(ph, pl, m0, m1) \
592 do { \
593 UDItype __m0 = (m0), __m1 = (m1); \
594 __asm__ ("umulh\t%0, %1, %2" : "=r" (ph) : "r" (__m0), "r" (__m1)); \
595 (pl) = __m0 * __m1; \
596 } while (0)
597 #endif
598 #define count_leading_zeros(count, x) count_leading_zeros_gcc_clz(count, x)
599 #define count_trailing_zeros(count, x) count_trailing_zeros_gcc_ctz(count, x)
600 #endif /* __aarch64__ */
601
602 #if defined (__clipper__) && W_TYPE_SIZE == 32
603 #define umul_ppmm(w1, w0, u, v) \
604 ({union {UDItype __ll; \
605 struct {USItype __l, __h;} __i; \
606 } __x; \
607 __asm__ ("mulwux %2,%0" \
608 : "=r" (__x.__ll) \
609 : "%0" ((USItype)(u)), "r" ((USItype)(v))); \
610 (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
611 #define smul_ppmm(w1, w0, u, v) \
612 ({union {DItype __ll; \
613 struct {SItype __l, __h;} __i; \
614 } __x; \
615 __asm__ ("mulwx %2,%0" \
616 : "=r" (__x.__ll) \
617 : "%0" ((SItype)(u)), "r" ((SItype)(v))); \
618 (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
619 #define __umulsidi3(u, v) \
620 ({UDItype __w; \
621 __asm__ ("mulwux %2,%0" \
622 : "=r" (__w) : "%0" ((USItype)(u)), "r" ((USItype)(v))); \
623 __w; })
624 #endif /* __clipper__ */
625
626 /* Fujitsu vector computers. */
627 #if defined (__uxp__) && W_TYPE_SIZE == 32
628 #define umul_ppmm(ph, pl, u, v) \
629 do { \
630 union {UDItype __ll; \
631 struct {USItype __h, __l;} __i; \
632 } __x; \
633 __asm__ ("mult.lu %1,%2,%0" : "=r" (__x.__ll) : "%r" (u), "rK" (v));\
634 (ph) = __x.__i.__h; \
635 (pl) = __x.__i.__l; \
636 } while (0)
637 #define smul_ppmm(ph, pl, u, v) \
638 do { \
639 union {UDItype __ll; \
640 struct {USItype __h, __l;} __i; \
641 } __x; \
642 __asm__ ("mult.l %1,%2,%0" : "=r" (__x.__ll) : "%r" (u), "rK" (v)); \
643 (ph) = __x.__i.__h; \
644 (pl) = __x.__i.__l; \
645 } while (0)
646 #endif
647
648 #if defined (__gmicro__) && W_TYPE_SIZE == 32
649 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
650 __asm__ ("add.w %5,%1\n\taddx %3,%0" \
651 : "=g" (sh), "=&g" (sl) \
652 : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \
653 "%1" ((USItype)(al)), "g" ((USItype)(bl)))
654 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
655 __asm__ ("sub.w %5,%1\n\tsubx %3,%0" \
656 : "=g" (sh), "=&g" (sl) \
657 : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \
658 "1" ((USItype)(al)), "g" ((USItype)(bl)))
659 #define umul_ppmm(ph, pl, m0, m1) \
660 __asm__ ("mulx %3,%0,%1" \
661 : "=g" (ph), "=r" (pl) \
662 : "%0" ((USItype)(m0)), "g" ((USItype)(m1)))
663 #define udiv_qrnnd(q, r, nh, nl, d) \
664 __asm__ ("divx %4,%0,%1" \
665 : "=g" (q), "=r" (r) \
666 : "1" ((USItype)(nh)), "0" ((USItype)(nl)), "g" ((USItype)(d)))
667 #define count_leading_zeros(count, x) \
668 __asm__ ("bsch/1 %1,%0" \
669 : "=g" (count) : "g" ((USItype)(x)), "0" ((USItype)0))
670 #endif
671
672 #if defined (__hppa) && W_TYPE_SIZE == 32
673 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
674 __asm__ ("add%I5 %5,%r4,%1\n\taddc %r2,%r3,%0" \
675 : "=r" (sh), "=&r" (sl) \
676 : "rM" (ah), "rM" (bh), "%rM" (al), "rI" (bl))
677 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
678 __asm__ ("sub%I4 %4,%r5,%1\n\tsubb %r2,%r3,%0" \
679 : "=r" (sh), "=&r" (sl) \
680 : "rM" (ah), "rM" (bh), "rI" (al), "rM" (bl))
681 #if defined (_PA_RISC1_1)
682 #define umul_ppmm(wh, wl, u, v) \
683 do { \
684 union {UDItype __ll; \
685 struct {USItype __h, __l;} __i; \
686 } __x; \
687 __asm__ ("xmpyu %1,%2,%0" : "=*f" (__x.__ll) : "*f" (u), "*f" (v)); \
688 (wh) = __x.__i.__h; \
689 (wl) = __x.__i.__l; \
690 } while (0)
691 #endif
692 #define count_leading_zeros(count, x) \
693 do { \
694 USItype __tmp; \
695 __asm__ ( \
696 "ldi 1,%0\n" \
697 " extru,= %1,15,16,%%r0 ; Bits 31..16 zero?\n" \
698 " extru,tr %1,15,16,%1 ; No. Shift down, skip add.\n" \
699 " ldo 16(%0),%0 ; Yes. Perform add.\n" \
700 " extru,= %1,23,8,%%r0 ; Bits 15..8 zero?\n" \
701 " extru,tr %1,23,8,%1 ; No. Shift down, skip add.\n" \
702 " ldo 8(%0),%0 ; Yes. Perform add.\n" \
703 " extru,= %1,27,4,%%r0 ; Bits 7..4 zero?\n" \
704 " extru,tr %1,27,4,%1 ; No. Shift down, skip add.\n" \
705 " ldo 4(%0),%0 ; Yes. Perform add.\n" \
706 " extru,= %1,29,2,%%r0 ; Bits 3..2 zero?\n" \
707 " extru,tr %1,29,2,%1 ; No. Shift down, skip add.\n" \
708 " ldo 2(%0),%0 ; Yes. Perform add.\n" \
709 " extru %1,30,1,%1 ; Extract bit 1.\n" \
710 " sub %0,%1,%0 ; Subtract it.\n" \
711 : "=r" (count), "=r" (__tmp) : "1" (x)); \
712 } while (0)
713 #endif /* hppa */
714
715 /* These macros are for ABI=2.0w. In ABI=2.0n they can't be used, since GCC
   (3.2) puts a long long into two adjacent 32-bit registers.  Presumably this
717 is just a case of no direct support for 2.0n but treating it like 1.0. */
718 #if defined (__hppa) && W_TYPE_SIZE == 64 && ! defined (_LONG_LONG_LIMB)
719 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
720 __asm__ ("add%I5 %5,%r4,%1\n\tadd,dc %r2,%r3,%0" \
721 : "=r" (sh), "=&r" (sl) \
722 : "rM" (ah), "rM" (bh), "%rM" (al), "rI" (bl))
723 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
724 __asm__ ("sub%I4 %4,%r5,%1\n\tsub,db %r2,%r3,%0" \
725 : "=r" (sh), "=&r" (sl) \
726 : "rM" (ah), "rM" (bh), "rI" (al), "rM" (bl))
727 #endif /* hppa */
728
729 #if (defined (__i370__) || defined (__s390__) || defined (__mvs__)) && W_TYPE_SIZE == 32
730 #if defined (__zarch__) || defined (HAVE_HOST_CPU_s390_zarch)
731 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
732 do { \
733 /* if (__builtin_constant_p (bl)) \
734 __asm__ ("alfi\t%1,%o5\n\talcr\t%0,%3" \
735 : "=r" (sh), "=&r" (sl) \
736 : "0" (ah), "r" (bh), "%1" (al), "n" (bl) __CLOBBER_CC);\
737 else \
738 */ __asm__ ("alr\t%1,%5\n\talcr\t%0,%3" \
739 : "=r" (sh), "=&r" (sl) \
740 : "0" (ah), "r" (bh), "%1" (al), "r" (bl)__CLOBBER_CC); \
741 } while (0)
742 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
743 do { \
744 /* if (__builtin_constant_p (bl)) \
745 __asm__ ("slfi\t%1,%o5\n\tslbr\t%0,%3" \
746 : "=r" (sh), "=&r" (sl) \
747 : "0" (ah), "r" (bh), "1" (al), "n" (bl) __CLOBBER_CC); \
748 else \
749 */ __asm__ ("slr\t%1,%5\n\tslbr\t%0,%3" \
750 : "=r" (sh), "=&r" (sl) \
751 : "0" (ah), "r" (bh), "1" (al), "r" (bl) __CLOBBER_CC); \
752 } while (0)
753 #if __GMP_GNUC_PREREQ (4,5)
754 #define umul_ppmm(xh, xl, m0, m1) \
755 do { \
756 union {UDItype __ll; \
757 struct {USItype __h, __l;} __i; \
758 } __x; \
759 __x.__ll = (UDItype) (m0) * (UDItype) (m1); \
760 (xh) = __x.__i.__h; (xl) = __x.__i.__l; \
761 } while (0)
762 #else
763 #if 0
764 /* FIXME: this fails if gcc knows about the 64-bit registers. Use only
765 with a new enough processor pretending we have 32-bit registers. */
766 #define umul_ppmm(xh, xl, m0, m1) \
767 do { \
768 union {UDItype __ll; \
769 struct {USItype __h, __l;} __i; \
770 } __x; \
771 __asm__ ("mlr\t%0,%2" \
772 : "=r" (__x.__ll) \
773 : "%0" (m0), "r" (m1)); \
774 (xh) = __x.__i.__h; (xl) = __x.__i.__l; \
775 } while (0)
776 #else
777 #define umul_ppmm(xh, xl, m0, m1) \
778 do { \
779 /* When we have 64-bit regs and gcc is aware of that, we cannot simply use
780 DImode for the product, since that would be allocated to a single 64-bit
781 register, whereas mlr uses the low 32-bits of an even-odd register pair.
782 */ \
783 register USItype __r0 __asm__ ("0"); \
784 register USItype __r1 __asm__ ("1") = (m0); \
785 __asm__ ("mlr\t%0,%3" \
786 : "=r" (__r0), "=r" (__r1) \
787 : "r" (__r1), "r" (m1)); \
788 (xh) = __r0; (xl) = __r1; \
789 } while (0)
790 #endif /* if 0 */
791 #endif
792 #if 0
793 /* FIXME: this fails if gcc knows about the 64-bit registers. Use only
794 with a new enough processor pretending we have 32-bit registers. */
795 #define udiv_qrnnd(q, r, n1, n0, d) \
796 do { \
797 union {UDItype __ll; \
798 struct {USItype __h, __l;} __i; \
799 } __x; \
800 __x.__i.__h = n1; __x.__i.__l = n0; \
801 __asm__ ("dlr\t%0,%2" \
802 : "=r" (__x.__ll) \
803 : "0" (__x.__ll), "r" (d)); \
804 (q) = __x.__i.__l; (r) = __x.__i.__h; \
805 } while (0)
806 #else
807 #define udiv_qrnnd(q, r, n1, n0, d) \
808 do { \
809 register USItype __r0 __asm__ ("0") = (n1); \
810 register USItype __r1 __asm__ ("1") = (n0); \
811 __asm__ ("dlr\t%0,%4" \
812 : "=r" (__r0), "=r" (__r1) \
813 : "r" (__r0), "r" (__r1), "r" (d)); \
814 (q) = __r1; (r) = __r0; \
815 } while (0)
816 #endif /* if 0 */
817 #else /* if __zarch__ */
818 /* FIXME: this fails if gcc knows about the 64-bit registers. */
819 #define smul_ppmm(xh, xl, m0, m1) \
820 do { \
821 union {DItype __ll; \
822 struct {USItype __h, __l;} __i; \
823 } __x; \
824 __asm__ ("mr\t%0,%2" \
825 : "=r" (__x.__ll) \
826 : "%0" (m0), "r" (m1)); \
827 (xh) = __x.__i.__h; (xl) = __x.__i.__l; \
828 } while (0)
829 /* FIXME: this fails if gcc knows about the 64-bit registers. */
830 #define sdiv_qrnnd(q, r, n1, n0, d) \
831 do { \
832 union {DItype __ll; \
833 struct {USItype __h, __l;} __i; \
834 } __x; \
835 __x.__i.__h = n1; __x.__i.__l = n0; \
836 __asm__ ("dr\t%0,%2" \
837 : "=r" (__x.__ll) \
838 : "0" (__x.__ll), "r" (d)); \
839 (q) = __x.__i.__l; (r) = __x.__i.__h; \
840 } while (0)
841 #endif /* if __zarch__ */
842 #endif
843
844 #if defined (__s390x__) && W_TYPE_SIZE == 64
845 /* We need to cast operands with register constraints, otherwise their types
846 will be assumed to be SImode by gcc. For these machines, such operations
847 will insert a value into the low 32 bits, and leave the high 32 bits with
848 garbage. */
849 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
850 do { \
851 __asm__ ("algr\t%1,%5\n\talcgr\t%0,%3" \
852 : "=r" (sh), "=&r" (sl) \
853 : "0" ((UDItype)(ah)), "r" ((UDItype)(bh)), \
854 "%1" ((UDItype)(al)), "r" ((UDItype)(bl)) __CLOBBER_CC); \
855 } while (0)
856 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
857 do { \
858 __asm__ ("slgr\t%1,%5\n\tslbgr\t%0,%3" \
859 : "=r" (sh), "=&r" (sl) \
860 : "0" ((UDItype)(ah)), "r" ((UDItype)(bh)), \
861 "1" ((UDItype)(al)), "r" ((UDItype)(bl)) __CLOBBER_CC); \
862 } while (0)
863 #if !defined (__clang__)
864 #define umul_ppmm(xh, xl, m0, m1) \
865 do { \
866 union {unsigned int __attribute__ ((mode(TI))) __ll; \
867 struct {UDItype __h, __l;} __i; \
868 } __x; \
869 __asm__ ("mlgr\t%0,%2" \
870 : "=r" (__x.__ll) \
871 : "%0" ((UDItype)(m0)), "r" ((UDItype)(m1))); \
872 (xh) = __x.__i.__h; (xl) = __x.__i.__l; \
873 } while (0)
874 #define udiv_qrnnd(q, r, n1, n0, d) \
875 do { \
876 union {unsigned int __attribute__ ((mode(TI))) __ll; \
877 struct {UDItype __h, __l;} __i; \
878 } __x; \
879 __x.__i.__h = n1; __x.__i.__l = n0; \
880 __asm__ ("dlgr\t%0,%2" \
881 : "=r" (__x.__ll) \
882 : "0" (__x.__ll), "r" ((UDItype)(d))); \
883 (q) = __x.__i.__l; (r) = __x.__i.__h; \
884 } while (0)
885 #endif
886 #if 0 /* FIXME: Enable for z10 (?) */
887 #define count_leading_zeros(cnt, x) \
888 do { \
889 union {unsigned int __attribute__ ((mode(TI))) __ll; \
890 struct {UDItype __h, __l;} __i; \
891 } __clr_cnt; \
892 __asm__ ("flogr\t%0,%1" \
893 : "=r" (__clr_cnt.__ll) \
894 : "r" (x) __CLOBBER_CC); \
895 (cnt) = __clr_cnt.__i.__h; \
896 } while (0)
897 #endif
898 #endif
899
900 /* On x86 and x86_64, every asm implicitly clobbers "flags" and "fpsr",
901 so we don't need __CLOBBER_CC. */
902 #if (defined (__i386__) || defined (__i486__)) && W_TYPE_SIZE == 32
903 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
904 __asm__ ("addl %5,%k1\n\tadcl %3,%k0" \
905 : "=r" (sh), "=&r" (sl) \
906 : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \
907 "%1" ((USItype)(al)), "g" ((USItype)(bl)))
908 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
909 __asm__ ("subl %5,%k1\n\tsbbl %3,%k0" \
910 : "=r" (sh), "=&r" (sl) \
911 : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \
912 "1" ((USItype)(al)), "g" ((USItype)(bl)))
913 #define umul_ppmm(w1, w0, u, v) \
914 __asm__ ("mull %3" \
915 : "=a" (w0), "=d" (w1) \
916 : "%0" ((USItype)(u)), "rm" ((USItype)(v)))
917 #define udiv_qrnnd(q, r, n1, n0, dx) /* d renamed to dx avoiding "=d" */\
918 __asm__ ("divl %4" /* stringification in K&R C */ \
919 : "=a" (q), "=d" (r) \
920 : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "rm" ((USItype)(dx)))
921
922 #if HAVE_HOST_CPU_i586 || HAVE_HOST_CPU_pentium || HAVE_HOST_CPU_pentiummmx
/* Pentium bsrl takes between 10 and 72 cycles depending on where the most
   significant 1 bit is, hence the use of the following alternatives.  bsfl
   is slow too, between 18 and 42 cycles depending on where the least
   significant 1 bit is, so let the generic count_trailing_zeros below make
   use of the count_leading_zeros here too.  */
928
929 #if HAVE_HOST_CPU_pentiummmx && ! defined (LONGLONG_STANDALONE)
930 /* The following should be a fixed 14 or 15 cycles, but possibly plus an L1
931 cache miss reading from __clz_tab. For P55 it's favoured over the float
932 below so as to avoid mixing MMX and x87, since the penalty for switching
933 between the two is about 100 cycles.
934
935 The asm block sets __shift to -3 if the high 24 bits are clear, -2 for
936 16, -1 for 8, or 0 otherwise. This could be written equivalently as
937 follows, but as of gcc 2.95.2 it results in conditional jumps.
938
939 __shift = -(__n < 0x1000000);
940 __shift -= (__n < 0x10000);
941 __shift -= (__n < 0x100);
942
943 The middle two sbbl and cmpl's pair, and with luck something gcc
944 generates might pair with the first cmpl and the last sbbl. The "32+1"
945 constant could be folded into __clz_tab[], but it doesn't seem worth
946 making a different table just for that. */
947
948 #define count_leading_zeros(c,n) \
949 do { \
950 USItype __n = (n); \
951 USItype __shift; \
952 __asm__ ("cmpl $0x1000000, %1\n" \
953 "sbbl %0, %0\n" \
954 "cmpl $0x10000, %1\n" \
955 "sbbl $0, %0\n" \
956 "cmpl $0x100, %1\n" \
957 "sbbl $0, %0\n" \
958 : "=&r" (__shift) : "r" (__n)); \
959 __shift = __shift*8 + 24 + 1; \
960 (c) = 32 + 1 - __shift - __clz_tab[__n >> __shift]; \
961 } while (0)
962 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
963 #define COUNT_LEADING_ZEROS_0 31 /* n==0 indistinguishable from n==1 */
964
965 #else /* ! pentiummmx || LONGLONG_STANDALONE */
966 /* The following should be a fixed 14 cycles or so. Some scheduling
967 opportunities should be available between the float load/store too. This
968 sort of code is used in gcc 3 for __builtin_ffs (with "n&-n") and is
969 apparently suggested by the Intel optimizing manual (don't know exactly
   where).  gcc 2.95 or later is best for this, since it keeps the "double"
   correctly aligned on the stack.  */
972 #define count_leading_zeros(c,n) \
973 do { \
974 union { \
975 double d; \
976 unsigned a[2]; \
977 } __u; \
978 __u.d = (UWtype) (n); \
979 (c) = 0x3FF + 31 - (__u.a[1] >> 20); \
980 } while (0)
981 #define COUNT_LEADING_ZEROS_0 (0x3FF + 31)
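/* The conversion trick above relies on IEEE double format and x86's
   little-endian layout: for 1 <= n < 2^32 the biased exponent of (double) n
   is 0x3FF + floor(log2(n)), held in bits 20..30 of the high word __u.a[1],
   so 0x3FF + 31 - (__u.a[1] >> 20) is exactly the count of leading zero bits
   in n.  */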
982 #endif /* pentiummx */
983
984 #else /* ! pentium */
985
986 #if __GMP_GNUC_PREREQ (3,4) /* using bsrl */
987 #define count_leading_zeros(count,x) count_leading_zeros_gcc_clz(count,x)
988 #endif /* gcc clz */
989
990 /* On P6, gcc prior to 3.0 generates a partial register stall for
991 __cbtmp^31, due to using "xorb $31" instead of "xorl $31", the former
992 being 1 code byte smaller. "31-__cbtmp" is a workaround, probably at the
993 cost of one extra instruction. Do this for "i386" too, since that means
994 generic x86. */
995 #if ! defined (count_leading_zeros) && __GNUC__ < 3 \
996 && (HAVE_HOST_CPU_i386 \
997 || HAVE_HOST_CPU_i686 \
998 || HAVE_HOST_CPU_pentiumpro \
999 || HAVE_HOST_CPU_pentium2 \
1000 || HAVE_HOST_CPU_pentium3)
1001 #define count_leading_zeros(count, x) \
1002 do { \
1003 USItype __cbtmp; \
1004 ASSERT ((x) != 0); \
1005 __asm__ ("bsrl %1,%0" : "=r" (__cbtmp) : "rm" ((USItype)(x))); \
1006 (count) = 31 - __cbtmp; \
1007 } while (0)
1008 #endif /* gcc<3 asm bsrl */
1009
1010 #ifndef count_leading_zeros
1011 #define count_leading_zeros(count, x) \
1012 do { \
1013 USItype __cbtmp; \
1014 ASSERT ((x) != 0); \
1015 __asm__ ("bsrl %1,%0" : "=r" (__cbtmp) : "rm" ((USItype)(x))); \
1016 (count) = __cbtmp ^ 31; \
1017 } while (0)
1018 #endif /* asm bsrl */
1019
1020 #if __GMP_GNUC_PREREQ (3,4) /* using bsfl */
1021 #define count_trailing_zeros(count,x) count_trailing_zeros_gcc_ctz(count,x)
1022 #endif /* gcc ctz */
1023
1024 #ifndef count_trailing_zeros
1025 #define count_trailing_zeros(count, x) \
1026 do { \
1027 ASSERT ((x) != 0); \
1028 __asm__ ("bsfl %1,%k0" : "=r" (count) : "rm" ((USItype)(x))); \
1029 } while (0)
1030 #endif /* asm bsfl */
1031
1032 #endif /* ! pentium */
1033
1034 #endif /* 80x86 */
1035
1036 #if defined (__amd64__) && W_TYPE_SIZE == 64
1037 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1038 __asm__ ("addq %5,%q1\n\tadcq %3,%q0" \
1039 : "=r" (sh), "=&r" (sl) \
1040 : "0" ((UDItype)(ah)), "rme" ((UDItype)(bh)), \
1041 "%1" ((UDItype)(al)), "rme" ((UDItype)(bl)))
1042 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1043 __asm__ ("subq %5,%q1\n\tsbbq %3,%q0" \
1044 : "=r" (sh), "=&r" (sl) \
1045 : "0" ((UDItype)(ah)), "rme" ((UDItype)(bh)), \
1046 "1" ((UDItype)(al)), "rme" ((UDItype)(bl)))
1047 #if X86_ASM_MULX \
1048 && (HAVE_HOST_CPU_haswell || HAVE_HOST_CPU_broadwell \
1049 || HAVE_HOST_CPU_skylake || HAVE_HOST_CPU_bd4 || HAVE_HOST_CPU_zen)
1050 #define umul_ppmm(w1, w0, u, v) \
1051 __asm__ ("mulx\t%3, %q0, %q1" \
1052 : "=r" (w0), "=r" (w1) \
1053 : "%d" ((UDItype)(u)), "rm" ((UDItype)(v)))
1054 #else
1055 #define umul_ppmm(w1, w0, u, v) \
1056 __asm__ ("mulq\t%3" \
1057 : "=a" (w0), "=d" (w1) \
1058 : "%0" ((UDItype)(u)), "rm" ((UDItype)(v)))
1059 #endif
1060 #define udiv_qrnnd(q, r, n1, n0, dx) /* d renamed to dx avoiding "=d" */\
1061 __asm__ ("divq %4" /* stringification in K&R C */ \
1062 : "=a" (q), "=d" (r) \
1063 : "0" ((UDItype)(n0)), "1" ((UDItype)(n1)), "rm" ((UDItype)(dx)))
1064
1065 #if HAVE_HOST_CPU_haswell || HAVE_HOST_CPU_broadwell || HAVE_HOST_CPU_skylake \
1066 || HAVE_HOST_CPU_k10 || HAVE_HOST_CPU_bd1 || HAVE_HOST_CPU_bd2 \
1067 || HAVE_HOST_CPU_bd3 || HAVE_HOST_CPU_bd4 || HAVE_HOST_CPU_zen \
1068 || HAVE_HOST_CPU_bobcat || HAVE_HOST_CPU_jaguar
1069 #define count_leading_zeros(count, x) \
1070 do { \
1071 /* This is lzcnt, spelled for older assemblers. Destination and */ \
    /* source must be 64-bit registers, hence the cast and %q.      */ \
1073 __asm__ ("rep;bsr\t%1, %q0" : "=r" (count) : "rm" ((UDItype)(x))); \
1074 } while (0)
1075 #define COUNT_LEADING_ZEROS_0 64
1076 #else
1077 #define count_leading_zeros(count, x) \
1078 do { \
1079 UDItype __cbtmp; \
1080 ASSERT ((x) != 0); \
1081 __asm__ ("bsr\t%1,%0" : "=r" (__cbtmp) : "rm" ((UDItype)(x))); \
1082 (count) = __cbtmp ^ 63; \
1083 } while (0)
1084 #endif
1085
1086 #if HAVE_HOST_CPU_bd2 || HAVE_HOST_CPU_bd3 || HAVE_HOST_CPU_bd4 \
1087 || HAVE_HOST_CPU_zen || HAVE_HOST_CPU_jaguar
1088 #define count_trailing_zeros(count, x) \
1089 do { \
1090 /* This is tzcnt, spelled for older assemblers. Destination and */ \
    /* source must be 64-bit registers, hence the cast and %q.      */ \
1092 __asm__ ("rep;bsf\t%1, %q0" : "=r" (count) : "rm" ((UDItype)(x))); \
1093 } while (0)
1094 #define COUNT_TRAILING_ZEROS_0 64
1095 #else
1096 #define count_trailing_zeros(count, x) \
1097 do { \
1098 ASSERT ((x) != 0); \
1099 __asm__ ("bsf\t%1, %q0" : "=r" (count) : "rm" ((UDItype)(x))); \
1100 } while (0)
1101 #endif
1102 #endif /* __amd64__ */
1103
1104 #if defined (__i860__) && W_TYPE_SIZE == 32
1105 #define rshift_rhlc(r,h,l,c) \
1106 __asm__ ("shr %3,r0,r0\;shrd %1,%2,%0" \
1107 "=r" (r) : "r" (h), "r" (l), "rn" (c))
1108 #endif /* i860 */
1109
1110 #if defined (__i960__) && W_TYPE_SIZE == 32
1111 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1112 __asm__ ("cmpo 1,0\;addc %5,%4,%1\;addc %3,%2,%0" \
1113 : "=r" (sh), "=&r" (sl) \
1114 : "dI" (ah), "dI" (bh), "%dI" (al), "dI" (bl))
1115 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1116 __asm__ ("cmpo 0,0\;subc %5,%4,%1\;subc %3,%2,%0" \
1117 : "=r" (sh), "=&r" (sl) \
1118 : "dI" (ah), "dI" (bh), "dI" (al), "dI" (bl))
1119 #define umul_ppmm(w1, w0, u, v) \
1120 ({union {UDItype __ll; \
1121 struct {USItype __l, __h;} __i; \
1122 } __x; \
1123 __asm__ ("emul %2,%1,%0" \
1124 : "=d" (__x.__ll) : "%dI" (u), "dI" (v)); \
1125 (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
1126 #define __umulsidi3(u, v) \
1127 ({UDItype __w; \
1128 __asm__ ("emul %2,%1,%0" : "=d" (__w) : "%dI" (u), "dI" (v)); \
1129 __w; })
1130 #define udiv_qrnnd(q, r, nh, nl, d) \
1131 do { \
1132 union {UDItype __ll; \
1133 struct {USItype __l, __h;} __i; \
	  } __nn, __rq;						\
1135 __nn.__i.__h = (nh); __nn.__i.__l = (nl); \
1136 __asm__ ("ediv %d,%n,%0" \
1137 : "=d" (__rq.__ll) : "dI" (__nn.__ll), "dI" (d)); \
1138 (r) = __rq.__i.__l; (q) = __rq.__i.__h; \
1139 } while (0)
1140 #define count_leading_zeros(count, x) \
1141 do { \
1142 USItype __cbtmp; \
1143 __asm__ ("scanbit %1,%0" : "=r" (__cbtmp) : "r" (x)); \
1144 (count) = __cbtmp ^ 31; \
1145 } while (0)
1146 #define COUNT_LEADING_ZEROS_0 (-32) /* sic */
1147 #if defined (__i960mx) /* what is the proper symbol to test??? */
1148 #define rshift_rhlc(r,h,l,c) \
1149 do { \
1150 union {UDItype __ll; \
1151 struct {USItype __l, __h;} __i; \
1152 } __nn; \
1153 __nn.__i.__h = (h); __nn.__i.__l = (l); \
1154 __asm__ ("shre %2,%1,%0" : "=d" (r) : "dI" (__nn.__ll), "dI" (c)); \
  } while (0)
1156 #endif /* i960mx */
1157 #endif /* i960 */
1158
1159
1160 #if defined (__loongarch64) && W_TYPE_SIZE == 64
1161 #define umul_ppmm(w1, w0, u, v) \
1162 do { \
1163 UDItype __u = (u), __v = (v); \
1164 (w0) = __u * __v; \
    (w1) = (unsigned __int128) __u * __v >> 64;			\
1166 } while (0)
1167 #endif
1168
1169
1170 #if (defined (__mc68000__) || defined (__mc68020__) || defined(mc68020) \
1171 || defined (__m68k__) || defined (__mc5200__) || defined (__mc5206e__) \
1172 || defined (__mc5307__)) && W_TYPE_SIZE == 32
1173 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1174 __asm__ ("add%.l %5,%1\n\taddx%.l %3,%0" \
1175 : "=d" (sh), "=&d" (sl) \
1176 : "0" ((USItype)(ah)), "d" ((USItype)(bh)), \
1177 "%1" ((USItype)(al)), "g" ((USItype)(bl)))
1178 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1179 __asm__ ("sub%.l %5,%1\n\tsubx%.l %3,%0" \
1180 : "=d" (sh), "=&d" (sl) \
1181 : "0" ((USItype)(ah)), "d" ((USItype)(bh)), \
1182 "1" ((USItype)(al)), "g" ((USItype)(bl)))
1183 /* The '020, '030, '040 and CPU32 have 32x32->64 and 64/32->32q-32r. */
1184 #if defined (__mc68020__) || defined(mc68020) \
1185 || defined (__mc68030__) || defined (mc68030) \
1186 || defined (__mc68040__) || defined (mc68040) \
1187 || defined (__mcpu32__) || defined (mcpu32) \
1188 || defined (__NeXT__)
1189 #define umul_ppmm(w1, w0, u, v) \
1190 __asm__ ("mulu%.l %3,%1:%0" \
1191 : "=d" (w0), "=d" (w1) \
1192 : "%0" ((USItype)(u)), "dmi" ((USItype)(v)))
1193 #define udiv_qrnnd(q, r, n1, n0, d) \
1194 __asm__ ("divu%.l %4,%1:%0" \
1195 : "=d" (q), "=d" (r) \
1196 : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "dmi" ((USItype)(d)))
1197 #define sdiv_qrnnd(q, r, n1, n0, d) \
1198 __asm__ ("divs%.l %4,%1:%0" \
1199 : "=d" (q), "=d" (r) \
1200 : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "dmi" ((USItype)(d)))
1201 #else /* for other 68k family members use 16x16->32 multiplication */
1202 #define umul_ppmm(xh, xl, a, b) \
1203 do { USItype __umul_tmp1, __umul_tmp2; \
1204 __asm__ ("| Inlined umul_ppmm\n" \
1205 " move%.l %5,%3\n" \
1206 " move%.l %2,%0\n" \
1207 " move%.w %3,%1\n" \
1208 " swap %3\n" \
1209 " swap %0\n" \
1210 " mulu%.w %2,%1\n" \
1211 " mulu%.w %3,%0\n" \
1212 " mulu%.w %2,%3\n" \
1213 " swap %2\n" \
1214 " mulu%.w %5,%2\n" \
1215 " add%.l %3,%2\n" \
1216 " jcc 1f\n" \
1217 " add%.l %#0x10000,%0\n" \
1218 "1: move%.l %2,%3\n" \
1219 " clr%.w %2\n" \
1220 " swap %2\n" \
1221 " swap %3\n" \
1222 " clr%.w %3\n" \
1223 " add%.l %3,%1\n" \
1224 " addx%.l %2,%0\n" \
1225 " | End inlined umul_ppmm" \
1226 : "=&d" (xh), "=&d" (xl), \
1227 "=&d" (__umul_tmp1), "=&d" (__umul_tmp2) \
1228 : "%2" ((USItype)(a)), "d" ((USItype)(b))); \
1229 } while (0)
1230 #endif /* not mc68020 */
1231 /* The '020, '030, '040 and '060 have bitfield insns.
   GCC 3.4 defines __mc68020__ when in CPU32 mode; check for __mcpu32__ to
   exclude bfffo on that chip (bitfield insns are not available there).  */
1234 #if (defined (__mc68020__) || defined (mc68020) \
1235 || defined (__mc68030__) || defined (mc68030) \
1236 || defined (__mc68040__) || defined (mc68040) \
1237 || defined (__mc68060__) || defined (mc68060) \
1238 || defined (__NeXT__)) \
1239 && ! defined (__mcpu32__)
1240 #define count_leading_zeros(count, x) \
1241 __asm__ ("bfffo %1{%b2:%b2},%0" \
1242 : "=d" (count) \
1243 : "od" ((USItype) (x)), "n" (0))
1244 #define COUNT_LEADING_ZEROS_0 32
1245 #endif
1246 #endif /* mc68000 */
1247
1248 #if defined (__m88000__) && W_TYPE_SIZE == 32
1249 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1250 __asm__ ("addu.co %1,%r4,%r5\n\taddu.ci %0,%r2,%r3" \
1251 : "=r" (sh), "=&r" (sl) \
1252 : "rJ" (ah), "rJ" (bh), "%rJ" (al), "rJ" (bl))
1253 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1254 __asm__ ("subu.co %1,%r4,%r5\n\tsubu.ci %0,%r2,%r3" \
1255 : "=r" (sh), "=&r" (sl) \
1256 : "rJ" (ah), "rJ" (bh), "rJ" (al), "rJ" (bl))
1257 #define count_leading_zeros(count, x) \
1258 do { \
1259 USItype __cbtmp; \
1260 __asm__ ("ff1 %0,%1" : "=r" (__cbtmp) : "r" (x)); \
1261 (count) = __cbtmp ^ 31; \
1262 } while (0)
1263 #define COUNT_LEADING_ZEROS_0 63 /* sic */
1264 #if defined (__m88110__)
1265 #define umul_ppmm(wh, wl, u, v) \
1266 do { \
1267 union {UDItype __ll; \
1268 struct {USItype __h, __l;} __i; \
1269 } __x; \
1270 __asm__ ("mulu.d %0,%1,%2" : "=r" (__x.__ll) : "r" (u), "r" (v)); \
1271 (wh) = __x.__i.__h; \
1272 (wl) = __x.__i.__l; \
1273 } while (0)
1274 #define udiv_qrnnd(q, r, n1, n0, d) \
1275 ({union {UDItype __ll; \
1276 struct {USItype __h, __l;} __i; \
1277 } __x, __q; \
1278 __x.__i.__h = (n1); __x.__i.__l = (n0); \
1279 __asm__ ("divu.d %0,%1,%2" \
1280 : "=r" (__q.__ll) : "r" (__x.__ll), "r" (d)); \
  (r) = (n0) - __q.__i.__l * (d); (q) = __q.__i.__l; })
1282 #endif /* __m88110__ */
1283 #endif /* __m88000__ */
1284
1285 #if defined (__mips) && W_TYPE_SIZE == 32
1286 #if __GMP_GNUC_PREREQ (4,4)
1287 #define umul_ppmm(w1, w0, u, v) \
1288 do { \
1289 UDItype __ll = (UDItype)(u) * (v); \
1290 w1 = __ll >> 32; \
1291 w0 = __ll; \
1292 } while (0)
1293 #endif
1294 #if !defined (umul_ppmm) && __GMP_GNUC_PREREQ (2,7) && !defined (__clang__)
1295 #define umul_ppmm(w1, w0, u, v) \
1296 __asm__ ("multu %2,%3" : "=l" (w0), "=h" (w1) : "d" (u), "d" (v))
1297 #endif
1298 #if !defined (umul_ppmm)
1299 #define umul_ppmm(w1, w0, u, v) \
1300 __asm__ ("multu %2,%3\n\tmflo %0\n\tmfhi %1" \
1301 : "=d" (w0), "=d" (w1) : "d" (u), "d" (v))
1302 #endif
1303 #endif /* __mips */
1304
1305 #if (defined (__mips) && __mips >= 3) && W_TYPE_SIZE == 64
1306 #if defined (_MIPS_ARCH_MIPS64R6)
1307 #define umul_ppmm(w1, w0, u, v) \
1308 do { \
1309 UDItype __m0 = (u), __m1 = (v); \
1310 (w0) = __m0 * __m1; \
1311 __asm__ ("dmuhu\t%0, %1, %2" : "=d" (w1) : "d" (__m0), "d" (__m1)); \
1312 } while (0)
1313 #endif
1314 #if !defined (umul_ppmm) && __GMP_GNUC_PREREQ (4,4)
1315 #define umul_ppmm(w1, w0, u, v) \
1316 do { \
1317 typedef unsigned int __ll_UTItype __attribute__((mode(TI))); \
1318 __ll_UTItype __ll = (__ll_UTItype)(u) * (v); \
1319 w1 = __ll >> 64; \
1320 w0 = __ll; \
1321 } while (0)
1322 #endif
1323 #if !defined (umul_ppmm) && __GMP_GNUC_PREREQ (2,7) && !defined (__clang__)
1324 #define umul_ppmm(w1, w0, u, v) \
1325 __asm__ ("dmultu %2,%3" \
1326 : "=l" (w0), "=h" (w1) \
1327 : "d" ((UDItype)(u)), "d" ((UDItype)(v)))
1328 #endif
1329 #if !defined (umul_ppmm)
1330 #define umul_ppmm(w1, w0, u, v) \
1331 __asm__ ("dmultu %2,%3\n\tmflo %0\n\tmfhi %1" \
1332 : "=d" (w0), "=d" (w1) \
1333 : "d" ((UDItype)(u)), "d" ((UDItype)(v)))
1334 #endif
1335 #endif /* __mips */
1336
1337 #if defined (__mmix__) && W_TYPE_SIZE == 64
1338 #define umul_ppmm(w1, w0, u, v) \
1339 __asm__ ("MULU %0,%2,%3" : "=r" (w0), "=z" (w1) : "r" (u), "r" (v))
1340 #endif
1341
1342 #if defined (__ns32000__) && W_TYPE_SIZE == 32
1343 #define umul_ppmm(w1, w0, u, v) \
1344 ({union {UDItype __ll; \
1345 struct {USItype __l, __h;} __i; \
1346 } __x; \
1347 __asm__ ("meid %2,%0" \
1348 : "=g" (__x.__ll) \
1349 : "%0" ((USItype)(u)), "g" ((USItype)(v))); \
1350 (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
1351 #define __umulsidi3(u, v) \
1352 ({UDItype __w; \
1353 __asm__ ("meid %2,%0" \
1354 : "=g" (__w) \
1355 : "%0" ((USItype)(u)), "g" ((USItype)(v))); \
1356 __w; })
1357 #define udiv_qrnnd(q, r, n1, n0, d) \
1358 ({union {UDItype __ll; \
1359 struct {USItype __l, __h;} __i; \
1360 } __x; \
1361 __x.__i.__h = (n1); __x.__i.__l = (n0); \
1362 __asm__ ("deid %2,%0" \
1363 : "=g" (__x.__ll) \
1364 : "0" (__x.__ll), "g" ((USItype)(d))); \
1365 (r) = __x.__i.__l; (q) = __x.__i.__h; })
1366 #define count_trailing_zeros(count,x) \
1367 do { \
1368 __asm__ ("ffsd %2,%0" \
1369 : "=r" (count) \
1370 : "0" ((USItype) 0), "r" ((USItype) (x))); \
1371 } while (0)
1372 #endif /* __ns32000__ */
1373
1374 /* In the past we had a block of various #defines tested
1375 _ARCH_PPC - AIX
1376 _ARCH_PWR - AIX
1377 __powerpc__ - gcc
1378 __POWERPC__ - BEOS
1379 __ppc__ - Darwin
1380 PPC - old gcc, GNU/Linux, SysV
1381 The plain PPC test was not good for vxWorks, since PPC is defined on all
   CPUs there (eg. m68k too), as a constant that one is expected to compare
1383 CPU_FAMILY against.
1384
1385 At any rate, this was pretty unattractive and a bit fragile. The use of
1386 HAVE_HOST_CPU_FAMILY is designed to cut through it all and be sure of
1387 getting the desired effect.
1388
1389 ENHANCE-ME: We should test _IBMR2 here when we add assembly support for
1390 the system vendor compilers. (Is that vendor compilers with inline asm,
1391 or what?) */
1392
1393 #if (HAVE_HOST_CPU_FAMILY_power || HAVE_HOST_CPU_FAMILY_powerpc) \
1394 && W_TYPE_SIZE == 32
1395 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1396 do { \
1397 if (__builtin_constant_p (bh) && (bh) == 0) \
1398 __asm__ ("add%I4c %1,%3,%4\n\taddze %0,%2" \
1399 : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl) \
1400 __CLOBBER_CC); \
1401 else if (__builtin_constant_p (bh) && (bh) == ~(USItype) 0) \
1402 __asm__ ("add%I4c %1,%3,%4\n\taddme %0,%2" \
1403 : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl) \
1404 __CLOBBER_CC); \
1405 else \
1406 __asm__ ("add%I5c %1,%4,%5\n\tadde %0,%2,%3" \
1407 : "=r" (sh), "=&r" (sl) \
1408 : "r" (ah), "r" (bh), "%r" (al), "rI" (bl) \
1409 __CLOBBER_CC); \
1410 } while (0)
1411 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1412 do { \
1413 if (__builtin_constant_p (ah) && (ah) == 0) \
1414 __asm__ ("subf%I3c %1,%4,%3\n\tsubfze %0,%2" \
1415 : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl) \
1416 __CLOBBER_CC); \
1417 else if (__builtin_constant_p (ah) && (ah) == ~(USItype) 0) \
1418 __asm__ ("subf%I3c %1,%4,%3\n\tsubfme %0,%2" \
1419 : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl) \
1420 __CLOBBER_CC); \
1421 else if (__builtin_constant_p (bh) && (bh) == 0) \
1422 __asm__ ("subf%I3c %1,%4,%3\n\taddme %0,%2" \
1423 : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl) \
1424 __CLOBBER_CC); \
1425 else if (__builtin_constant_p (bh) && (bh) == ~(USItype) 0) \
1426 __asm__ ("subf%I3c %1,%4,%3\n\taddze %0,%2" \
1427 : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl) \
1428 __CLOBBER_CC); \
1429 else \
1430 __asm__ ("subf%I4c %1,%5,%4\n\tsubfe %0,%3,%2" \
1431 : "=r" (sh), "=&r" (sl) \
1432 : "r" (ah), "r" (bh), "rI" (al), "r" (bl) \
1433 __CLOBBER_CC); \
1434 } while (0)
1435 #define count_leading_zeros(count, x) \
1436 __asm__ ("cntlzw %0,%1" : "=r" (count) : "r" (x))
1437 #define COUNT_LEADING_ZEROS_0 32
1438 #if HAVE_HOST_CPU_FAMILY_powerpc
1439 #if __GMP_GNUC_PREREQ (4,4)
1440 #define umul_ppmm(w1, w0, u, v) \
1441 do { \
1442 UDItype __ll = (UDItype)(u) * (v); \
1443 w1 = __ll >> 32; \
1444 w0 = __ll; \
1445 } while (0)
1446 #endif
1447 #if !defined (umul_ppmm)
1448 #define umul_ppmm(ph, pl, m0, m1) \
1449 do { \
1450 USItype __m0 = (m0), __m1 = (m1); \
1451 __asm__ ("mulhwu %0,%1,%2" : "=r" (ph) : "%r" (m0), "r" (m1)); \
1452 (pl) = __m0 * __m1; \
1453 } while (0)
1454 #endif
1455 #define smul_ppmm(ph, pl, m0, m1) \
1456 do { \
1457 SItype __m0 = (m0), __m1 = (m1); \
1458 __asm__ ("mulhw %0,%1,%2" : "=r" (ph) : "%r" (m0), "r" (m1)); \
1459 (pl) = __m0 * __m1; \
1460 } while (0)
1461 #else
1462 #define smul_ppmm(xh, xl, m0, m1) \
1463 __asm__ ("mul %0,%2,%3" : "=r" (xh), "=q" (xl) : "r" (m0), "r" (m1))
1464 #define sdiv_qrnnd(q, r, nh, nl, d) \
1465 __asm__ ("div %0,%2,%4" : "=r" (q), "=q" (r) : "r" (nh), "1" (nl), "r" (d))
1466 #endif
1467 #endif /* 32-bit POWER architecture variants. */
1468
1469 /* We should test _IBMR2 here when we add assembly support for the system
1470 vendor compilers. */
1471 #if HAVE_HOST_CPU_FAMILY_powerpc && W_TYPE_SIZE == 64
1472 #if !defined (_LONG_LONG_LIMB)
1473 /* _LONG_LONG_LIMB is ABI=mode32 where adde operates on 32-bit values. So
1474 use adde etc only when not _LONG_LONG_LIMB. */
1475 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1476 do { \
1477 if (__builtin_constant_p (bh) && (bh) == 0) \
1478 __asm__ ("add%I4c %1,%3,%4\n\taddze %0,%2" \
1479 : "=r" (sh), "=&r" (sl) \
1480 : "r" ((UDItype)(ah)), \
1481 "%r" ((UDItype)(al)), "rI" ((UDItype)(bl)) \
1482 __CLOBBER_CC); \
1483 else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0) \
1484 __asm__ ("add%I4c %1,%3,%4\n\taddme %0,%2" \
1485 : "=r" (sh), "=&r" (sl) \
1486 : "r" ((UDItype)(ah)), \
1487 "%r" ((UDItype)(al)), "rI" ((UDItype)(bl)) \
1488 __CLOBBER_CC); \
1489 else \
1490 __asm__ ("add%I5c %1,%4,%5\n\tadde %0,%2,%3" \
1491 : "=r" (sh), "=&r" (sl) \
1492 : "r" ((UDItype)(ah)), "r" ((UDItype)(bh)), \
1493 "%r" ((UDItype)(al)), "rI" ((UDItype)(bl)) \
1494 __CLOBBER_CC); \
1495 } while (0)
1496 /* We use "*rI" for the constant operand here, since with just "I", gcc barfs.
1497 This might seem strange, but gcc folds away the dead code late. */
1498 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1499 do { \
1500 if (__builtin_constant_p (bl) \
1501 && (bl) > -0x8000 && (bl) <= 0x8000 && (bl) != 0) { \
1502 if (__builtin_constant_p (ah) && (ah) == 0) \
1503 __asm__ ("addic %1,%3,%4\n\tsubfze %0,%2" \
1504 : "=r" (sh), "=&r" (sl) \
1505 : "r" ((UDItype)(bh)), \
1506 "r" ((UDItype)(al)), "*rI" (-((UDItype)(bl))) \
1507 __CLOBBER_CC); \
1508 else if (__builtin_constant_p (ah) && (ah) == ~(UDItype) 0) \
1509 __asm__ ("addic %1,%3,%4\n\tsubfme %0,%2" \
1510 : "=r" (sh), "=&r" (sl) \
1511 : "r" ((UDItype)(bh)), \
1512 "r" ((UDItype)(al)), "*rI" (-((UDItype)(bl))) \
1513 __CLOBBER_CC); \
1514 else if (__builtin_constant_p (bh) && (bh) == 0) \
1515 __asm__ ("addic %1,%3,%4\n\taddme %0,%2" \
1516 : "=r" (sh), "=&r" (sl) \
1517 : "r" ((UDItype)(ah)), \
1518 "r" ((UDItype)(al)), "*rI" (-((UDItype)(bl))) \
1519 __CLOBBER_CC); \
1520 else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0) \
1521 __asm__ ("addic %1,%3,%4\n\taddze %0,%2" \
1522 : "=r" (sh), "=&r" (sl) \
1523 : "r" ((UDItype)(ah)), \
1524 "r" ((UDItype)(al)), "*rI" (-((UDItype)(bl))) \
1525 __CLOBBER_CC); \
1526 else \
1527 __asm__ ("addic %1,%4,%5\n\tsubfe %0,%3,%2" \
1528 : "=r" (sh), "=&r" (sl) \
1529 : "r" ((UDItype)(ah)), "r" ((UDItype)(bh)), \
1530 "r" ((UDItype)(al)), "*rI" (-((UDItype)(bl))) \
1531 __CLOBBER_CC); \
1532 } else { \
1533 if (__builtin_constant_p (ah) && (ah) == 0) \
1534 __asm__ ("subf%I3c %1,%4,%3\n\tsubfze %0,%2" \
1535 : "=r" (sh), "=&r" (sl) \
1536 : "r" ((UDItype)(bh)), \
1537 "rI" ((UDItype)(al)), "r" ((UDItype)(bl)) \
1538 __CLOBBER_CC); \
1539 else if (__builtin_constant_p (ah) && (ah) == ~(UDItype) 0) \
1540 __asm__ ("subf%I3c %1,%4,%3\n\tsubfme %0,%2" \
1541 : "=r" (sh), "=&r" (sl) \
1542 : "r" ((UDItype)(bh)), \
1543 "rI" ((UDItype)(al)), "r" ((UDItype)(bl)) \
1544 __CLOBBER_CC); \
1545 else if (__builtin_constant_p (bh) && (bh) == 0) \
1546 __asm__ ("subf%I3c %1,%4,%3\n\taddme %0,%2" \
1547 : "=r" (sh), "=&r" (sl) \
1548 : "r" ((UDItype)(ah)), \
1549 "rI" ((UDItype)(al)), "r" ((UDItype)(bl)) \
1550 __CLOBBER_CC); \
1551 else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0) \
1552 __asm__ ("subf%I3c %1,%4,%3\n\taddze %0,%2" \
1553 : "=r" (sh), "=&r" (sl) \
1554 : "r" ((UDItype)(ah)), \
1555 "rI" ((UDItype)(al)), "r" ((UDItype)(bl)) \
1556 __CLOBBER_CC); \
1557 else \
1558 __asm__ ("subf%I4c %1,%5,%4\n\tsubfe %0,%3,%2" \
1559 : "=r" (sh), "=&r" (sl) \
1560 : "r" ((UDItype)(ah)), "r" ((UDItype)(bh)), \
1561 "rI" ((UDItype)(al)), "r" ((UDItype)(bl)) \
1562 __CLOBBER_CC); \
1563 } \
1564 } while (0)
1565 #endif /* ! _LONG_LONG_LIMB */
1566 #define count_leading_zeros(count, x) \
1567 __asm__ ("cntlzd %0,%1" : "=r" (count) : "r" (x))
1568 #define COUNT_LEADING_ZEROS_0 64
1569 #if __GMP_GNUC_PREREQ (4,8)
1570 #define umul_ppmm(w1, w0, u, v) \
1571 do { \
1572 typedef unsigned int __ll_UTItype __attribute__((mode(TI))); \
1573 __ll_UTItype __ll = (__ll_UTItype)(u) * (v); \
1574 w1 = __ll >> 64; \
1575 w0 = __ll; \
1576 } while (0)
1577 #endif
1578 #if !defined (umul_ppmm)
1579 #define umul_ppmm(ph, pl, m0, m1) \
1580 do { \
1581 UDItype __m0 = (m0), __m1 = (m1); \
1582 __asm__ ("mulhdu %0,%1,%2" : "=r" (ph) : "%r" (__m0), "r" (__m1)); \
1583 (pl) = __m0 * __m1; \
1584 } while (0)
1585 #endif
1586 #define smul_ppmm(ph, pl, m0, m1) \
1587 do { \
1588 DItype __m0 = (m0), __m1 = (m1); \
1589 __asm__ ("mulhd %0,%1,%2" : "=r" (ph) : "%r" (__m0), "r" (__m1)); \
1590 (pl) = __m0 * __m1; \
1591 } while (0)
1592 #endif /* 64-bit PowerPC. */
1593
1594 #if defined (__pyr__) && W_TYPE_SIZE == 32
1595 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1596 __asm__ ("addw %5,%1\n\taddwc %3,%0" \
1597 : "=r" (sh), "=&r" (sl) \
1598 : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \
1599 "%1" ((USItype)(al)), "g" ((USItype)(bl)))
1600 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1601 __asm__ ("subw %5,%1\n\tsubwb %3,%0" \
1602 : "=r" (sh), "=&r" (sl) \
1603 : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \
1604 "1" ((USItype)(al)), "g" ((USItype)(bl)))
1605 /* This insn works on Pyramids with AP, XP, or MI CPUs, but not with SP. */
1606 #define umul_ppmm(w1, w0, u, v) \
1607 ({union {UDItype __ll; \
1608 struct {USItype __h, __l;} __i; \
1609 } __x; \
1610 __asm__ ("movw %1,%R0\n\tuemul %2,%0" \
1611 : "=&r" (__x.__ll) \
1612 : "g" ((USItype) (u)), "g" ((USItype)(v))); \
1613 (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
1614 #endif /* __pyr__ */
1615
#if defined (__ibm032__) && W_TYPE_SIZE == 32  /* RT/ROMP */
1617 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1618 __asm__ ("a %1,%5\n\tae %0,%3" \
1619 : "=r" (sh), "=&r" (sl) \
1620 : "0" ((USItype)(ah)), "r" ((USItype)(bh)), \
1621 "%1" ((USItype)(al)), "r" ((USItype)(bl)))
1622 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1623 __asm__ ("s %1,%5\n\tse %0,%3" \
1624 : "=r" (sh), "=&r" (sl) \
1625 : "0" ((USItype)(ah)), "r" ((USItype)(bh)), \
1626 "1" ((USItype)(al)), "r" ((USItype)(bl)))
1627 #define smul_ppmm(ph, pl, m0, m1) \
1628 __asm__ ( \
1629 "s r2,r2\n" \
1630 " mts r10,%2\n" \
1631 " m r2,%3\n" \
1632 " m r2,%3\n" \
1633 " m r2,%3\n" \
1634 " m r2,%3\n" \
1635 " m r2,%3\n" \
1636 " m r2,%3\n" \
1637 " m r2,%3\n" \
1638 " m r2,%3\n" \
1639 " m r2,%3\n" \
1640 " m r2,%3\n" \
1641 " m r2,%3\n" \
1642 " m r2,%3\n" \
1643 " m r2,%3\n" \
1644 " m r2,%3\n" \
1645 " m r2,%3\n" \
1646 " m r2,%3\n" \
1647 " cas %0,r2,r0\n" \
1648 " mfs r10,%1" \
1649 : "=r" (ph), "=r" (pl) \
1650 : "%r" ((USItype)(m0)), "r" ((USItype)(m1)) \
1651 : "r2")
1652 #define count_leading_zeros(count, x) \
1653 do { \
1654 if ((x) >= 0x10000) \
1655 __asm__ ("clz %0,%1" \
1656 : "=r" (count) : "r" ((USItype)(x) >> 16)); \
1657 else \
1658 { \
1659 __asm__ ("clz %0,%1" \
1660 : "=r" (count) : "r" ((USItype)(x))); \
1661 (count) += 16; \
1662 } \
1663 } while (0)
1664 #endif /* RT/ROMP */
1665
1666 #if defined (__riscv) && defined (__riscv_mul) && W_TYPE_SIZE == 64
1667 #define umul_ppmm(ph, pl, u, v) \
1668 do { \
1669 UDItype __u = (u), __v = (v); \
1670 (pl) = __u * __v; \
1671 __asm__ ("mulhu\t%0, %1, %2" : "=r" (ph) : "%r" (__u), "r" (__v)); \
1672 } while (0)
1673 #endif
1674
1675 #if (defined (__SH2__) || defined (__SH3__) || defined (__SH4__)) && W_TYPE_SIZE == 32
1676 #define umul_ppmm(w1, w0, u, v) \
1677 __asm__ ("dmulu.l %2,%3\n\tsts macl,%1\n\tsts mach,%0" \
1678 : "=r" (w1), "=r" (w0) : "r" (u), "r" (v) : "macl", "mach")
1679 #endif
1680
1681 #if defined (__sparc__) && W_TYPE_SIZE == 32
1682 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1683 __asm__ ("addcc %r4,%5,%1\n\taddx %r2,%3,%0" \
1684 : "=r" (sh), "=&r" (sl) \
1685 : "rJ" (ah), "rI" (bh),"%rJ" (al), "rI" (bl) \
1686 __CLOBBER_CC)
1687 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1688 __asm__ ("subcc %r4,%5,%1\n\tsubx %r2,%3,%0" \
1689 : "=r" (sh), "=&r" (sl) \
1690 : "rJ" (ah), "rI" (bh), "rJ" (al), "rI" (bl) \
1691 __CLOBBER_CC)
1692 /* FIXME: When gcc -mcpu=v9 is used on solaris, gcc/config/sol2-sld-64.h
1693 doesn't define anything to indicate that to us, it only sets __sparcv8. */
1694 #if defined (__sparc_v9__) || defined (__sparcv9)
1695 /* Perhaps we should use floating-point operations here? */
1696 #if 0
1697 /* Triggers a bug making mpz/tests/t-gcd.c fail.
   Perhaps we simply need to explicitly zero-extend the inputs?  */
1699 #define umul_ppmm(w1, w0, u, v) \
1700 __asm__ ("mulx %2,%3,%%g1; srl %%g1,0,%1; srlx %%g1,32,%0" : \
1701 "=r" (w1), "=r" (w0) : "r" (u), "r" (v) : "g1")
1702 #else
1703 /* Use v8 umul until above bug is fixed. */
1704 #define umul_ppmm(w1, w0, u, v) \
1705 __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))
1706 #endif
1707 /* Use a plain v8 divide for v9. */
1708 #define udiv_qrnnd(q, r, n1, n0, d) \
1709 do { \
1710 USItype __q; \
1711 __asm__ ("mov %1,%%y;nop;nop;nop;udiv %2,%3,%0" \
1712 : "=r" (__q) : "r" (n1), "r" (n0), "r" (d)); \
1713 (r) = (n0) - __q * (d); \
1714 (q) = __q; \
1715 } while (0)
1716 #else
1717 #if defined (__sparc_v8__) /* gcc normal */ \
1718 || defined (__sparcv8) /* gcc solaris */ \
1719 || HAVE_HOST_CPU_supersparc
/* Don't match an immediate range because (1) it is not often useful, and
   (2) the 'I' constraint treats the range as a 13-bit signed interval,
   while we want to match a 13-bit interval, sign extended to 32 bits
   but INTERPRETED AS UNSIGNED.  */
1724 #define umul_ppmm(w1, w0, u, v) \
1725 __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))
1726
#if ! HAVE_HOST_CPU_supersparc
1729 /* Don't use this on SuperSPARC because its udiv only handles 53 bit
1730 dividends and will trap to the kernel for the rest. */
1731 #define udiv_qrnnd(q, r, n1, n0, d) \
1732 do { \
1733 USItype __q; \
1734 __asm__ ("mov %1,%%y;nop;nop;nop;udiv %2,%3,%0" \
1735 : "=r" (__q) : "r" (n1), "r" (n0), "r" (d)); \
1736 (r) = (n0) - __q * (d); \
1737 (q) = __q; \
1738 } while (0)
#endif /* ! HAVE_HOST_CPU_supersparc */
1740
1741 #else /* ! __sparc_v8__ */
1742 #if defined (__sparclite__)
/* This has hardware multiply but not divide.  It also has two additional
   instructions, scan (ffs from the high bit) and divscc.  */
1745 #define umul_ppmm(w1, w0, u, v) \
1746 __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))
1747 #define udiv_qrnnd(q, r, n1, n0, d) \
1748 __asm__ ("! Inlined udiv_qrnnd\n" \
1749 " wr %%g0,%2,%%y ! Not a delayed write for sparclite\n" \
1750 " tst %%g0\n" \
1751 " divscc %3,%4,%%g1\n" \
1752 " divscc %%g1,%4,%%g1\n" \
1753 " divscc %%g1,%4,%%g1\n" \
1754 " divscc %%g1,%4,%%g1\n" \
1755 " divscc %%g1,%4,%%g1\n" \
1756 " divscc %%g1,%4,%%g1\n" \
1757 " divscc %%g1,%4,%%g1\n" \
1758 " divscc %%g1,%4,%%g1\n" \
1759 " divscc %%g1,%4,%%g1\n" \
1760 " divscc %%g1,%4,%%g1\n" \
1761 " divscc %%g1,%4,%%g1\n" \
1762 " divscc %%g1,%4,%%g1\n" \
1763 " divscc %%g1,%4,%%g1\n" \
1764 " divscc %%g1,%4,%%g1\n" \
1765 " divscc %%g1,%4,%%g1\n" \
1766 " divscc %%g1,%4,%%g1\n" \
1767 " divscc %%g1,%4,%%g1\n" \
1768 " divscc %%g1,%4,%%g1\n" \
1769 " divscc %%g1,%4,%%g1\n" \
1770 " divscc %%g1,%4,%%g1\n" \
1771 " divscc %%g1,%4,%%g1\n" \
1772 " divscc %%g1,%4,%%g1\n" \
1773 " divscc %%g1,%4,%%g1\n" \
1774 " divscc %%g1,%4,%%g1\n" \
1775 " divscc %%g1,%4,%%g1\n" \
1776 " divscc %%g1,%4,%%g1\n" \
1777 " divscc %%g1,%4,%%g1\n" \
1778 " divscc %%g1,%4,%%g1\n" \
1779 " divscc %%g1,%4,%%g1\n" \
1780 " divscc %%g1,%4,%%g1\n" \
1781 " divscc %%g1,%4,%%g1\n" \
1782 " divscc %%g1,%4,%0\n" \
1783 " rd %%y,%1\n" \
1784 " bl,a 1f\n" \
1785 " add %1,%4,%1\n" \
1786 "1: ! End of inline udiv_qrnnd" \
1787 : "=r" (q), "=r" (r) : "r" (n1), "r" (n0), "rI" (d) \
1788 : "%g1" __AND_CLOBBER_CC)
1789 #define count_leading_zeros(count, x) \
1790 __asm__ ("scan %1,1,%0" : "=r" (count) : "r" (x))
/* Early sparclites return 63 for an argument of 0, but the documentation
   warns that future implementations might change this.  Therefore, leave
   COUNT_LEADING_ZEROS_0 undefined.  */
1794 #endif /* __sparclite__ */
1795 #endif /* __sparc_v8__ */
1796 #endif /* __sparc_v9__ */
1797 /* Default to sparc v7 versions of umul_ppmm and udiv_qrnnd. */
1798 #ifndef umul_ppmm
1799 #define umul_ppmm(w1, w0, u, v) \
1800 __asm__ ("! Inlined umul_ppmm\n" \
1801 " wr %%g0,%2,%%y ! SPARC has 0-3 delay insn after a wr\n" \
1802 " sra %3,31,%%g2 ! Don't move this insn\n" \
1803 " and %2,%%g2,%%g2 ! Don't move this insn\n" \
1804 " andcc %%g0,0,%%g1 ! Don't move this insn\n" \
1805 " mulscc %%g1,%3,%%g1\n" \
1806 " mulscc %%g1,%3,%%g1\n" \
1807 " mulscc %%g1,%3,%%g1\n" \
1808 " mulscc %%g1,%3,%%g1\n" \
1809 " mulscc %%g1,%3,%%g1\n" \
1810 " mulscc %%g1,%3,%%g1\n" \
1811 " mulscc %%g1,%3,%%g1\n" \
1812 " mulscc %%g1,%3,%%g1\n" \
1813 " mulscc %%g1,%3,%%g1\n" \
1814 " mulscc %%g1,%3,%%g1\n" \
1815 " mulscc %%g1,%3,%%g1\n" \
1816 " mulscc %%g1,%3,%%g1\n" \
1817 " mulscc %%g1,%3,%%g1\n" \
1818 " mulscc %%g1,%3,%%g1\n" \
1819 " mulscc %%g1,%3,%%g1\n" \
1820 " mulscc %%g1,%3,%%g1\n" \
1821 " mulscc %%g1,%3,%%g1\n" \
1822 " mulscc %%g1,%3,%%g1\n" \
1823 " mulscc %%g1,%3,%%g1\n" \
1824 " mulscc %%g1,%3,%%g1\n" \
1825 " mulscc %%g1,%3,%%g1\n" \
1826 " mulscc %%g1,%3,%%g1\n" \
1827 " mulscc %%g1,%3,%%g1\n" \
1828 " mulscc %%g1,%3,%%g1\n" \
1829 " mulscc %%g1,%3,%%g1\n" \
1830 " mulscc %%g1,%3,%%g1\n" \
1831 " mulscc %%g1,%3,%%g1\n" \
1832 " mulscc %%g1,%3,%%g1\n" \
1833 " mulscc %%g1,%3,%%g1\n" \
1834 " mulscc %%g1,%3,%%g1\n" \
1835 " mulscc %%g1,%3,%%g1\n" \
1836 " mulscc %%g1,%3,%%g1\n" \
1837 " mulscc %%g1,0,%%g1\n" \
1838 " add %%g1,%%g2,%0\n" \
1839 " rd %%y,%1" \
1840 : "=r" (w1), "=r" (w0) : "%rI" (u), "r" (v) \
1841 : "%g1", "%g2" __AND_CLOBBER_CC)
1842 #endif
1843 #ifndef udiv_qrnnd
1844 #ifndef LONGLONG_STANDALONE
1845 #define udiv_qrnnd(q, r, n1, n0, d) \
1846 do { UWtype __r; \
1847 (q) = __MPN(udiv_qrnnd) (&__r, (n1), (n0), (d)); \
1848 (r) = __r; \
1849 } while (0)
1850 extern UWtype __MPN(udiv_qrnnd) (UWtype *, UWtype, UWtype, UWtype);
1851 #endif /* LONGLONG_STANDALONE */
1852 #endif /* udiv_qrnnd */
1853 #endif /* __sparc__ */
1854
1855 #if defined (__sparc__) && W_TYPE_SIZE == 64
1856 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1857 __asm__ ( \
1858 "addcc %r4,%5,%1\n" \
1859 " addccc %r6,%7,%%g0\n" \
1860 " addc %r2,%3,%0" \
1861 : "=r" (sh), "=&r" (sl) \
1862 : "rJ" ((UDItype)(ah)), "rI" ((UDItype)(bh)), \
1863 "%rJ" ((UDItype)(al)), "rI" ((UDItype)(bl)), \
1864 "%rJ" ((UDItype)(al) >> 32), "rI" ((UDItype)(bl) >> 32) \
1865 __CLOBBER_CC)
1866 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1867 __asm__ ( \
1868 "subcc %r4,%5,%1\n" \
1869 " subccc %r6,%7,%%g0\n" \
1870 " subc %r2,%3,%0" \
1871 : "=r" (sh), "=&r" (sl) \
1872 : "rJ" ((UDItype)(ah)), "rI" ((UDItype)(bh)), \
1873 "rJ" ((UDItype)(al)), "rI" ((UDItype)(bl)), \
1874 "rJ" ((UDItype)(al) >> 32), "rI" ((UDItype)(bl) >> 32) \
1875 __CLOBBER_CC)
1876 #if __VIS__ >= 0x300
1877 #undef add_ssaaaa
1878 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1879 __asm__ ( \
1880 "addcc %r4, %5, %1\n" \
1881 " addxc %r2, %r3, %0" \
1882 : "=r" (sh), "=&r" (sl) \
1883 : "rJ" ((UDItype)(ah)), "rJ" ((UDItype)(bh)), \
1884 "%rJ" ((UDItype)(al)), "rI" ((UDItype)(bl)) __CLOBBER_CC)
1885 #define umul_ppmm(ph, pl, m0, m1) \
1886 do { \
1887 UDItype __m0 = (m0), __m1 = (m1); \
1888 (pl) = __m0 * __m1; \
1889 __asm__ ("umulxhi\t%2, %1, %0" \
1890 : "=r" (ph) \
1891 : "%r" (__m0), "r" (__m1)); \
1892 } while (0)
1893 #define count_leading_zeros(count, x) \
1894 __asm__ ("lzd\t%1,%0" : "=r" (count) : "r" (x))
1895 /* Needed by count_leading_zeros_32 in sparc64.h. */
1896 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
1897 #endif
1898 #endif
1899
1900 #if (defined (__vax) || defined (__vax__)) && W_TYPE_SIZE == 32
1901 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1902 __asm__ ("addl2 %5,%1\n\tadwc %3,%0" \
1903 : "=g" (sh), "=&g" (sl) \
1904 : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \
1905 "%1" ((USItype)(al)), "g" ((USItype)(bl)))
1906 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1907 __asm__ ("subl2 %5,%1\n\tsbwc %3,%0" \
1908 : "=g" (sh), "=&g" (sl) \
1909 : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \
1910 "1" ((USItype)(al)), "g" ((USItype)(bl)))
1911 #define smul_ppmm(xh, xl, m0, m1) \
1912 do { \
1913 union {UDItype __ll; \
1914 struct {USItype __l, __h;} __i; \
1915 } __x; \
1916 USItype __m0 = (m0), __m1 = (m1); \
1917 __asm__ ("emul %1,%2,$0,%0" \
1918 : "=g" (__x.__ll) : "g" (__m0), "g" (__m1)); \
1919 (xh) = __x.__i.__h; (xl) = __x.__i.__l; \
1920 } while (0)
1921 #define sdiv_qrnnd(q, r, n1, n0, d) \
1922 do { \
1923 union {DItype __ll; \
1924 struct {SItype __l, __h;} __i; \
1925 } __x; \
1926 __x.__i.__h = n1; __x.__i.__l = n0; \
1927 __asm__ ("ediv %3,%2,%0,%1" \
1928 : "=g" (q), "=g" (r) : "g" (__x.__ll), "g" (d)); \
1929 } while (0)
1930 #if 0
1931 /* FIXME: This instruction appears to be unimplemented on some systems (vax
1932 8800 maybe). */
1933 #define count_trailing_zeros(count,x) \
1934 do { \
1935 __asm__ ("ffs 0, 31, %1, %0" \
1936 : "=g" (count) \
1937 : "g" ((USItype) (x))); \
1938 } while (0)
1939 #endif
1940 #endif /* vax */
1941
1942 #if defined (__z8000__) && W_TYPE_SIZE == 16
1943 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1944 __asm__ ("add %H1,%H5\n\tadc %H0,%H3" \
1945 : "=r" (sh), "=&r" (sl) \
1946 : "0" ((unsigned int)(ah)), "r" ((unsigned int)(bh)), \
1947 "%1" ((unsigned int)(al)), "rQR" ((unsigned int)(bl)))
1948 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1949 __asm__ ("sub %H1,%H5\n\tsbc %H0,%H3" \
1950 : "=r" (sh), "=&r" (sl) \
1951 : "0" ((unsigned int)(ah)), "r" ((unsigned int)(bh)), \
1952 "1" ((unsigned int)(al)), "rQR" ((unsigned int)(bl)))
1953 #define umul_ppmm(xh, xl, m0, m1) \
1954 do { \
1955 union {long int __ll; \
1956 struct {unsigned int __h, __l;} __i; \
1957 } __x; \
1958 unsigned int __m0 = (m0), __m1 = (m1); \
1959 __asm__ ("mult %S0,%H3" \
1960 : "=r" (__x.__i.__h), "=r" (__x.__i.__l) \
1961 : "%1" (m0), "rQR" (m1)); \
1962 (xh) = __x.__i.__h; (xl) = __x.__i.__l; \
1963 (xh) += ((((signed int) __m0 >> 15) & __m1) \
1964 + (((signed int) __m1 >> 15) & __m0)); \
1965 } while (0)
1966 #endif /* __z8000__ */
1967
1968 #endif /* __GNUC__ */
1969
1970 #endif /* NO_ASM */
1971
1972
1973 /* FIXME: "sidi" here is highly doubtful, should sometimes be "diti". */
1974 #if !defined (umul_ppmm) && defined (__umulsidi3)
1975 #define umul_ppmm(ph, pl, m0, m1) \
1976 do { \
1977 UDWtype __ll = __umulsidi3 (m0, m1); \
1978 ph = (UWtype) (__ll >> W_TYPE_SIZE); \
1979 pl = (UWtype) __ll; \
1980 } while (0)
1981 #endif
1982
1983 #if !defined (__umulsidi3)
1984 #define __umulsidi3(u, v) \
1985 ({UWtype __hi, __lo; \
1986 umul_ppmm (__hi, __lo, u, v); \
1987 ((UDWtype) __hi << W_TYPE_SIZE) | __lo; })
1988 #endif
1989
1990
1991 #if defined (__cplusplus)
1992 #define __longlong_h_C "C"
1993 #else
1994 #define __longlong_h_C
1995 #endif
1996
1997 /* Use mpn_umul_ppmm or mpn_udiv_qrnnd functions, if they exist. The "_r"
1998 forms have "reversed" arguments, meaning the pointer is last, which
1999 sometimes allows better parameter passing, in particular on 64-bit
2000 hppa. */
2001
2002 #define mpn_umul_ppmm __MPN(umul_ppmm)
2003 extern __longlong_h_C UWtype mpn_umul_ppmm (UWtype *, UWtype, UWtype);
2004
2005 #if ! defined (umul_ppmm) && HAVE_NATIVE_mpn_umul_ppmm \
2006 && ! defined (LONGLONG_STANDALONE)
2007 #define umul_ppmm(wh, wl, u, v) \
2008 do { \
2009 UWtype __umul_ppmm__p0; \
2010 (wh) = mpn_umul_ppmm (&__umul_ppmm__p0, (UWtype) (u), (UWtype) (v));\
2011 (wl) = __umul_ppmm__p0; \
2012 } while (0)
2013 #endif
2014
2015 #define mpn_umul_ppmm_r __MPN(umul_ppmm_r)
2016 extern __longlong_h_C UWtype mpn_umul_ppmm_r (UWtype, UWtype, UWtype *);
2017
2018 #if ! defined (umul_ppmm) && HAVE_NATIVE_mpn_umul_ppmm_r \
2019 && ! defined (LONGLONG_STANDALONE)
2020 #define umul_ppmm(wh, wl, u, v) \
2021 do { \
2022 UWtype __umul_p0; \
2023 (wh) = mpn_umul_ppmm_r ((UWtype) (u), (UWtype) (v), &__umul_p0); \
2024 (wl) = __umul_p0; \
2025 } while (0)
2026 #endif
2027
2028 #define mpn_udiv_qrnnd __MPN(udiv_qrnnd)
2029 extern __longlong_h_C UWtype mpn_udiv_qrnnd (UWtype *, UWtype, UWtype, UWtype);
2030
2031 #if ! defined (udiv_qrnnd) && HAVE_NATIVE_mpn_udiv_qrnnd \
2032 && ! defined (LONGLONG_STANDALONE)
2033 #define udiv_qrnnd(q, r, n1, n0, d) \
2034 do { \
2035 UWtype __udiv_qrnnd_r; \
2036 (q) = mpn_udiv_qrnnd (&__udiv_qrnnd_r, \
			  (UWtype) (n1), (UWtype) (n0), (UWtype) (d));	\
2038 (r) = __udiv_qrnnd_r; \
2039 } while (0)
2040 #endif
2041
2042 #define mpn_udiv_qrnnd_r __MPN(udiv_qrnnd_r)
2043 extern __longlong_h_C UWtype mpn_udiv_qrnnd_r (UWtype, UWtype, UWtype, UWtype *);
2044
2045 #if ! defined (udiv_qrnnd) && HAVE_NATIVE_mpn_udiv_qrnnd_r \
2046 && ! defined (LONGLONG_STANDALONE)
2047 #define udiv_qrnnd(q, r, n1, n0, d) \
2048 do { \
2049 UWtype __udiv_qrnnd_r; \
    (q) = mpn_udiv_qrnnd_r ((UWtype) (n1), (UWtype) (n0), (UWtype) (d),	\
2051 &__udiv_qrnnd_r); \
2052 (r) = __udiv_qrnnd_r; \
2053 } while (0)
2054 #endif
2055
2056
2057 /* If this machine has no inline assembler, use C macros. */
2058
2059 #if !defined (add_ssaaaa)
2060 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
2061 do { \
2062 UWtype __x; \
2063 UWtype __al = (al); \
2064 UWtype __bl = (bl); \
2065 __x = __al + __bl; \
2066 (sh) = (ah) + (bh) + (__x < __al); \
2067 (sl) = __x; \
2068 } while (0)
2069 #endif
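
/* An editorial worked example of the carry recovery above: __x = __al + __bl
   wraps around precisely when the true sum does not fit a word, so the sum
   is then smaller than either addend and (__x < __al) yields the carry bit.
   With 8-bit words, 0xf0 + 0x20 wraps to 0x10, and 0x10 < 0xf0 flags the
   carry.  The sub_ddmmss fallback below uses the symmetric borrow test
   (__al < __bl).  */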
2070
2071 #if !defined (sub_ddmmss)
2072 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
2073 do { \
2074 UWtype __x; \
2075 UWtype __al = (al); \
2076 UWtype __bl = (bl); \
2077 __x = __al - __bl; \
2078 (sh) = (ah) - (bh) - (__al < __bl); \
2079 (sl) = __x; \
2080 } while (0)
2081 #endif
2082
2083 /* If we lack umul_ppmm but have smul_ppmm, define umul_ppmm in terms of
2084 smul_ppmm. */
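/* Editorial sketch of why the correction below works, assuming two's
   complement W-bit words (W = W_TYPE_SIZE).  Writing uu/us for the unsigned
   and signed values of u, and uh for its top bit, we have uu = us + 2^W*uh,
   and likewise for v, so

     uu*vu = us*vs + 2^W*(uh*vu + vh*uu) - 2^(2W)*uh*vh.

   Modulo 2^(2W) the high words therefore differ by uh*vu + vh*uu (mod 2^W),
   and -(x >> (W - 1)) & y is an all-ones-mask trick that evaluates to y when
   the top bit of x is set and to 0 otherwise, i.e. exactly those two
   correction terms.  */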
2085 #if !defined (umul_ppmm) && defined (smul_ppmm)
2086 #define umul_ppmm(w1, w0, u, v) \
2087 do { \
2088 UWtype __w1; \
2089 UWtype __xm0 = (u), __xm1 = (v); \
2090 smul_ppmm (__w1, w0, __xm0, __xm1); \
2091 (w1) = __w1 + (-(__xm0 >> (W_TYPE_SIZE - 1)) & __xm1) \
2092 + (-(__xm1 >> (W_TYPE_SIZE - 1)) & __xm0); \
2093 } while (0)
2094 #endif
2095
2096 /* If we still don't have umul_ppmm, define it using plain C.
2097
   For reference, when this code is used for squaring (i.e. u and v are
   identical expressions), gcc recognises that __x1 and __x2 are the same
   and generates 3 multiplies, not 4.  The subsequent additions could be
   optimized a bit,
2101 but the only place GMP currently uses such a square is mpn_sqr_basecase,
2102 and chips obliged to use this generic C umul will have plenty of worse
2103 performance problems than a couple of extra instructions on the diagonal
2104 of sqr_basecase. */
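/* Editorial note on the split below, with B = 2^(W_TYPE_SIZE/2),
   u = __uh*B + __ul and v = __vh*B + __vl:

     u*v = __x3*B^2 + (__x1 + __x2)*B + __x0,

   where __x0 = __ul*__vl, __x1 = __ul*__vh, __x2 = __uh*__vl and
   __x3 = __uh*__vh.  Each partial product fits in a word, but folding __x2
   (and the high part of __x0) into __x1 can wrap; a wrap loses 2^W in a term
   of weight B, i.e. __ll_B in the high result word, which the (__x1 < __x2)
   test restores by adding __ll_B to __x3.  */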
2105
2106 #if !defined (umul_ppmm)
2107 #define umul_ppmm(w1, w0, u, v) \
2108 do { \
2109 UWtype __x0, __x1, __x2, __x3; \
2110 UHWtype __ul, __vl, __uh, __vh; \
2111 UWtype __u = (u), __v = (v); \
2112 \
2113 __ul = __ll_lowpart (__u); \
2114 __uh = __ll_highpart (__u); \
2115 __vl = __ll_lowpart (__v); \
2116 __vh = __ll_highpart (__v); \
2117 \
2118 __x0 = (UWtype) __ul * __vl; \
2119 __x1 = (UWtype) __ul * __vh; \
2120 __x2 = (UWtype) __uh * __vl; \
2121 __x3 = (UWtype) __uh * __vh; \
2122 \
2123 __x1 += __ll_highpart (__x0);/* this can't give carry */ \
2124 __x1 += __x2; /* but this indeed can */ \
2125 if (__x1 < __x2) /* did we get it? */ \
2126 __x3 += __ll_B; /* yes, add it in the proper pos. */ \
2127 \
2128 (w1) = __x3 + __ll_highpart (__x1); \
2129 (w0) = (__x1 << W_TYPE_SIZE/2) + __ll_lowpart (__x0); \
2130 } while (0)
2131 #endif
2132
/* If we don't have smul_ppmm, define it using umul_ppmm (which surely will
   exist in one form or another).  */
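/* Editorial note: this is the same identity sketched above for deriving
   umul_ppmm from smul_ppmm, solved for the signed high word instead, hence
   the two mask-and correction terms are subtracted rather than added.  */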
2135 #if !defined (smul_ppmm)
2136 #define smul_ppmm(w1, w0, u, v) \
2137 do { \
2138 UWtype __w1; \
2139 UWtype __xm0 = (u), __xm1 = (v); \
2140 umul_ppmm (__w1, w0, __xm0, __xm1); \
2141 (w1) = __w1 - (-(__xm0 >> (W_TYPE_SIZE - 1)) & __xm1) \
2142 - (-(__xm1 >> (W_TYPE_SIZE - 1)) & __xm0); \
2143 } while (0)
2144 #endif
2145
2146 /* Define this unconditionally, so it can be used for debugging. */
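/* Editorial note on the method: the divisor is split as d = __d1*__ll_B + __d0
   and the quotient is produced as two half-word digits.  Each digit is first
   estimated from a division by __d1 alone and then adjusted downwards at most
   twice by the "if (__r1 < __m)" / "if (__r0 < __m)" corrections; the usual
   normalization requirement (high bit of d set, see UDIV_NEEDS_NORMALIZATION
   below) is what keeps those estimates close enough.  */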
2147 #define __udiv_qrnnd_c(q, r, n1, n0, d) \
2148 do { \
2149 UWtype __d1, __d0, __q1, __q0, __r1, __r0, __m; \
2150 \
2151 ASSERT ((d) != 0); \
2152 ASSERT ((n1) < (d)); \
2153 \
2154 __d1 = __ll_highpart (d); \
2155 __d0 = __ll_lowpart (d); \
2156 \
2157 __q1 = (n1) / __d1; \
2158 __r1 = (n1) - __q1 * __d1; \
2159 __m = __q1 * __d0; \
2160 __r1 = __r1 * __ll_B | __ll_highpart (n0); \
2161 if (__r1 < __m) \
2162 { \
2163 __q1--, __r1 += (d); \
2164 if (__r1 >= (d)) /* i.e. we didn't get carry when adding to __r1 */\
2165 if (__r1 < __m) \
2166 __q1--, __r1 += (d); \
2167 } \
2168 __r1 -= __m; \
2169 \
2170 __q0 = __r1 / __d1; \
2171 __r0 = __r1 - __q0 * __d1; \
2172 __m = __q0 * __d0; \
2173 __r0 = __r0 * __ll_B | __ll_lowpart (n0); \
2174 if (__r0 < __m) \
2175 { \
2176 __q0--, __r0 += (d); \
2177 if (__r0 >= (d)) \
2178 if (__r0 < __m) \
2179 __q0--, __r0 += (d); \
2180 } \
2181 __r0 -= __m; \
2182 \
2183 (q) = __q1 * __ll_B | __q0; \
2184 (r) = __r0; \
2185 } while (0)
2186
2187 /* If the processor has no udiv_qrnnd but sdiv_qrnnd, go through
2188 __udiv_w_sdiv (defined in libgcc or elsewhere). */
2189 #if !defined (udiv_qrnnd) && defined (sdiv_qrnnd) \
2190 && ! defined (LONGLONG_STANDALONE)
2191 #define udiv_qrnnd(q, r, nh, nl, d) \
2192 do { \
2193 UWtype __r; \
    (q) = __MPN(udiv_w_sdiv) (&__r, (nh), (nl), (d));			\
2195 (r) = __r; \
2196 } while (0)
2197 __GMP_DECLSPEC UWtype __MPN(udiv_w_sdiv) (UWtype *, UWtype, UWtype, UWtype);
2198 #endif
2199
2200 /* If udiv_qrnnd was not defined for this processor, use __udiv_qrnnd_c. */
2201 #if !defined (udiv_qrnnd)
2202 #define UDIV_NEEDS_NORMALIZATION 1
2203 #define udiv_qrnnd __udiv_qrnnd_c
2204 #endif
2205
2206 #if !defined (count_leading_zeros)
2207 #define count_leading_zeros(count, x) \
2208 do { \
2209 UWtype __xr = (x); \
2210 UWtype __a; \
2211 \
2212 if (W_TYPE_SIZE == 32) \
2213 { \
2214 __a = __xr < ((UWtype) 1 << 2*__BITS4) \
2215 ? (__xr < ((UWtype) 1 << __BITS4) ? 1 : __BITS4 + 1) \
2216 : (__xr < ((UWtype) 1 << 3*__BITS4) ? 2*__BITS4 + 1 \
2217 : 3*__BITS4 + 1); \
2218 } \
2219 else \
2220 { \
2221 for (__a = W_TYPE_SIZE - 8; __a > 0; __a -= 8) \
2222 if (((__xr >> __a) & 0xff) != 0) \
2223 break; \
2224 ++__a; \
2225 } \
2226 \
2227 (count) = W_TYPE_SIZE + 1 - __a - __clz_tab[__xr >> __a]; \
2228 } while (0)
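/* Editorial note on the final line above: __a is chosen so that
   0 <= (__xr >> __a) <= 128, and __clz_tab[k] (the 129-entry table declared
   further below) is floor(log2(k)) + 2 for k >= 1 and 1 for k = 0.  If the
   most significant set bit of __xr is bit p, the lookup therefore yields
   p - __a + 2 and the count comes out as W_TYPE_SIZE - 1 - p, the number of
   leading zeros.  */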
2229 /* This version gives a well-defined value for zero. */
2230 #define COUNT_LEADING_ZEROS_0 (W_TYPE_SIZE - 1)
2231 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
2232 #define COUNT_LEADING_ZEROS_SLOW
2233 #endif
2234
2235 /* clz_tab needed by mpn/x86/pentium/mod_1.asm in a fat binary */
2236 #if HAVE_HOST_CPU_FAMILY_x86 && WANT_FAT_BINARY
2237 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
2238 #endif
2239
2240 #ifdef COUNT_LEADING_ZEROS_NEED_CLZ_TAB
2241 extern const unsigned char __GMP_DECLSPEC __clz_tab[129];
2242 #endif
2243
2244 #if !defined (count_trailing_zeros)
2245 #if !defined (COUNT_LEADING_ZEROS_SLOW)
2246 /* Define count_trailing_zeros using an asm count_leading_zeros. */
2247 #define count_trailing_zeros(count, x) \
2248 do { \
2249 UWtype __ctz_x = (x); \
2250 UWtype __ctz_c; \
2251 ASSERT (__ctz_x != 0); \
2252 count_leading_zeros (__ctz_c, __ctz_x & -__ctz_x); \
2253 (count) = W_TYPE_SIZE - 1 - __ctz_c; \
2254 } while (0)
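/* Editorial note: in two's complement, __ctz_x & -__ctz_x isolates the lowest
   set bit.  If that bit is bit t, count_leading_zeros on it returns
   W_TYPE_SIZE - 1 - t, so subtracting from W_TYPE_SIZE - 1 recovers t, the
   trailing zero count.  The plain C variant below relies on the same
   bit-isolation trick but indexes __clz_tab directly.  */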
2255 #else
2256 /* Define count_trailing_zeros in plain C, assuming small counts are common.
2257 We use clz_tab without ado, since the C count_leading_zeros above will have
2258 pulled it in. */
2259 #define count_trailing_zeros(count, x) \
2260 do { \
2261 UWtype __ctz_x = (x); \
2262 int __ctz_c; \
2263 \
2264 if (LIKELY ((__ctz_x & 0xff) != 0)) \
2265 (count) = __clz_tab[__ctz_x & -__ctz_x] - 2; \
2266 else \
2267 { \
2268 for (__ctz_c = 8 - 2; __ctz_c < W_TYPE_SIZE - 2; __ctz_c += 8) \
2269 { \
2270 __ctz_x >>= 8; \
2271 if (LIKELY ((__ctz_x & 0xff) != 0)) \
2272 break; \
2273 } \
2274 \
2275 (count) = __ctz_c + __clz_tab[__ctz_x & -__ctz_x]; \
2276 } \
2277 } while (0)
2278 #endif
2279 #endif
2280
2281 #ifndef UDIV_NEEDS_NORMALIZATION
2282 #define UDIV_NEEDS_NORMALIZATION 0
2283 #endif
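
/* Illustrative sketch (deliberately not compiled in): one way a caller might
   drive udiv_qrnnd when UDIV_NEEDS_NORMALIZATION is 1, by shifting the
   divisor up until its high bit is set and shifting the remainder back down
   afterwards.  The wrapper function itself is hypothetical; only the macros
   it uses come from this file (ASSERT is from gmp-impl.h).  */
#if 0
static void
example_udiv (UWtype *qp, UWtype *rp, UWtype n1, UWtype n0, UWtype d)
{
  UWtype q, r;
  int cnt = 0;

  if (UDIV_NEEDS_NORMALIZATION)
    {
      count_leading_zeros (cnt, d);
      if (cnt != 0)
        {
          d <<= cnt;
          n1 = (n1 << cnt) | (n0 >> (W_TYPE_SIZE - cnt));
          n0 <<= cnt;
        }
    }
  ASSERT (n1 < d);      /* required by udiv_qrnnd in any case */
  udiv_qrnnd (q, r, n1, n0, d);
  *qp = q;              /* the quotient needs no adjustment */
  *rp = r >> cnt;       /* undo the scaling of the remainder */
}
#endif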
2284
/* Whether udiv_qrnnd is actually implemented with udiv_qrnnd_preinv, and
   hence whether the latter should always be used.  */
2287 #ifndef UDIV_PREINV_ALWAYS
2288 #define UDIV_PREINV_ALWAYS 0
2289 #endif