1 /* hgcd2-div.h
2
3 THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY
4 SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
5 GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
6
7 Copyright 1996, 1998, 2000-2004, 2008, 2012, 2019, 2020 Free Software
8 Foundation, Inc.
9
10 This file is part of the GNU MP Library.
11
12 The GNU MP Library is free software; you can redistribute it and/or modify
13 it under the terms of either:
14
15 * the GNU Lesser General Public License as published by the Free
16 Software Foundation; either version 3 of the License, or (at your
17 option) any later version.
18
19 or
20
21 * the GNU General Public License as published by the Free Software
22 Foundation; either version 2 of the License, or (at your option) any
23 later version.
24
25 or both in parallel, as here.
26
27 The GNU MP Library is distributed in the hope that it will be useful, but
28 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
29 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
30 for more details.
31
32 You should have received copies of the GNU General Public License and the
33 GNU Lesser General Public License along with the GNU MP Library. If not,
34 see https://www.gnu.org/licenses/. */
35
36 #include "gmp-impl.h"
37 #include "longlong.h"
38
39 #ifndef HGCD2_DIV1_METHOD
40 #define HGCD2_DIV1_METHOD 3
41 #endif
42
43 #ifndef HGCD2_DIV2_METHOD
44 #define HGCD2_DIV2_METHOD 2
45 #endif
46
47 #if HAVE_NATIVE_mpn_div_11
48
49 #define div1 mpn_div_11
50 /* Single-limb division optimized for small quotients.
51 Returned value holds d0 = r, d1 = q. */
52 mp_double_limb_t div1 (mp_limb_t, mp_limb_t);
53
54 #elif HGCD2_DIV1_METHOD == 1
55
56 static inline mp_double_limb_t
57 div1 (mp_limb_t n0, mp_limb_t d0)
58 {
59 mp_double_limb_t res;
60 res.d1 = n0 / d0;
61 res.d0 = n0 - res.d1 * d0;
62
63 return res;
64 }
65
66 #elif HGCD2_DIV1_METHOD == 2
67
68 static mp_double_limb_t
69 div1 (mp_limb_t n0, mp_limb_t d0)
70 {
71 mp_double_limb_t res;
72 int ncnt, dcnt, cnt;
73 mp_limb_t q;
74 mp_limb_t mask;
75
76 ASSERT (n0 >= d0);
77
78 count_leading_zeros (ncnt, n0);
79 count_leading_zeros (dcnt, d0);
80 cnt = dcnt - ncnt;
81
82 d0 <<= cnt;
83
84 q = -(mp_limb_t) (n0 >= d0);
85 n0 -= d0 & q;
86 d0 >>= 1;
87 q = -q;
88
89 while (--cnt >= 0)
90 {
91 mask = -(mp_limb_t) (n0 >= d0);
92 n0 -= d0 & mask;
93 d0 >>= 1;
94 q = (q << 1) - mask;
95 }
96
97 res.d0 = n0;
98 res.d1 = q;
99 return res;
100 }
101
102 #elif HGCD2_DIV1_METHOD == 3
103
104 static inline mp_double_limb_t
105 div1 (mp_limb_t n0, mp_limb_t d0)
106 {
107 mp_double_limb_t res;
108 if (UNLIKELY ((d0 >> (GMP_LIMB_BITS - 3)) != 0)
109 || UNLIKELY (n0 >= (d0 << 3)))
110 {
111 res.d1 = n0 / d0;
112 res.d0 = n0 - res.d1 * d0;
113 }
114 else
115 {
116 mp_limb_t q, mask;
117
118 d0 <<= 2;
119
120 mask = -(mp_limb_t) (n0 >= d0);
121 n0 -= d0 & mask;
122 q = 4 & mask;
123
124 d0 >>= 1;
125 mask = -(mp_limb_t) (n0 >= d0);
126 n0 -= d0 & mask;
127 q += 2 & mask;
128
129 d0 >>= 1;
130 mask = -(mp_limb_t) (n0 >= d0);
131 n0 -= d0 & mask;
132 q -= mask;
133
134 res.d0 = n0;
135 res.d1 = q;
136 }
137 return res;
138 }
139
140 #elif HGCD2_DIV1_METHOD == 4
141
142 /* Table quotients. We extract the NBITS most significant bits of the
143 numerator limb, and the corresponding bits from the divisor limb, and use
144 these to form an index into the table. This method is probably only useful
145 for short pipelines with slow multiplication.
146
147 Possible improvements:
148
149 * Perhaps extract the highest NBITS of the divisor instead of the same bits
150 as from the numerator. That would require another count_leading_zeros,
151 and a post-multiply shift of the quotient.
152
153 * Compress tables? Their values are tiny, and there are lots of zero
154 entries (which are never used).
155
156 * Round the table entries more cleverly?
157 */
158
159 #ifndef NBITS
160 #define NBITS 5
161 #endif
162
163 #if NBITS == 5
164 /* This needs full division about 13.2% of the time. */
165 static const unsigned char tab[512] = {
166 17, 9, 5,4,3,2,2,2,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
167 18, 9, 6,4,3,2,2,2,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
168 19,10, 6,4,3,3,2,2,2,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,
169 20,10, 6,5,3,3,2,2,2,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,
170 21,11, 7,5,4,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,
171 22,11, 7,5,4,3,3,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,
172 23,12, 7,5,4,3,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,
173 24,12, 8,6,4,3,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,
174 25,13, 8,6,5,4,3,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,
175 26,13, 8,6,5,4,3,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,
176 27,14, 9,6,5,4,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,
177 28,14, 9,7,5,4,3,3,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,
178 29,15,10,7,5,4,4,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,
179 30,15,10,7,6,5,4,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,
180 31,16,10,7,6,5,4,3,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,
181 32,16,11,8,6,5,4,3,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
182 };
183 #elif NBITS == 6
184 /* This needs full division about 9.8% of the time. */
185 static const unsigned char tab[2048] = {
186 33,17,11, 8, 6, 5,4,4,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
187 1, 0, 0, 0, 0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
188 34,17,11, 8, 6, 5,4,4,3,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
189 1, 1, 0, 0, 0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
190 35,18,12, 9, 7, 5,5,4,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
191 1, 1, 1, 0, 0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
192 36,18,12, 9, 7, 6,5,4,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
193 1, 1, 1, 1, 0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
194 37,19,13, 9, 7, 6,5,4,4,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
195 1, 1, 1, 1, 1, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
196 38,19,13, 9, 7, 6,5,4,4,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
197 1, 1, 1, 1, 1, 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
198 39,20,13,10, 7, 6,5,4,4,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,
199 1, 1, 1, 1, 1, 1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
200 40,20,14,10, 8, 6,5,5,4,3,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,
201 1, 1, 1, 1, 1, 1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
202 41,21,14,10, 8, 6,5,5,4,4,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,
203 1, 1, 1, 1, 1, 1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
204 42,21,14,10, 8, 7,6,5,4,4,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,
205 1, 1, 1, 1, 1, 1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
206 43,22,15,11, 8, 7,6,5,4,4,3,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,
207 1, 1, 1, 1, 1, 1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
208 44,22,15,11, 9, 7,6,5,4,4,3,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,
209 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
210 45,23,15,11, 9, 7,6,5,5,4,4,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,
211 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
212 46,23,16,11, 9, 7,6,5,5,4,4,3,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,
213 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
214 47,24,16,12, 9, 7,6,5,5,4,4,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,
215 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
216 48,24,16,12, 9, 8,6,6,5,4,4,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,
217 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
218 49,25,17,12,10, 8,7,6,5,4,4,4,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,
219 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
220 50,25,17,13,10, 8,7,6,5,5,4,4,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,
221 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
222 51,26,18,13,10, 8,7,6,5,5,4,4,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,
223 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,
224 52,26,18,13,10, 8,7,6,5,5,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,
225 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,
226 53,27,18,13,10, 9,7,6,5,5,4,4,4,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1,1,
227 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,
228 54,27,19,14,11, 9,7,6,6,5,4,4,4,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1,1,
229 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,
230 55,28,19,14,11, 9,7,6,6,5,5,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1,
231 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,
232 56,28,19,14,11, 9,8,7,6,5,5,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1,
233 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,
234 57,29,20,14,11, 9,8,7,6,5,5,4,4,4,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1,1,1,
235 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,
236 58,29,20,15,11, 9,8,7,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,
237 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,
238 59,30,20,15,12,10,8,7,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1,1,
239 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,
240 60,30,21,15,12,10,8,7,6,6,5,5,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1,1,
241 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,
242 61,31,21,15,12,10,8,7,6,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1,
243 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,
244 62,31,22,16,12,10,9,7,6,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1,
245 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,
246 63,32,22,16,13,10,9,7,7,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,2,1,
247 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,
248 64,32,22,16,13,10,9,8,7,6,5,5,4,4,4,3,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,
249 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
250 };
251 #else
252 #error No table for provided NBITS
253 #endif
254
255 /* Doing tabp with a #define makes compiler warnings about pointing outside an
256 object go away. We used to define this as a variable. It is not clear if
257 e.g. (vector[100] - 10) + 10 is well- defined as per the C standard;
258 (vector[100] + 10) - 10 surely is and there is no sequence point so the
259 expressions should be equivalent. To make this safe, we might want to
260 define tabp as a macro with the index as an argument. Depending on the
261 platform, relocs might allow for assembly-time or linker-time resolution to
262 take place. */
263 #define tabp (tab - (1 << (NBITS - 1) << NBITS))
264
265 static inline mp_double_limb_t
266 div1 (mp_limb_t n0, mp_limb_t d0)
267 {
268 int ncnt;
269 size_t nbi, dbi;
270 mp_limb_t q0;
271 mp_limb_t r0;
272 mp_limb_t mask;
273 mp_double_limb_t res;
274
275 ASSERT (n0 >= d0); /* Actually only msb position is critical. */
276
277 count_leading_zeros (ncnt, n0);
278 nbi = n0 << ncnt >> (GMP_LIMB_BITS - NBITS);
279 dbi = d0 << ncnt >> (GMP_LIMB_BITS - NBITS);
280
281 q0 = tabp[(nbi << NBITS) + dbi];
282 r0 = n0 - q0 * d0;
283 mask = -(mp_limb_t) (r0 >= d0);
284 q0 -= mask;
285 r0 -= d0 & mask;
286
287 if (UNLIKELY (r0 >= d0))
288 {
289 q0 = n0 / d0;
290 r0 = n0 - q0 * d0;
291 }
292
293 res.d1 = q0;
294 res.d0 = r0;
295 return res;
296 }
297
298 #elif HGCD2_DIV1_METHOD == 5
299
300 /* Table inverses of divisors. We don't bother with suppressing the msb from
301 the tables. We index with the NBITS most significant divisor bits,
302 including the always-set highest bit, but use addressing trickery via tabp
303 to suppress it.
304
305 Possible improvements:
306
307 * Do first multiply using 32-bit operations on 64-bit computers. At least
308 on most Arm64 cores, that uses 3 times less resources. It also saves on
309 many x86-64 processors.
310 */
311
312 #ifndef NBITS
313 #define NBITS 7
314 #endif
315
316 #if NBITS == 5
317 /* This needs full division about 1.63% of the time. */
318 static const unsigned char tab[16] = {
319 63, 59, 55, 52, 50, 47, 45, 43, 41, 39, 38, 36, 35, 34, 33, 32
320 };
321 #elif NBITS == 6
322 /* This needs full division about 0.93% of the time. */
323 static const unsigned char tab[32] = {
324 127,123,119,116,112,109,106,104,101, 98, 96, 94, 92, 90, 88, 86,
325 84, 82, 80, 79, 77, 76, 74, 73, 72, 70, 69, 68, 67, 66, 65, 64
326 };
327 #elif NBITS == 7
328 /* This needs full division about 0.49% of the time. */
329 static const unsigned char tab[64] = {
330 255,251,247,243,239,236,233,229,226,223,220,217,214,211,209,206,
331 203,201,198,196,194,191,189,187,185,183,181,179,177,175,173,171,
332 169,167,166,164,162,161,159,158,156,155,153,152,150,149,147,146,
333 145,143,142,141,140,139,137,136,135,134,133,132,131,130,129,128
334 };
335 #elif NBITS == 8
336 /* This needs full division about 0.26% of the time. */
337 static const unsigned short tab[128] = {
338 511,507,503,499,495,491,488,484,480,477,473,470,467,463,460,457,
339 454,450,447,444,441,438,435,433,430,427,424,421,419,416,413,411,
340 408,406,403,401,398,396,393,391,389,386,384,382,380,377,375,373,
341 371,369,367,365,363,361,359,357,355,353,351,349,347,345,343,342,
342 340,338,336,335,333,331,329,328,326,325,323,321,320,318,317,315,
343 314,312,311,309,308,306,305,303,302,301,299,298,296,295,294,292,
344 291,290,288,287,286,285,283,282,281,280,279,277,276,275,274,273,
345 272,270,269,268,267,266,265,264,263,262,261,260,259,258,257,256
346 };
347 #else
348 #error No table for provided NBITS
349 #endif
350
351 /* Doing tabp with a #define makes compiler warnings about pointing outside an
352 object go away. We used to define this as a variable. It is not clear if
353 e.g. (vector[100] - 10) + 10 is well- defined as per the C standard;
354 (vector[100] + 10) - 10 surely is and there is no sequence point so the
355 expressions should be equivalent. To make this safe, we might want to
356 define tabp as a macro with the index as an argument. Depending on the
357 platform, relocs might allow for assembly-time or linker-time resolution to
358 take place. */
359 #define tabp (tab - (1 << (NBITS - 1)))
360
361 static inline mp_double_limb_t
362 div1 (mp_limb_t n0, mp_limb_t d0)
363 {
364 int ncnt, dcnt;
365 size_t dbi;
366 mp_limb_t inv;
367 mp_limb_t q0;
368 mp_limb_t r0;
369 mp_limb_t mask;
370 mp_double_limb_t res;
371
372 count_leading_zeros (ncnt, n0);
373 count_leading_zeros (dcnt, d0);
374
375 dbi = d0 << dcnt >> (GMP_LIMB_BITS - NBITS);
376 inv = tabp[dbi];
377 q0 = ((n0 << ncnt) >> (NBITS + 1)) * inv >> (GMP_LIMB_BITS - 1 + ncnt - dcnt);
378 r0 = n0 - q0 * d0;
379 mask = -(mp_limb_t) (r0 >= d0);
380 q0 -= mask;
381 r0 -= d0 & mask;
382
383 if (UNLIKELY (r0 >= d0))
384 {
385 q0 = n0 / d0;
386 r0 = n0 - q0 * d0;
387 }
388
389 res.d1 = q0;
390 res.d0 = r0;
391 return res;
392 }
393
394 #else
395 #error Unknown HGCD2_DIV1_METHOD
396 #endif
397
398 #if HAVE_NATIVE_mpn_div_22
399
400 #define div2 mpn_div_22
401 /* Two-limb division optimized for small quotients. */
402 mp_limb_t div2 (mp_ptr, mp_limb_t, mp_limb_t, mp_limb_t, mp_limb_t);
403
404 #elif HGCD2_DIV2_METHOD == 1
405
406 static mp_limb_t
407 div2 (mp_ptr rp,
408 mp_limb_t n1, mp_limb_t n0,
409 mp_limb_t d1, mp_limb_t d0)
410 {
411 mp_double_limb_t rq = div1 (n1, d1);
412 if (UNLIKELY (rq.d1 > d1))
413 {
414 mp_limb_t n2, q, t1, t0;
415 int c;
416
417 /* Normalize */
418 count_leading_zeros (c, d1);
419 ASSERT (c > 0);
420
421 n2 = n1 >> (GMP_LIMB_BITS - c);
422 n1 = (n1 << c) | (n0 >> (GMP_LIMB_BITS - c));
423 n0 <<= c;
424 d1 = (d1 << c) | (d0 >> (GMP_LIMB_BITS - c));
425 d0 <<= c;
426
427 udiv_qrnnd (q, n1, n2, n1, d1);
428 umul_ppmm (t1, t0, q, d0);
429 if (t1 > n1 || (t1 == n1 && t0 > n0))
430 {
431 ASSERT (q > 0);
432 q--;
433 sub_ddmmss (t1, t0, t1, t0, d1, d0);
434 }
435 sub_ddmmss (n1, n0, n1, n0, t1, t0);
436
437 /* Undo normalization */
438 rp[0] = (n0 >> c) | (n1 << (GMP_LIMB_BITS - c));
439 rp[1] = n1 >> c;
440
441 return q;
442 }
443 else
444 {
445 mp_limb_t q, t1, t0;
446 n1 = rq.d0;
447 q = rq.d1;
448 umul_ppmm (t1, t0, q, d0);
449 if (UNLIKELY (t1 >= n1) && (t1 > n1 || t0 > n0))
450 {
451 ASSERT (q > 0);
452 q--;
453 sub_ddmmss (t1, t0, t1, t0, d1, d0);
454 }
455 sub_ddmmss (rp[1], rp[0], n1, n0, t1, t0);
456 return q;
457 }
458 }
459
460 #elif HGCD2_DIV2_METHOD == 2
461
462 /* Bit-wise div2. Relies on fast count_leading_zeros. */
463 static mp_limb_t
464 div2 (mp_ptr rp,
465 mp_limb_t n1, mp_limb_t n0,
466 mp_limb_t d1, mp_limb_t d0)
467 {
468 mp_limb_t q = 0;
469 int ncnt;
470 int dcnt;
471
472 count_leading_zeros (ncnt, n1);
473 count_leading_zeros (dcnt, d1);
474 dcnt -= ncnt;
475
476 d1 = (d1 << dcnt) + (d0 >> 1 >> (GMP_LIMB_BITS - 1 - dcnt));
477 d0 <<= dcnt;
478
479 do
480 {
481 mp_limb_t mask;
482 q <<= 1;
483 if (UNLIKELY (n1 == d1))
484 mask = -(n0 >= d0);
485 else
486 mask = -(n1 > d1);
487
488 q -= mask;
489
490 sub_ddmmss (n1, n0, n1, n0, mask & d1, mask & d0);
491
492 d0 = (d1 << (GMP_LIMB_BITS - 1)) | (d0 >> 1);
493 d1 = d1 >> 1;
494 }
495 while (dcnt--);
496
497 rp[0] = n0;
498 rp[1] = n1;
499
500 return q;
501 }
502 #else
503 #error Unknown HGCD2_DIV2_METHOD
504 #endif