/* mpn_div_qr_1n_pi1

   Contributed to the GNU project by Niels Möller

   THIS FILE CONTAINS INTERNAL FUNCTIONS WITH MUTABLE INTERFACES.  IT IS ONLY
   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.


Copyright 2013 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */

#include "gmp-impl.h"
#include "longlong.h"

#if GMP_NAIL_BITS > 0
#error Nail bits not supported
#endif

#ifndef DIV_QR_1N_METHOD
#define DIV_QR_1N_METHOD 2
#endif

/* FIXME: Duplicated in mod_1_1.c. Move to gmp-impl.h */

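/* add_mssaaaa(m, s1, s0, a1, a0, b1, b0) computes the two-limb sum
   (s1,s0) = (a1,a0) + (b1,b0) and sets m to the negated carry out of the
   high limb: m = 0 when the sum fits in two limbs, m = ~0 (all ones)
   otherwise.  The assembly variants below and the generic C fallback all
   implement this contract; the division loops further down use m as a
   mask.  */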
#if defined (__GNUC__) && ! defined (NO_ASM)

#if HAVE_HOST_CPU_FAMILY_x86 && W_TYPE_SIZE == 32
#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0) \
  __asm__ ( "add %6, %k2\n\t" \
            "adc %4, %k1\n\t" \
            "sbb %k0, %k0" \
          : "=r" (m), "=r" (s1), "=&r" (s0) \
          : "1" ((USItype)(a1)), "g" ((USItype)(b1)), \
            "%2" ((USItype)(a0)), "g" ((USItype)(b0)))
#endif

#if HAVE_HOST_CPU_FAMILY_x86_64 && W_TYPE_SIZE == 64
#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0) \
  __asm__ ( "add %6, %q2\n\t" \
            "adc %4, %q1\n\t" \
            "sbb %q0, %q0" \
          : "=r" (m), "=r" (s1), "=&r" (s0) \
          : "1" ((UDItype)(a1)), "rme" ((UDItype)(b1)), \
            "%2" ((UDItype)(a0)), "rme" ((UDItype)(b0)))
#endif

#if defined (__sparc__) && W_TYPE_SIZE == 32
#define add_mssaaaa(m, sh, sl, ah, al, bh, bl) \
  __asm__ ( "addcc %r5, %6, %2\n\t" \
            "addxcc %r3, %4, %1\n\t" \
            "subx %%g0, %%g0, %0" \
          : "=r" (m), "=r" (sh), "=&r" (sl) \
          : "rJ" (ah), "rI" (bh), "%rJ" (al), "rI" (bl) \
            __CLOBBER_CC)
#endif

#if defined (__sparc__) && W_TYPE_SIZE == 64
#define add_mssaaaa(m, sh, sl, ah, al, bh, bl) \
  __asm__ ( "addcc %r5, %6, %2\n\t" \
            "addccc %r7, %8, %%g0\n\t" \
            "addccc %r3, %4, %1\n\t" \
            "clr %0\n\t" \
            "movcs %%xcc, -1, %0" \
          : "=r" (m), "=r" (sh), "=&r" (sl) \
          : "rJ" (ah), "rI" (bh), "%rJ" (al), "rI" (bl), \
            "rJ" ((al) >> 32), "rI" ((bl) >> 32) \
            __CLOBBER_CC)
#if __VIS__ >= 0x300
#undef add_mssaaaa
#define add_mssaaaa(m, sh, sl, ah, al, bh, bl) \
  __asm__ ( "addcc %r5, %6, %2\n\t" \
            "addxccc %r3, %4, %1\n\t" \
            "clr %0\n\t" \
            "movcs %%xcc, -1, %0" \
          : "=r" (m), "=r" (sh), "=&r" (sl) \
          : "rJ" (ah), "rI" (bh), "%rJ" (al), "rI" (bl) \
            __CLOBBER_CC)
#endif
#endif

#if HAVE_HOST_CPU_FAMILY_powerpc && !defined (_LONG_LONG_LIMB)
/* This works fine for 32-bit and 64-bit limbs, except for 64-bit limbs with a
   processor running in 32-bit mode, since the carry flag then gets the 32-bit
   carry.  */
#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0) \
  __asm__ ( "add%I6c %2, %5, %6\n\t" \
            "adde %1, %3, %4\n\t" \
            "subfe %0, %0, %0\n\t" \
            "nor %0, %0, %0" \
          : "=r" (m), "=r" (s1), "=&r" (s0) \
          : "r" (a1), "r" (b1), "%r" (a0), "rI" (b0) \
            __CLOBBER_CC)
#endif

#if defined (__s390x__) && W_TYPE_SIZE == 64
#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0) \
  __asm__ ( "algr %2, %6\n\t" \
            "alcgr %1, %4\n\t" \
            "lghi %0, 0\n\t" \
            "alcgr %0, %0\n\t" \
            "lcgr %0, %0" \
          : "=r" (m), "=r" (s1), "=&r" (s0) \
          : "1" ((UDItype)(a1)), "r" ((UDItype)(b1)), \
            "%2" ((UDItype)(a0)), "r" ((UDItype)(b0)) __CLOBBER_CC)
#endif

#if defined (__arm__) && !defined (__thumb__) && W_TYPE_SIZE == 32
#define add_mssaaaa(m, sh, sl, ah, al, bh, bl) \
  __asm__ ( "adds %2, %5, %6\n\t" \
            "adcs %1, %3, %4\n\t" \
            "movcc %0, #0\n\t" \
            "movcs %0, #-1" \
          : "=r" (m), "=r" (sh), "=&r" (sl) \
          : "r" (ah), "rI" (bh), "%r" (al), "rI" (bl) __CLOBBER_CC)
#endif

#if defined (__aarch64__) && W_TYPE_SIZE == 64
#define add_mssaaaa(m, sh, sl, ah, al, bh, bl) \
  __asm__ ( "adds %2, %x5, %6\n\t" \
            "adcs %1, %x3, %x4\n\t" \
            "csinv %0, xzr, xzr, cc\n\t" \
          : "=r" (m), "=r" (sh), "=&r" (sl) \
          : "rZ" (ah), "rZ" (bh), "%rZ" (al), "rI" (bl) __CLOBBER_CC)
#endif
#endif /* defined (__GNUC__) */

#ifndef add_mssaaaa
#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0) \
  do { \
    UWtype __s0, __s1, __c0, __c1; \
    __s0 = (a0) + (b0); \
    __s1 = (a1) + (b1); \
    __c0 = __s0 < (a0); \
    __c1 = __s1 < (a1); \
    (s0) = __s0; \
    __s1 = __s1 + __c0; \
    (s1) = __s1; \
    (m) = - (__c1 + (__s1 < __c0)); \
  } while (0)
#endif

#if DIV_QR_1N_METHOD == 1

/* Divides (uh B^n + {up, n}) by d, storing the quotient at {qp, n}.
   Requires that uh < d.  */
mp_limb_t
mpn_div_qr_1n_pi1 (mp_ptr qp, mp_srcptr up, mp_size_t n, mp_limb_t uh,
                   mp_limb_t d, mp_limb_t dinv)
{
  ASSERT (n > 0);
  ASSERT (uh < d);
  ASSERT (d & GMP_NUMB_HIGHBIT);
  ASSERT (MPN_SAME_OR_SEPARATE_P (qp, up, n));

  do
    {
      mp_limb_t q, ul;

      ul = up[--n];
      udiv_qrnnd_preinv (q, uh, uh, ul, d, dinv);
      qp[n] = q;
    }
  while (n > 0);

  return uh;
}

#elif DIV_QR_1N_METHOD == 2

/* The main idea of this algorithm is to write B^2 = d (B + dinv) +
   B2, where 1 <= B2 < d.  Similarly to mpn_mod_1_1p, each iteration
   can then replace

     u1 B^2 = u1 B2 (mod d)

   which gives a very short critical path for computing the remainder
   (with some tricks to handle the carry when the next two lower limbs
   are added in).  To also get the quotient, include the corresponding
   multiple of d in the expression,

     u1 B^2 = u1 B2 + (u1 dinv + u1 B) d

   We get the quotient by accumulating the (u1 dinv + u1 B) terms.  The
   two multiplies, u1 * B2 and u1 * dinv, are independent, and can be
   executed in parallel.
*/
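
/* Illustrative toy example of the identity above (not part of the
   algorithm; it uses a hypothetical 4-bit limb, B = 16, instead of a real
   GMP limb).  Take d = 13, so dinv = floor((B^2-1)/d) - B = 3 and
   B2 = B^2 - d (B + dinv) = 256 - 13*19 = 9, with 1 <= 9 < 13.  For u1 = 5,

     u1 B^2 = 1280 = 45 + 95*13 = u1 B2 + (u1 dinv + u1 B) d

   so replacing u1 B^2 by u1 B2 = 45 leaves the value unchanged mod d,
   while u1 dinv + u1 B = 95 is the corresponding quotient contribution.  */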
mp_limb_t
mpn_div_qr_1n_pi1 (mp_ptr qp, mp_srcptr up, mp_size_t n, mp_limb_t u1,
                   mp_limb_t d, mp_limb_t dinv)
{
  mp_limb_t B2;
  mp_limb_t u0, u2;
  mp_limb_t q0, q1;
  mp_limb_t p0, p1;
  mp_limb_t t;
  mp_size_t j;

  ASSERT (d & GMP_LIMB_HIGHBIT);
  ASSERT (n > 0);
  ASSERT (u1 < d);

  if (n == 1)
    {
      udiv_qrnnd_preinv (qp[0], u1, u1, up[0], d, dinv);
      return u1;
    }

  /* FIXME: Could be precomputed */
  B2 = -d*dinv;
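  /* Reducing B^2 = d (B + dinv) + B2 modulo B gives d dinv + B2 = 0 (mod B),
     so the single-limb product -d*dinv above is exactly B2, since B2 < B.  */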

  umul_ppmm (q1, q0, dinv, u1);
  umul_ppmm (p1, p0, B2, u1);
  q1 += u1;
  ASSERT (q1 >= u1);
  u0 = up[n-1];  /* Early read, to allow qp == up. */
  qp[n-1] = q1;

  add_mssaaaa (u2, u1, u0, u0, up[n-2], p1, p0);
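  /* u2 is the mask output of add_mssaaaa: 0 if the two-limb sum fits, ~0 on
     carry out.  Inside the loop it is only used masked (u2 & B2, u2 & dinv)
     or negated (-u2).  */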

  /* FIXME: Keep q1 in a variable between iterations, to reduce number
     of memory accesses. */
  for (j = n-2; j-- > 0; )
    {
      mp_limb_t q2, cy;

      /* Additions for the q update:
       *        +-------+
       *        |u1 * v |
       *        +---+---+
       *        | u1|
       *    +---+---+
       *    | 1 | v |  (conditional on u2)
       *    +---+---+
       *        | 1 |  (conditional on u0 + u2 B2 carry)
       *        +---+
       * +      | q0|
       *   -+---+---+---+
       *    | q2| q1| q0|
       *    +---+---+---+
       */
      umul_ppmm (p1, t, u1, dinv);
      ADDC_LIMB (cy, u0, u0, u2 & B2);
      u0 -= (-cy) & d;
      add_ssaaaa (q2, q1, -u2, u2 & dinv, CNST_LIMB(0), u1);
      add_ssaaaa (q2, q1, q2, q1, CNST_LIMB(0), q0);
      q0 = t;

      /* Note that p1 + cy cannot overflow */
      add_ssaaaa (q2, q1, q2, q1, CNST_LIMB(0), p1 + cy);

      umul_ppmm (p1, p0, u1, B2);

      qp[j+1] = q1;
      MPN_INCR_U (qp+j+2, n-j-2, q2);

      add_mssaaaa (u2, u1, u0, u0, up[j], p1, p0);
    }

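  /* Final reduction: fold the pending carry mask u2 and, if still needed, one
     further multiple of d into the remainder, counting each subtraction of d
     in q1, then divide out the last limb.  */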
  q1 = (u2 > 0);
  u1 -= (-q1) & d;

  t = (u1 >= d);
  q1 += t;
  u1 -= (-t) & d;

  udiv_qrnnd_preinv (t, u0, u1, u0, d, dinv);
  add_ssaaaa (q1, q0, q1, q0, CNST_LIMB(0), t);

  MPN_INCR_U (qp+1, n-1, q1);

  qp[0] = q0;
  return u0;
}

#elif DIV_QR_1N_METHOD == 3

/* This variant handles carry from the u update earlier.  This gives a
   longer critical path, but reduces the work needed for the
   quotients.  */
mp_limb_t
mpn_div_qr_1n_pi1 (mp_ptr qp, mp_srcptr up, mp_size_t n, mp_limb_t u1,
                   mp_limb_t d, mp_limb_t dinv)
{
  mp_limb_t B2;
  mp_limb_t cy, u0;
  mp_limb_t q0, q1;
  mp_limb_t p0, p1;
  mp_limb_t t;
  mp_size_t j;

  ASSERT (d & GMP_LIMB_HIGHBIT);
  ASSERT (n > 0);
  ASSERT (u1 < d);

  if (n == 1)
    {
      udiv_qrnnd_preinv (qp[0], u1, u1, up[0], d, dinv);
      return u1;
    }

  /* FIXME: Could be precomputed */
  B2 = -d*dinv;

  umul_ppmm (q1, q0, dinv, u1);
  umul_ppmm (p1, p0, B2, u1);
  q1 += u1;
  ASSERT (q1 >= u1);
  u0 = up[n-1];  /* Early read, to allow qp == up. */

  add_mssaaaa (cy, u1, u0, u0, up[n-2], p1, p0);
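  /* cy is the add_mssaaaa mask, 0 or ~0.  When it is set, the two lines below
     subtract d from the remainder and add 1 to the quotient (q1 -= cy),
     handling the carry immediately rather than deferring it as in method 2.  */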
  u1 -= cy & d;
  q1 -= cy;
  qp[n-1] = q1;

  /* FIXME: Keep q1 in a variable between iterations, to reduce number
     of memory accesses. */
  for (j = n-2; j-- > 0; )
    {
      mp_limb_t q2, cy;
      mp_limb_t t1, t0;

      /* Additions for the q update:
       *        +-------+
       *        |u1 * v |
       *        +---+---+
       *        | u1|
       *        +---+
       *        | 1 |  (conditional on {u1, u0} carry)
       *        +---+
       * +      | q0|
       *   -+---+---+---+
       *    | q2| q1| q0|
       *    +---+---+---+
       *
       * Additions for the u update:
       *        +-------+
       *        |u1 * B2|
       *        +---+---+
       *      + |u0 |u-1|
       *        +---+---+
       *      - | d |  (conditional on carry)
       *       -+---+---+
       *        |u1 | u0|
       *        +---+---+
       *
       */
      umul_ppmm (p1, p0, u1, B2);
      ADDC_LIMB (q2, q1, u1, q0);
      umul_ppmm (t1, t0, u1, dinv);
      add_mssaaaa (cy, u1, u0, u0, up[j], p1, p0);
      u1 -= cy & d;

      /* t1 <= B-2, so cy can be added in without overflow. */
      add_ssaaaa (q2, q1, q2, q1, CNST_LIMB(0), t1 - cy);
      q0 = t0;

      /* Final q update */
      qp[j+1] = q1;
      MPN_INCR_U (qp+j+2, n-j-2, q2);
    }

  q1 = (u1 >= d);
  u1 -= (-q1) & d;

  udiv_qrnnd_preinv (t, u0, u1, u0, d, dinv);
  add_ssaaaa (q1, q0, q1, q0, CNST_LIMB(0), t);

  MPN_INCR_U (qp+1, n-1, q1);

  qp[0] = q0;
  return u0;
}

#elif DIV_QR_1N_METHOD == 4

mp_limb_t
mpn_div_qr_1n_pi1 (mp_ptr qp, mp_srcptr up, mp_size_t n, mp_limb_t u1,
                   mp_limb_t d, mp_limb_t dinv)
{
  mp_limb_t B2;
  mp_limb_t u2, u0;
  mp_limb_t q0, q1;
  mp_limb_t p0, p1;
  mp_limb_t B2d0, B2d1;
  mp_limb_t t;
  mp_size_t j;

  ASSERT (d & GMP_LIMB_HIGHBIT);
  ASSERT (n > 0);
  ASSERT (u1 < d);

  if (n == 1)
    {
      udiv_qrnnd_preinv (qp[0], u1, u1, up[0], d, dinv);
      return u1;
    }

  /* FIXME: Could be precomputed */
  B2 = -d*dinv;
  /* B2 * (B-d) */
  umul_ppmm (B2d1, B2d0, B2, -d);
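  /* As a limb value, -d equals B - d, so (B2d1,B2d0) is the two-limb product
     B2 * (B - d); the loop below adds it in masked by u2, which is the
     conditional |B2(B-d)| row in the diagram further down.  */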

  umul_ppmm (q1, q0, dinv, u1);
  umul_ppmm (p1, p0, B2, u1);
  q1 += u1;
  ASSERT (q1 >= u1);

  add_mssaaaa (u2, u1, u0, up[n-1], up[n-2], p1, p0);

  /* After read of up[n-1], to allow qp == up. */
  qp[n-1] = q1 - u2;

  /* FIXME: Keep q1 in a variable between iterations, to reduce number
     of memory accesses. */
  for (j = n-2; j-- > 0; )
    {
      mp_limb_t q2, cy;
      mp_limb_t t1, t0;

      /* Additions for the q update.  *After* u1 -= u2 & d adjustment.
       *        +-------+
       *        |u1 * v |
       *        +---+---+
       *        | u1|
       *        +---+
       *        | 1 |  (conditional on {u1, u0} carry)
       *        +---+
       * +      | q0|
       *   -+---+---+---+
       *    | q2| q1| q0|
       *    +---+---+---+
       *
       * Additions for the u update.  *Before* u1 -= u2 & d adjustment.
       *        +-------+
       *        |u1 * B2|
       *        +---+---+
       *        |u0 |u-1|
       *        +---+---+
       *      + |B2(B-d)|  (conditional on u2)
       *   -+---+---+---+
       *    |u2 |u1 | u0|
       *    +---+---+---+
       *
       */
      /* Multiply with unadjusted u1, to shorten critical path. */
      umul_ppmm (p1, p0, u1, B2);
      u1 -= (d & u2);
      ADDC_LIMB (q2, q1, u1, q0);
      umul_ppmm (t1, t0, u1, dinv);

      add_mssaaaa (cy, u1, u0, u0, up[j], u2 & B2d1, u2 & B2d0);
      add_mssaaaa (u2, u1, u0, u1, u0, p1, p0);
      u2 += cy;
      ASSERT(-u2 <= 1);
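      /* The two carries cannot both be set, so u2 is again 0 or ~0 (which is
         what the ASSERT above checks).  It is added into the quotient below
         (t1 - u2 adds one when u2 is set) and consumed as a mask by
         u1 -= (d & u2) at the start of the next iteration and by
         u1 -= u2 & d after the loop.  */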

      /* t1 <= B-2, so u2 can be added in without overflow. */
      add_ssaaaa (q2, q1, q2, q1, CNST_LIMB(0), t1 - u2);
      q0 = t0;

      /* Final q update */
      qp[j+1] = q1;
      MPN_INCR_U (qp+j+2, n-j-2, q2);
    }
  u1 -= u2 & d;

  q1 = (u1 >= d);
  u1 -= (-q1) & d;

  udiv_qrnnd_preinv (t, u0, u1, u0, d, dinv);
  add_ssaaaa (q1, q0, q1, q0, CNST_LIMB(0), t);

  MPN_INCR_U (qp+1, n-1, q1);

  qp[0] = q0;
  return u0;
}
#else
#error Unknown DIV_QR_1N_METHOD
#endif