/* mpn_mod_1_1p (ap, n, b, cps)
   Divide (ap,,n) by b.  Return the single-limb remainder.

   Contributed to the GNU project by Torbjorn Granlund and Niels Möller.
   Based on a suggestion by Peter L. Montgomery.

   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.

Copyright 2008-2011, 2013 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */

#include "gmp-impl.h"
#include "longlong.h"

#ifndef MOD_1_1P_METHOD
# define MOD_1_1P_METHOD 1    /* need to make sure this is 2 for asm testing */
#endif

/* Define some longlong.h-style macros, but for wider operations.
 * add_mssaaaa is like longlong.h's add_ssaaaa, but also generates
 * carry out, in the form of a mask. */

#if defined (__GNUC__) && ! defined (NO_ASM)

#if HAVE_HOST_CPU_FAMILY_x86 && W_TYPE_SIZE == 32
#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0) \
  __asm__ ( "add %6, %k2\n\t" \
            "adc %4, %k1\n\t" \
            "sbb %k0, %k0" \
            : "=r" (m), "=r" (s1), "=&r" (s0) \
            : "1" ((USItype)(a1)), "g" ((USItype)(b1)), \
              "%2" ((USItype)(a0)), "g" ((USItype)(b0)))
#endif

#if HAVE_HOST_CPU_FAMILY_x86_64 && W_TYPE_SIZE == 64
#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0) \
  __asm__ ( "add %6, %q2\n\t" \
            "adc %4, %q1\n\t" \
            "sbb %q0, %q0" \
            : "=r" (m), "=r" (s1), "=&r" (s0) \
            : "1" ((UDItype)(a1)), "rme" ((UDItype)(b1)), \
              "%2" ((UDItype)(a0)), "rme" ((UDItype)(b0)))
#endif

#if defined (__sparc__) && W_TYPE_SIZE == 32
#define add_mssaaaa(m, sh, sl, ah, al, bh, bl) \
  __asm__ ( "addcc %r5, %6, %2\n\t" \
            "addxcc %r3, %4, %1\n\t" \
            "subx %%g0, %%g0, %0" \
            : "=r" (m), "=r" (sh), "=&r" (sl) \
            : "rJ" (ah), "rI" (bh), "%rJ" (al), "rI" (bl) \
            __CLOBBER_CC)
#endif

#if defined (__sparc__) && W_TYPE_SIZE == 64
#define add_mssaaaa(m, sh, sl, ah, al, bh, bl) \
  __asm__ ( "addcc %r5, %6, %2\n\t" \
            "addccc %r7, %8, %%g0\n\t" \
            "addccc %r3, %4, %1\n\t" \
            "clr %0\n\t" \
            "movcs %%xcc, -1, %0" \
            : "=r" (m), "=r" (sh), "=&r" (sl) \
            : "rJ" (ah), "rI" (bh), "%rJ" (al), "rI" (bl), \
              "rJ" ((al) >> 32), "rI" ((bl) >> 32) \
            __CLOBBER_CC)
#if __VIS__ >= 0x300
#undef add_mssaaaa
#define add_mssaaaa(m, sh, sl, ah, al, bh, bl) \
  __asm__ ( "addcc %r5, %6, %2\n\t" \
            "addxccc %r3, %4, %1\n\t" \
            "clr %0\n\t" \
            "movcs %%xcc, -1, %0" \
            : "=r" (m), "=r" (sh), "=&r" (sl) \
            : "rJ" (ah), "rI" (bh), "%rJ" (al), "rI" (bl) \
            __CLOBBER_CC)
#endif
#endif

#if HAVE_HOST_CPU_FAMILY_powerpc && !defined (_LONG_LONG_LIMB)
/* This works fine for 32-bit and 64-bit limbs, except for 64-bit limbs with a
   processor running in 32-bit mode, since the carry flag then gets the 32-bit
   carry.  */
#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0) \
  __asm__ ( "add%I6c %2, %5, %6\n\t" \
            "adde %1, %3, %4\n\t" \
            "subfe %0, %0, %0\n\t" \
            "nor %0, %0, %0" \
            : "=r" (m), "=r" (s1), "=&r" (s0) \
            : "r" (a1), "r" (b1), "%r" (a0), "rI" (b0) \
            __CLOBBER_CC)
#endif

#if defined (__s390x__) && W_TYPE_SIZE == 64
#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0) \
  __asm__ ( "algr %2, %6\n\t" \
            "alcgr %1, %4\n\t" \
            "lghi %0, 0\n\t" \
            "alcgr %0, %0\n\t" \
            "lcgr %0, %0" \
            : "=r" (m), "=r" (s1), "=&r" (s0) \
            : "1" ((UDItype)(a1)), "r" ((UDItype)(b1)), \
              "%2" ((UDItype)(a0)), "r" ((UDItype)(b0)) __CLOBBER_CC)
#endif

#if defined (__arm__) && !defined (__thumb__) && W_TYPE_SIZE == 32
#define add_mssaaaa(m, sh, sl, ah, al, bh, bl) \
  __asm__ ( "adds %2, %5, %6\n\t" \
            "adcs %1, %3, %4\n\t" \
            "movcc %0, #0\n\t" \
            "movcs %0, #-1" \
            : "=r" (m), "=r" (sh), "=&r" (sl) \
            : "r" (ah), "rI" (bh), "%r" (al), "rI" (bl) __CLOBBER_CC)
#endif

#if defined (__aarch64__) && W_TYPE_SIZE == 64
#define add_mssaaaa(m, sh, sl, ah, al, bh, bl) \
  __asm__ ( "adds %2, %x5, %6\n\t" \
            "adcs %1, %x3, %x4\n\t" \
            "csinv %0, xzr, xzr, cc\n\t" \
            : "=r" (m), "=r" (sh), "=&r" (sl) \
            : "rZ" (ah), "rZ" (bh), "%rZ" (al), "rI" (bl) __CLOBBER_CC)
#endif
#endif /* defined (__GNUC__) */

#ifndef add_mssaaaa
#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0) \
  do { \
    UWtype __s0, __s1, __c0, __c1; \
    __s0 = (a0) + (b0); \
    __s1 = (a1) + (b1); \
    __c0 = __s0 < (a0); \
    __c1 = __s1 < (a1); \
    (s0) = __s0; \
    __s1 = __s1 + __c0; \
    (s1) = __s1; \
    (m) = - (__c1 + (__s1 < __c0)); \
  } while (0)
#endif
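
/* In other words, add_mssaaaa computes the two-limb sum <s1,s0> = <a1,a0> +
   <b1,b0> and delivers the carry out of the high limb as a mask in m: zero
   when there is no carry, all ones when there is.  For example, writing B
   for the limb base (illustrative values only),

     <a1,a0> = <1, B-1>,   <b1,b0> = <0, 1>  gives  <s1,s0> = <2, 0>, m = 0
     <a1,a0> = <B-1, B-1>, <b1,b0> = <0, 1>  gives  <s1,s0> = <0, 0>, m = ~0

   The mask form lets the code below fold a carry into the next step with an
   AND instead of a branch.  */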

#if MOD_1_1P_METHOD == 1
void
mpn_mod_1_1p_cps (mp_limb_t cps[4], mp_limb_t b)
{
  mp_limb_t bi;
  mp_limb_t B1modb, B2modb;
  int cnt;

  count_leading_zeros (cnt, b);

  b <<= cnt;
  invert_limb (bi, b);

  cps[0] = bi;
  cps[1] = cnt;

  B1modb = -b;
  if (LIKELY (cnt != 0))
    B1modb *= ((bi >> (GMP_LIMB_BITS-cnt)) | (CNST_LIMB(1) << cnt));
  ASSERT (B1modb <= b);    /* NB: not fully reduced mod b */
  cps[2] = B1modb >> cnt;

  /* In the normalized case, this can be simplified to
   *
   *   B2modb = - b * bi;
   *   ASSERT (B2modb <= b);  // NB: equality iff b = B/2
   */
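  /* (MOD_1_1P_METHOD == 2 below relies on exactly this simplification for
     cps[3], since it keeps B2modb for the shifted divisor without scaling
     it back down by cnt.)  */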
  udiv_rnnd_preinv (B2modb, B1modb, CNST_LIMB(0), b, bi);
  cps[3] = B2modb >> cnt;
}
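
/* A sketch of the reduction used below: a two-limb accumulator <rh,rl> is
   maintained, and for each further input limb the quantity
   rh*B^2 + rl*B + ap[i] is replaced by rh*B2modb + rl*B1modb + ap[i], which
   is congruent to it modulo the divisor and again fits in two limbs.  The
   shift count cnt is compensated for only in the final udiv_rnnd_preinv
   step.  */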

mp_limb_t
mpn_mod_1_1p (mp_srcptr ap, mp_size_t n, mp_limb_t b, const mp_limb_t bmodb[4])
{
  mp_limb_t rh, rl, bi, ph, pl, r;
  mp_limb_t B1modb, B2modb;
  mp_size_t i;
  int cnt;
  mp_limb_t mask;

  ASSERT (n >= 2);    /* fix tuneup.c if this is changed */

  B1modb = bmodb[2];
  B2modb = bmodb[3];

  rl = ap[n - 1];
  umul_ppmm (ph, pl, rl, B1modb);
  add_ssaaaa (rh, rl, ph, pl, CNST_LIMB(0), ap[n - 2]);

  for (i = n - 3; i >= 0; i -= 1)
    {
      /* rr = ap[i]                  < B
            + LO(rr) * (B mod b)     <= (B-1)(b-1)
            + HI(rr) * (B^2 mod b)   <= (B-1)(b-1)
      */
      umul_ppmm (ph, pl, rl, B1modb);
      add_ssaaaa (ph, pl, ph, pl, CNST_LIMB(0), ap[i]);

      umul_ppmm (rh, rl, rh, B2modb);
      add_ssaaaa (rh, rl, rh, rl, ph, pl);
    }

  cnt = bmodb[1];
  bi = bmodb[0];

  if (LIKELY (cnt != 0))
    rh = (rh << cnt) | (rl >> (GMP_LIMB_BITS - cnt));

  mask = -(mp_limb_t) (rh >= b);
  rh -= mask & b;

  udiv_rnnd_preinv (r, rh, rl << cnt, b, bi);

  return r >> cnt;
}
#endif /* MOD_1_1P_METHOD == 1 */
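
/* Illustrative calling sequence (a sketch; the names `pre' and `r' and the
   call site are hypothetical, not part of this file): mpn_mod_1_1p_cps is
   run once per divisor b, and mpn_mod_1_1p then expects n >= 2 and the
   divisor already shifted into normalized position:

     mp_limb_t pre[4], r;
     mpn_mod_1_1p_cps (pre, b);
     r = mpn_mod_1_1p (ap, n, b << pre[1], pre);

   Here r is the remainder of (ap,,n) modulo the original, unshifted b; this
   matches how these functions are reached from the mpn_mod_1 dispatch
   code.  */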

#if MOD_1_1P_METHOD == 2
void
mpn_mod_1_1p_cps (mp_limb_t cps[4], mp_limb_t b)
{
  mp_limb_t bi;
  mp_limb_t B2modb;
  int cnt;

  count_leading_zeros (cnt, b);

  b <<= cnt;
  invert_limb (bi, b);

  cps[0] = bi;
  cps[1] = cnt;

  if (LIKELY (cnt != 0))
    {
      mp_limb_t B1modb = -b;
      B1modb *= ((bi >> (GMP_LIMB_BITS-cnt)) | (CNST_LIMB(1) << cnt));
      ASSERT (B1modb <= b);    /* NB: not fully reduced mod b */
      cps[2] = B1modb >> cnt;
    }
  B2modb = - b * bi;
  ASSERT (B2modb <= b);    /* NB: equality iff b = B/2 */
  cps[3] = B2modb;
}
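
/* Note that the cps[] layout is method specific: cps[3] above is stored
   without the >> cnt scaling used by MOD_1_1P_METHOD == 1, and cps[2] is
   left unset when cnt == 0.  A table built by one method's mpn_mod_1_1p_cps
   is therefore only meaningful to the matching mpn_mod_1_1p.  */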

mp_limb_t
mpn_mod_1_1p (mp_srcptr ap, mp_size_t n, mp_limb_t b, const mp_limb_t bmodb[4])
{
  int cnt;
  mp_limb_t bi, B1modb;
  mp_limb_t r0, r1;
  mp_limb_t r;

  ASSERT (n >= 2);    /* fix tuneup.c if this is changed */

  r0 = ap[n-2];
  r1 = ap[n-1];

  if (n > 2)
    {
      mp_limb_t B2modb, B2mb;
      mp_limb_t p0, p1;
      mp_limb_t r2;
      mp_size_t j;

      B2modb = bmodb[3];
      B2mb = B2modb - b;

      umul_ppmm (p1, p0, r1, B2modb);
      add_mssaaaa (r2, r1, r0, r0, ap[n-3], p1, p0);

      for (j = n-4; j >= 0; j--)
        {
          mp_limb_t cy;
          /* mp_limb_t t = r0 + B2mb; */
          umul_ppmm (p1, p0, r1, B2modb);

          ADDC_LIMB (cy, r0, r0, r2 & B2modb);
          /* Alternative, for cmov: if (cy) r0 = t; */
          r0 -= (-cy) & b;
          add_mssaaaa (r2, r1, r0, r0, ap[j], p1, p0);
        }

      r1 -= (r2 & b);
    }

  cnt = bmodb[1];

  if (LIKELY (cnt != 0))
    {
      mp_limb_t t;
      mp_limb_t B1modb = bmodb[2];

      umul_ppmm (r1, t, r1, B1modb);
      r0 += t;
      r1 += (r0 < t);

      /* Normalize */
      r1 = (r1 << cnt) | (r0 >> (GMP_LIMB_BITS - cnt));
      r0 <<= cnt;

      /* NOTE: Might get r1 == b here, but udiv_rnnd_preinv allows that. */
    }
  else
    {
      mp_limb_t mask = -(mp_limb_t) (r1 >= b);
      r1 -= mask & b;
    }

  bi = bmodb[0];

  udiv_rnnd_preinv (r, r1, r0, b, bi);
  return r >> cnt;
}
#endif /* MOD_1_1P_METHOD == 2 */