/* mpn_toom33_mul -- Multiply {ap,an} and {bp,bn} where an and bn are close in
   size.  Or more accurately, bn <= an < (3/2)bn.

   Contributed to the GNU project by Torbjorn Granlund.
   Additional improvements by Marco Bodrato.

   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.

Copyright 2006-2008, 2010, 2012, 2015, 2021 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */


#include "gmp-impl.h"
/* Evaluate in: -1, 0, +1, +2, +inf

  <-s--><--n--><--n-->
   ____ ______ ______
  |_a2_|___a1_|___a0_|
   |b2_|___b1_|___b0_|
   <-t-><--n--><--n-->

  v0  =  a0         * b0          #   A(0)*B(0)
  v1  = (a0+ a1+ a2)*(b0+ b1+ b2) #   A(1)*B(1)      ah  <= 2  bh <= 2
  vm1 = (a0- a1+ a2)*(b0- b1+ b2) #  A(-1)*B(-1)    |ah| <= 1 |bh| <= 1
  v2  = (a0+2a1+4a2)*(b0+2b1+4b2) #   A(2)*B(2)      ah  <= 6  bh <= 6
  vinf=          a2 *         b2  # A(inf)*B(inf)
*/
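
/* Interpolation sketch (informal; the actual operation sequence lives in
   mpn_toom_interpolate_5pts): with x = 2^(GMP_NUMB_BITS*n) the product
   C(x) = A(x)*B(x) = c4*x^4 + c3*x^3 + c2*x^2 + c1*x + c0 has degree 4,
   so the five values above determine it:

     c0 = v0
     c4 = vinf
     c2 = (v1 + vm1)/2 - c0 - c4
     c1 + c3 = (v1 - vm1)/2
     c3 = ((v2 - c0 - 4*c2 - 16*c4)/2 - (c1 + c3)) / 3
     c1 = (c1 + c3) - c3

   All divisions are exact; vm1 enters with the sign vm1_neg.  */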

#if TUNE_PROGRAM_BUILD || WANT_FAT_BINARY
#define MAYBE_mul_basecase 1
#define MAYBE_mul_toom33 1
#else
#define MAYBE_mul_basecase                                      \
  (MUL_TOOM33_THRESHOLD < 3 * MUL_TOOM22_THRESHOLD)
#define MAYBE_mul_toom33                                        \
  (MUL_TOOM44_THRESHOLD >= 3 * MUL_TOOM33_THRESHOLD)
#endif

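/* Informally: the recursive products below act on n = ceil(an/3) limbs (n+1
   at the +2 point), so when an is at least MUL_TOOM33_THRESHOLD, n can only
   drop below MUL_TOOM22_THRESHOLD if MUL_TOOM33_THRESHOLD <
   3 * MUL_TOOM22_THRESHOLD; otherwise the basecase branch is dead code.
   Likewise, n stays below MUL_TOOM33_THRESHOLD unless
   MUL_TOOM44_THRESHOLD >= 3 * MUL_TOOM33_THRESHOLD.  */
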
/* FIXME: TOOM33_MUL_N_REC is not quite right for a balanced
   multiplication at the infinity point.  We may have
   MAYBE_mul_basecase == 0, and still get s just below
   MUL_TOOM22_THRESHOLD.  If MUL_TOOM33_THRESHOLD == 7, we can even get
   s == 1 and mpn_toom22_mul will crash.
*/

#define TOOM33_MUL_N_REC(p, a, b, n, ws)                        \
  do {                                                          \
    if (MAYBE_mul_basecase                                      \
        && BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD))           \
      mpn_mul_basecase (p, a, n, b, n);                         \
    else if (! MAYBE_mul_toom33                                 \
             || BELOW_THRESHOLD (n, MUL_TOOM33_THRESHOLD))      \
      mpn_toom22_mul (p, a, n, b, n, ws);                       \
    else                                                        \
      mpn_toom33_mul (p, a, n, b, n, ws);                       \
  } while (0)

void
mpn_toom33_mul (mp_ptr pp,
                mp_srcptr ap, mp_size_t an,
                mp_srcptr bp, mp_size_t bn,
                mp_ptr scratch)
{
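  /* In fat builds, this shadows the global __gmpn_cpuvec_initialized flag,
     presumably letting the compiler drop the dispatch-initialization checks
     in the recursive mpn calls below.  */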
  const int __gmpn_cpuvec_initialized = 1;
  mp_size_t n, s, t;
  int vm1_neg;
  mp_limb_t cy, vinf0;
  mp_ptr gp;
  mp_ptr as1, asm1, as2;
  mp_ptr bs1, bsm1, bs2;

#define a0 ap
#define a1 (ap + n)
#define a2 (ap + 2*n)
#define b0 bp
#define b1 (bp + n)
#define b2 (bp + 2*n)

  n = (an + 2) / (size_t) 3;

  s = an - 2 * n;
  t = bn - 2 * n;

  ASSERT (an >= bn);

  ASSERT (0 < s && s <= n);
  ASSERT (0 < t && t <= n);
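
  /* Example: an = 10, bn = 9 gives n = 4, s = 2, t = 1; the operands split
     as 4+4+2 and 4+4+1 limbs.  */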

  as1 = scratch + 4 * n + 4;
  asm1 = scratch + 2 * n + 2;
  as2 = pp + n + 1;

  bs1 = pp;
  bsm1 = scratch + 3 * n + 3; /* we need 4n+4 <= 4n+s+t */
  bs2 = pp + 2 * n + 2;

  gp = scratch;
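
  /* Evaluation buffers, each n+1 limbs: gp, asm1, bsm1 and as1 live in
     scratch, while bs1, as2 and bs2 borrow space in pp.  The order of the
     evaluations and recursive products ensures each buffer is consumed
     before its space is overwritten.  */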

  vm1_neg = 0;

  /* Compute as1 = A(1) = a0 + a1 + a2 and asm1 = |A(-1)| = |a0 - a1 + a2|;
     the sign of A(-1) is recorded in vm1_neg.  */
  cy = mpn_add (gp, a0, n, a2, s);
#if HAVE_NATIVE_mpn_add_n_sub_n
  if (cy == 0 && mpn_cmp (gp, a1, n) < 0)
    {
      cy = mpn_add_n_sub_n (as1, asm1, a1, gp, n);
      as1[n] = cy >> 1;
      asm1[n] = 0;
      vm1_neg = 1;
    }
  else
    {
      mp_limb_t cy2;
      cy2 = mpn_add_n_sub_n (as1, asm1, gp, a1, n);
      as1[n] = cy + (cy2 >> 1);
      asm1[n] = cy - (cy2 & 1);
    }
#else
  as1[n] = cy + mpn_add_n (as1, gp, a1, n);
  if (cy == 0 && mpn_cmp (gp, a1, n) < 0)
    {
      mpn_sub_n (asm1, a1, gp, n);
      asm1[n] = 0;
      vm1_neg = 1;
    }
  else
    {
      cy -= mpn_sub_n (asm1, gp, a1, n);
      asm1[n] = cy;
    }
#endif

  /* Compute as2 = A(2) = a0 + 2*a1 + 4*a2, either as 2*(A(1) + a2) - a0 or
     by Horner's rule as 2*(2*a2 + a1) + a0, depending on the available
     primitives.  */
#if HAVE_NATIVE_mpn_rsblsh1_n
  cy = mpn_add_n (as2, a2, as1, s);
  if (s != n)
    cy = mpn_add_1 (as2 + s, as1 + s, n - s, cy);
  cy += as1[n];
  cy = 2 * cy + mpn_rsblsh1_n (as2, a0, as2, n);
#else
#if HAVE_NATIVE_mpn_addlsh1_n
  cy = mpn_addlsh1_n (as2, a1, a2, s);
  if (s != n)
    cy = mpn_add_1 (as2 + s, a1 + s, n - s, cy);
  cy = 2 * cy + mpn_addlsh1_n (as2, a0, as2, n);
#else
  cy = mpn_add_n (as2, a2, as1, s);
  if (s != n)
    cy = mpn_add_1 (as2 + s, as1 + s, n - s, cy);
  cy += as1[n];
  cy = 2 * cy + mpn_lshift (as2, as2, n, 1);
  cy -= mpn_sub_n (as2, as2, a0, n);
#endif
#endif
  as2[n] = cy;

  /* Compute bs1 = B(1) and bsm1 = |B(-1)| in the same way, folding the sign
     of B(-1) into vm1_neg.  */
  cy = mpn_add (gp, b0, n, b2, t);
#if HAVE_NATIVE_mpn_add_n_sub_n
  if (cy == 0 && mpn_cmp (gp, b1, n) < 0)
    {
      cy = mpn_add_n_sub_n (bs1, bsm1, b1, gp, n);
      bs1[n] = cy >> 1;
      bsm1[n] = 0;
      vm1_neg ^= 1;
    }
  else
    {
      mp_limb_t cy2;
      cy2 = mpn_add_n_sub_n (bs1, bsm1, gp, b1, n);
      bs1[n] = cy + (cy2 >> 1);
      bsm1[n] = cy - (cy2 & 1);
    }
#else
  bs1[n] = cy + mpn_add_n (bs1, gp, b1, n);
  if (cy == 0 && mpn_cmp (gp, b1, n) < 0)
    {
      mpn_sub_n (bsm1, b1, gp, n);
      bsm1[n] = 0;
      vm1_neg ^= 1;
    }
  else
    {
      cy -= mpn_sub_n (bsm1, gp, b1, n);
      bsm1[n] = cy;
    }
#endif

  /* Compute bs2 = B(2) = b0 + 2*b1 + 4*b2, mirroring the as2 computation.  */
#if HAVE_NATIVE_mpn_rsblsh1_n
  cy = mpn_add_n (bs2, b2, bs1, t);
  if (t != n)
    cy = mpn_add_1 (bs2 + t, bs1 + t, n - t, cy);
  cy += bs1[n];
  cy = 2 * cy + mpn_rsblsh1_n (bs2, b0, bs2, n);
#else
#if HAVE_NATIVE_mpn_addlsh1_n
  cy = mpn_addlsh1_n (bs2, b1, b2, t);
  if (t != n)
    cy = mpn_add_1 (bs2 + t, b1 + t, n - t, cy);
  cy = 2 * cy + mpn_addlsh1_n (bs2, b0, bs2, n);
#else
  cy = mpn_add_n (bs2, bs1, b2, t);
  if (t != n)
    cy = mpn_add_1 (bs2 + t, bs1 + t, n - t, cy);
  cy += bs1[n];
  cy = 2 * cy + mpn_lshift (bs2, bs2, n, 1);
  cy -= mpn_sub_n (bs2, bs2, b0, n);
#endif
#endif
  bs2[n] = cy;

  ASSERT (as1[n] <= 2);
  ASSERT (bs1[n] <= 2);
  ASSERT (asm1[n] <= 1);
  ASSERT (bsm1[n] <= 1);
  ASSERT (as2[n] <= 6);
  ASSERT (bs2[n] <= 6);

#define v0    pp                      /* 2n */
#define v1    (pp + 2 * n)            /* 2n+1 */
#define vinf  (pp + 4 * n)            /* s+t */
#define vm1   scratch                 /* 2n+1 */
#define v2    (scratch + 2 * n + 1)   /* 2n+2 */
#define scratch_out  (scratch + 5 * n + 5)

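  /* Two strategies for the pointwise products: with SMALLER_RECURSION the
     recursive multiplications cover only n limbs and the contributions of
     the top limbs (at most 1 for asm1/bsm1, at most 2 for as1/bs1) are
     added back by hand; otherwise the recursion simply runs on n+1 limbs.
     For vm1, n + (bsm1[n] | asm1[n]) picks n or n+1, since those top limbs
     are 0 or 1.  */
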
  /* vm1, 2n+1 limbs */
#ifdef SMALLER_RECURSION
  TOOM33_MUL_N_REC (vm1, asm1, bsm1, n, scratch_out);
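  /* Fix up the top-limb cross terms: writing asm1 = u + uh*x and
     bsm1 = v + vh*x with x = 2^(GMP_NUMB_BITS*n) and uh, vh in {0,1},
     asm1 * bsm1 = u*v + (uh*v + vh*u)*x + uh*vh*x^2.  The v1 fix-up
     below applies the same identity with top limbs up to 2.  */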
  cy = 0;
  if (asm1[n] != 0)
    cy = bsm1[n] + mpn_add_n (vm1 + n, vm1 + n, bsm1, n);
  if (bsm1[n] != 0)
    cy += mpn_add_n (vm1 + n, vm1 + n, asm1, n);
  vm1[2 * n] = cy;
#else
  vm1[2 * n] = 0;
  TOOM33_MUL_N_REC (vm1, asm1, bsm1, n + (bsm1[n] | asm1[n]), scratch_out);
#endif

  TOOM33_MUL_N_REC (v2, as2, bs2, n + 1, scratch_out);  /* v2, 2n+1 limbs */

  /* vinf, s+t limbs */
  if (s > t)  mpn_mul (vinf, a2, s, b2, t);
  else        TOOM33_MUL_N_REC (vinf, a2, b2, s, scratch_out);

  vinf0 = vinf[0];                      /* v1 (2n+1 limbs at pp + 2n) overlaps with this */

#ifdef SMALLER_RECURSION
  /* v1, 2n+1 limbs */
  TOOM33_MUL_N_REC (v1, as1, bs1, n, scratch_out);
  if (as1[n] == 1)
    {
      cy = bs1[n] + mpn_add_n (v1 + n, v1 + n, bs1, n);
    }
  else if (as1[n] != 0)
    {
#if HAVE_NATIVE_mpn_addlsh1_n_ip1
      cy = 2 * bs1[n] + mpn_addlsh1_n_ip1 (v1 + n, bs1, n);
#else
      cy = 2 * bs1[n] + mpn_addmul_1 (v1 + n, bs1, n, CNST_LIMB(2));
#endif
    }
  else
    cy = 0;
  if (bs1[n] == 1)
    {
      cy += mpn_add_n (v1 + n, v1 + n, as1, n);
    }
  else if (bs1[n] != 0)
    {
#if HAVE_NATIVE_mpn_addlsh1_n_ip1
      cy += mpn_addlsh1_n_ip1 (v1 + n, as1, n);
#else
      cy += mpn_addmul_1 (v1 + n, as1, n, CNST_LIMB(2));
#endif
    }
  v1[2 * n] = cy;
#else
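  /* The (n+1)-limb recursion writes 2n+2 limbs starting at pp + 2n, which
     would clobber vinf[1]; save it and restore it afterwards.  */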
  cy = vinf[1];
  TOOM33_MUL_N_REC (v1, as1, bs1, n + 1, scratch_out);
  vinf[1] = cy;
#endif

  TOOM33_MUL_N_REC (v0, ap, bp, n, scratch_out);        /* v0, 2n limbs */

  mpn_toom_interpolate_5pts (pp, v2, vm1, n, s + t, vm1_neg, vinf0);
}