/* Addmul_2 / mul_2 for IBM z13 or later

Copyright 2021 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */

#include "gmp-impl.h"

#include "s390_64/z13/common-vec.h"

#undef FUNCNAME

#ifdef DO_INLINE
#  ifdef OPERATION_addmul_2
#    define ADD
#    define FUNCNAME inline_addmul_2
#  elif defined(OPERATION_mul_2)
#    define FUNCNAME inline_mul_2
#  else
#    error Missing define for operation to perform
#  endif
#else
#  ifdef OPERATION_addmul_2
#    define ADD
#    define FUNCNAME mpn_addmul_2
#  elif defined(OPERATION_mul_2)
#    define FUNCNAME mpn_mul_2
#  else
#    error Missing define for operation to perform
#  endif
#endif

#ifdef DO_INLINE
static inline mp_limb_t
FUNCNAME (mp_limb_t *rp, const mp_limb_t *up, mp_size_t n, const mp_limb_t *vp)
    __attribute__ ((always_inline));

static inline
#endif
mp_limb_t
FUNCNAME (mp_limb_t *rp, const mp_limb_t *up, mp_size_t n,
          const mp_limb_t *vp)
{

  /* Combine 64x64 multiplication into GPR pairs (MLGR) with 128-bit adds in
     VRs (using each VR as a single 128-bit accumulator).
     The inner loop is unrolled to four limbs, with two blocks of four
     multiplications each.  Since MLGR operates on even/odd GPR pairs, pin
     the products appropriately. */
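
  /* For reference only, a portable sketch of what mul_2/addmul_2 compute
   * (assuming the usual GMP convention that the n+1 least significant result
   * limbs go to {rp, n+1} and the most significant limb is returned; the
   * "ref_" name is illustrative and this code is not part of the build):
   *
   *   mp_limb_t
   *   ref_aormul_2 (mp_limb_t *rp, const mp_limb_t *up, mp_size_t n,
   *                 const mp_limb_t *vp)
   *   {
   *     mp_limb_t c_lo = 0, c_hi = 0;    // two-limb carry between columns
   *     for (mp_size_t i = 0; i < n; i++)
   *       {
   *         unsigned __int128 p0 = (unsigned __int128) up[i] * vp[0];
   *         unsigned __int128 p1 = (unsigned __int128) up[i] * vp[1];
   *         unsigned __int128 s = p0 + c_lo;   // cannot overflow 128 bits
   * #ifdef ADD
   *         s += rp[i];                        // still cannot overflow
   * #endif
   *         rp[i] = (mp_limb_t) s;
   *         // carry into the next two columns: high limb of s, plus p1
   *         unsigned __int128 c = (s >> 64) + p1 + c_hi;
   *         c_lo = (mp_limb_t) c;
   *         c_hi = (mp_limb_t) (c >> 64);
   *       }
   *     rp[n] = c_lo;
   *     return c_hi;
   *   }
   */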

  register mp_limb_t p0_high asm("r0");
  register mp_limb_t p0_low asm("r1");

  register mp_limb_t p1_high asm("r8");
  register mp_limb_t p1_low asm("r9");

  register mp_limb_t p2_high asm("r6");
  register mp_limb_t p2_low asm("r7");

  register mp_limb_t p3_high asm("r10");
  register mp_limb_t p3_low asm("r11");

  vec_t carry_prod = { .dw = vec_splat_u64 (0) };
  vec_t zero = { .dw = vec_splat_u64 (0) };

  /* two carry-bits for the 128-bit VR adds - stored in VRs */
#ifdef ADD
  vec_t carry_vec0 = { .dw = vec_splat_u64 (0) };
#endif
  vec_t carry_vec1 = { .dw = vec_splat_u64 (0) };

  vec_t tmp;

  vec_t sum0, sum1;

  /* products transferred into VRs for accumulating there */
  vec_t pv0, pv3;
  vec_t pv1_low, pv1_high, pv2_low, pv2_high;
  vec_t low, middle, high;
#ifdef ADD
  vec_t rp0, rp1;
#endif

  register mp_limb_t v0 asm("r12");
  register mp_limb_t v1 asm("r5");
  v0 = vp[0];
  v1 = vp[1];

  /* The scalar multiplications compete with pointer and index increments for
   * issue ports.  Thus, increment the loop index in the middle of the loop so
   * that the operations for the next iteration's multiplications can be
   * loaded in time (looks horrible, yet helps performance) and make sure we
   * use addressing with base reg + index reg + immediate displacement
   * (so that only the single index needs incrementing, instead of multiple
   * pointers). */
#undef LOOP_ADVANCE
#define LOOP_ADVANCE (4 * sizeof (mp_limb_t))
#define IDX_OFFSET (LOOP_ADVANCE)
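
  /* For example, in the unrolled loop below the second limb of a block is
   * read at byte offset idx + (8 + IDX_OFFSET) from up, so every limb access
   * is a single base + index + displacement load and only idx has to be
   * bumped once per iteration.  */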

  register ssize_t idx = 0 - IDX_OFFSET;
#ifdef BRCTG
  ssize_t iterations = (size_t)n / 4;
#else
  ssize_t const idx_bound = n * sizeof (mp_limb_t) - IDX_OFFSET;
#endif

  /*
   * To minimize latency in the carry chain, accumulate in VRs with 128-bit
   * adds with carry in and out.  As a downside, these require two insns for
   * each add - one to calculate the sum, one to deliver the carry out.
   * To reduce the overall number of insns to execute, first add up product
   * limbs in combinations that cannot produce a carry out, and only then
   * feed the results into one (for mul) or two (for addmul) adds with carry
   * chains.
   *
   * Since (2^64-1) * (2^64-1) = (2^128-1) - 2 * (2^64-1), we can add two
   * limbs into each 128-bit product without causing carry out.
   *
   * For each block of 2 limbs * 2 limbs
   *
   *                         | u[i] * v[0]   (p2) |
   *              | u[i] * v[1]   (p0) |
   *              | u[i+1] * v[0] (p1) |
   *   | u[i+1] * v[1] (p3) |
   *   <      128 bits      ><      128 bits      >
   *
   * we can begin accumulating with "simple" carry-oblivious 128-bit adds:
   * - p0 + low limb of p1
   *      + high limb of p2
   *   and combine the resulting low limb with p2's low limb
   * - p3 + high limb of p1
   *      + high limb of the sum above
   * ... which results in two 128-bit limbs to be fed into the carry
   * chain(s).
   * Overall, that scheme saves instructions and improves performance, despite
   * slightly increasing latency between the multiplications and the carry
   * chain (yet not within the carry chain).
   */
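
  /* Concretely, with B = 2^64 each of the carry-oblivious sums above is
   * bounded by (B-1)^2 + (B-1) + (B-1) = B^2 - 1, so it always fits in
   * 128 bits without a carry out.  */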

#define LOAD_LOW_LIMB(VEC, LIMB) \
  do \
    { \
      asm("vzero\t%[vec]\n\t" \
          "vlvgg\t%[vec],%[limb],1" \
          : [vec] "=v"(VEC) \
          : [limb] "r"(LIMB)); \
    } \
  while (0)
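
/* LOAD_LOW_LIMB zeroes the target VR and places LIMB in its low doubleword
 * (element 1), so the limb can be used directly as a zero-extended 128-bit
 * addend in the vector adds below.  */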

/* for the 128-bit adds in the carry chain, to calculate a + b + carry-in we
 * need paired vec_adde_u128 (delivers sum) and vec_addec_u128 (delivers new
 * carry) */
#define ADD_UP2_CARRY_INOUT(SUMIDX, CARRYIDX, ADDEND1, ADDEND2) \
  do \
    { \
      sum##SUMIDX.sw \
          = vec_adde_u128 (ADDEND1.sw, ADDEND2.sw, carry_vec##CARRYIDX.sw); \
      carry_vec##CARRYIDX.sw \
          = vec_addec_u128 (ADDEND1.sw, ADDEND2.sw, carry_vec##CARRYIDX.sw); \
    } \
  while (0)

#define ADD_UP_CARRY_INOUT(SUMIDX, ADDEND1, ADDEND2) \
  ADD_UP2_CARRY_INOUT (SUMIDX, SUMIDX, ADDEND1, ADDEND2)
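
/* For example, ADD_UP_CARRY_INOUT (1, sum0, low) performs a full 128-bit
 * add-with-carry:
 *   sum1       = sum0 + low + carry_vec1   (carry-in, a 128-bit 0 or 1)
 *   carry_vec1 = carry-out of that same addition (again 0 or 1)
 * using one vector add for the sum and one for the carry.  */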

/* variant without carry-in for prologue */
#define ADD_UP2_CARRY_OUT(SUMIDX, CARRYIDX, ADDEND1, ADDEND2) \
  do \
    { \
      sum##SUMIDX.sw = vec_add_u128 (ADDEND1.sw, ADDEND2.sw); \
      carry_vec##CARRYIDX.sw = vec_addc_u128 (ADDEND1.sw, ADDEND2.sw); \
    } \
  while (0)

#define ADD_UP_CARRY_OUT(SUMIDX, ADDEND1, ADDEND2) \
  ADD_UP2_CARRY_OUT (SUMIDX, SUMIDX, ADDEND1, ADDEND2)

  /* prologue for 4x-unrolled main loop */
  switch ((size_t)n % 4)
    {
    case 1:
      ASM_LOADGPR_BASE (p0_low, up, 0);
      ASM_LOADGPR_BASE (p1_low, up, 0);
      s390_double_umul_ppmm_distinct (p0_high, p0_low, p1_high, p1_low, v0, v1);
      carry_prod.dw = vec_load_2di_as_pair (p1_high, p1_low);

/* gcc tries to be too clever and uses a VLR from a register that already
 * holds zero; VZERO is cheaper. */
# define NEW_CARRY(VEC, LIMB) \
  do \
    { \
      asm("vzero\t%[vec]\n\t" \
          "vlvgg\t%[vec],%[limb],1" \
          : [vec] "=v"(VEC) \
          : [limb] "r"(LIMB)); \
    } \
  while (0)

      NEW_CARRY (tmp, p0_high);

      carry_prod.sw = vec_add_u128 (carry_prod.sw, tmp.sw);
#ifdef ADD
      carry_vec1.dw[1] = __builtin_add_overflow (rp[0], p0_low, rp);
#else
      rp[0] = p0_low;
#endif
      idx += sizeof (mp_limb_t);
      break;

    case 2:
      ASM_LOADGPR_BASE (p0_low, up, 0);
      ASM_LOADGPR_BASE (p1_low, up, 8);
      ASM_LOADGPR_BASE (p2_low, up, 0);
      ASM_LOADGPR_BASE (p3_low, up, 8);

      asm(""
          : "=r"(p0_low), "=r"(p2_low)
          : "r"(p3_low), "0"(p0_low), "r"(p1_low), "1"(p2_low));
      s390_double_umul_ppmm_distinct (p0_high, p0_low, p1_high, p1_low, v1, v0);
      s390_double_umul_ppmm_distinct (p2_high, p2_low, p3_high, p3_low, v0, v1);

      pv0.dw = vec_load_2di_as_pair (p0_high, p0_low);
      LOAD_LOW_LIMB (pv1_low, p1_low);
      LOAD_LOW_LIMB (pv1_high, p1_high);
      pv0.sw = vec_add_u128 (pv0.sw, pv1_low.sw);
      LOAD_LOW_LIMB (pv2_high, p2_high);
      pv3.dw = vec_load_2di_as_pair (p3_high, p3_low);
      LOAD_LOW_LIMB (pv2_low, p2_low);
      pv3.sw = vec_add_u128 (pv3.sw, pv1_high.sw);
      middle.sw = vec_add_u128 (pv0.sw, pv2_high.sw);
      low.dw = vec_permi (middle.dw, pv2_low.dw, 3);
      middle.dw = vec_permi (zero.dw, middle.dw, 0);
      high.sw = vec_add_u128 (middle.sw, pv3.sw);
#ifdef ADD
      rp0 = vec_load_elements_reversed (rp, 0);
      ADD_UP_CARRY_OUT (0, rp0, carry_prod);
#else
      sum0 = carry_prod;
#endif
      ADD_UP_CARRY_OUT (1, sum0, low);
      vec_store_elements_reversed (rp, 0, sum1);
      carry_prod = high;

      idx += 2 * sizeof (mp_limb_t);
      break;

    case 3:
      ASM_LOADGPR_BASE (p0_low, up, 0);
      ASM_LOADGPR_BASE (p1_low, up, 0);
      s390_double_umul_ppmm_distinct (p0_high, p0_low, p1_high, p1_low, v0, v1);
      carry_prod.dw = vec_load_2di_as_pair (p1_high, p1_low);
      NEW_CARRY (tmp, p0_high);
      carry_prod.sw = vec_add_u128 (carry_prod.sw, tmp.sw);

#ifdef ADD
      carry_vec1.dw[1] = __builtin_add_overflow (rp[0], p0_low, rp);
#else
      rp[0] = p0_low;
#endif

      ASM_LOADGPR_BASE (p0_low, up, 8);
      ASM_LOADGPR_BASE (p1_low, up, 16);
      ASM_LOADGPR_BASE (p2_low, up, 8);
      ASM_LOADGPR_BASE (p3_low, up, 16);

      asm(""
          : "=r"(p0_low), "=r"(p2_low)
          : "r"(p3_low), "0"(p0_low), "r"(p1_low), "1"(p2_low));
      s390_double_umul_ppmm_distinct (p0_high, p0_low, p1_high, p1_low, v1, v0);
      s390_double_umul_ppmm_distinct (p2_high, p2_low, p3_high, p3_low, v0, v1);

      pv0.dw = vec_load_2di_as_pair (p0_high, p0_low);

      LOAD_LOW_LIMB (pv1_low, p1_low);
      LOAD_LOW_LIMB (pv1_high, p1_high);

      pv0.sw = vec_add_u128 (pv0.sw, pv1_low.sw);
      LOAD_LOW_LIMB (pv2_high, p2_high);
      pv3.dw = vec_load_2di_as_pair (p3_high, p3_low);

      LOAD_LOW_LIMB (pv2_low, p2_low);

      pv3.sw = vec_add_u128 (pv3.sw, pv1_high.sw);
      middle.sw = vec_add_u128 (pv0.sw, pv2_high.sw);

      low.dw = vec_permi (middle.dw, pv2_low.dw, 3);
      middle.dw = vec_permi (zero.dw, middle.dw, 0);
      high.sw = vec_add_u128 (middle.sw, pv3.sw);

#ifdef ADD
      vec_t rp0 = vec_load_elements_reversed (rp, 8);
      ADD_UP_CARRY_OUT (0, rp0, carry_prod);
#else
      sum0 = carry_prod;
#endif
      ADD_UP_CARRY_INOUT (1, sum0, low);

      vec_store_elements_reversed (rp, 8, sum1);

      carry_prod = high;

      idx += 3 * sizeof (mp_limb_t);
      break;
    }
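
  /* Example of the index bookkeeping: for n = 7, case 3 above has handled
   * up[0..2] and leaves idx = 3*8 - IDX_OFFSET = -8; with idx_bound =
   * 7*8 - IDX_OFFSET = 24 (or iterations = 7/4 = 1 for BRCTG), the main loop
   * below runs exactly once and covers up[3..6].  */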

  /*
   * Branch-on-count implicitly hints to the branch predictor that the branch
   * is taken, while compare-and-branch hints that it is not taken.
   * Currently, using branch-on-count has a performance advantage, but it is
   * not clear that it is generally the better choice (e.g., branch-on-count
   * requires decrementing the separate counter).  So, allow switching the
   * loop condition (by defining BRCTG) to enable either category of branch
   * instructions:
   * - idx is less than an upper bound, for compare-and-branch
   * - iteration counter greater than zero, for branch-on-count
   */
#ifdef BRCTG
  for (; iterations > 0; iterations--)
    {
#else
  while (idx < idx_bound)
    {
#endif
      /* The 64x64->128 MLGR multiplies two factors in GPRs and stores the
       * result in a GPR pair.  One of the factors is taken from the GPR pair
       * and gets overwritten.
       * To reuse factors, it turned out cheaper to load limbs multiple times
       * than to copy GPR contents.  Enforce that, and the use of addressing
       * by base reg + index reg + immediate displacement, via inline asm.
       */
      ASM_LOADGPR (p0_low, up, idx, 0 + IDX_OFFSET);
      ASM_LOADGPR (p1_low, up, idx, 8 + IDX_OFFSET);
      ASM_LOADGPR (p2_low, up, idx, 0 + IDX_OFFSET);
      ASM_LOADGPR (p3_low, up, idx, 8 + IDX_OFFSET);

      s390_double_umul_ppmm_distinct (p0_high, p0_low, p1_high, p1_low, v1, v0);

      pv0.dw = vec_load_2di_as_pair (p0_high, p0_low);

      LOAD_LOW_LIMB (pv1_low, p1_low);
      LOAD_LOW_LIMB (pv1_high, p1_high);

      s390_double_umul_ppmm_distinct (p2_high, p2_low, p3_high, p3_low, v0, v1);

      pv0.sw = vec_add_u128 (pv0.sw, pv1_low.sw);
      LOAD_LOW_LIMB (pv2_high, p2_high);
      pv3.dw = vec_load_2di_as_pair (p3_high, p3_low);

      LOAD_LOW_LIMB (pv2_low, p2_low);

      ASM_LOADGPR (p0_low, up, idx, 16 + IDX_OFFSET);
      ASM_LOADGPR (p1_low, up, idx, 24 + IDX_OFFSET);
      ASM_LOADGPR (p2_low, up, idx, 16 + IDX_OFFSET);
      ASM_LOADGPR (p3_low, up, idx, 24 + IDX_OFFSET);

      idx += LOOP_ADVANCE;

      /*
       * "barrier" to enforce scheduling the index increment before the second
       * block of multiplications.  Not required for clang.
       */
#ifndef __clang__
      asm(""
          : "=r"(idx), "=r"(p0_high), "=r"(p2_high)
          : "0"(idx), "1"(p0_high), "2"(p2_high));
#endif

      s390_double_umul_ppmm_distinct (p0_high, p0_low, p1_high, p1_low, v1, v0);
      s390_double_umul_ppmm_distinct (p2_high, p2_low, p3_high, p3_low, v0, v1);

      /*
       * "barrier" to enforce scheduling all MLGRs first, before any adding
       * up.  Note that clang produces better code without it.
       */
#ifndef __clang__
      asm(""
          : "=v"(pv0.sw), "=v"(pv3.sw)
          : "1"(pv3.sw), "0"(pv0.sw), "r"(p0_high), "r"(p2_high));
#endif

      pv3.sw = vec_add_u128 (pv3.sw, pv1_high.sw);
      middle.sw = vec_add_u128 (pv0.sw, pv2_high.sw);

      low.dw = vec_permi (middle.dw, pv2_low.dw,
                          3); /* least-significant doubleword from both vectors */
      middle.dw = vec_permi (zero.dw, middle.dw, 0);
      high.sw = vec_add_u128 (middle.sw, pv3.sw);

#ifdef ADD
      rp0 = vec_load_elements_reversed_idx (rp, idx,
                                            0 + IDX_OFFSET - LOOP_ADVANCE);
      ADD_UP_CARRY_INOUT (0, rp0, carry_prod);
#else
      sum0 = carry_prod;
#endif
      ADD_UP_CARRY_INOUT (1, sum0, low);

      vec_store_elements_reversed_idx (rp, idx, 0 + IDX_OFFSET - LOOP_ADVANCE,
                                       sum1);

      carry_prod = high;

      vec_t pv0_2, pv3_2;
      vec_t pv1_low_2, pv1_high_2, pv2_low_2, pv2_high_2;
      vec_t low_2, middle_2, high_2;
      vec_t sum2, sum3;

      pv0_2.dw = vec_load_2di_as_pair (p0_high, p0_low);
      LOAD_LOW_LIMB (pv1_low_2, p1_low);
      LOAD_LOW_LIMB (pv1_high_2, p1_high);

      pv0_2.sw = vec_add_u128 (pv0_2.sw, pv1_low_2.sw);
      LOAD_LOW_LIMB (pv2_high_2, p2_high);
      pv3_2.dw = vec_load_2di_as_pair (p3_high, p3_low);
      pv3_2.sw = vec_add_u128 (pv3_2.sw, pv1_high_2.sw);
      middle_2.sw = vec_add_u128 (pv0_2.sw, pv2_high_2.sw);

      LOAD_LOW_LIMB (pv2_low_2, p2_low);
      low_2.dw
          = vec_permi (middle_2.dw, pv2_low_2.dw,
                       3); /* least-significant doubleword from both vectors */
      middle_2.dw = vec_permi (zero.dw, middle_2.dw, 0);
      high_2.sw = vec_add_u128 (middle_2.sw, pv3_2.sw);

      /*
       * Another "barrier" to influence scheduling (this one also helps
       * clang).
       */
      asm("" : : "v"(pv0_2.sw), "r"(p2_high), "r"(p3_high), "v"(pv3_2.sw));

#ifdef ADD
      rp1 = vec_load_elements_reversed_idx (rp, idx,
                                            16 + IDX_OFFSET - LOOP_ADVANCE);
      ADD_UP2_CARRY_INOUT (2, 0, rp1, carry_prod);
#else
      sum2 = carry_prod;
#endif
      ADD_UP2_CARRY_INOUT (3, 1, sum2, low_2);

      vec_store_elements_reversed_idx (rp, idx, 16 + IDX_OFFSET - LOOP_ADVANCE,
                                       sum3);

      carry_prod = high_2;
    }

#ifdef ADD
  sum0.sw = vec_adde_u128 (carry_prod.sw, carry_vec0.sw, carry_vec1.sw);
#else
  sum0.sw = vec_add_u128 (carry_prod.sw, carry_vec1.sw);
#endif

  *(mp_ptr) (((char *)rp) + idx + 0 + IDX_OFFSET) = (mp_limb_t)sum0.dw[1];

  return (mp_limb_t)sum0.dw[0];
}
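
/* Usage sketch (illustrative only, not part of the build): the n+1 least
 * significant result limbs end up in {rp, n+1} and the most significant limb
 * is returned, so a basecase-style caller that wants the full n+2-limb
 * result typically stores the returned top limb itself, e.g.
 *
 *   rp[n + 1] = mpn_mul_2 (rp, up, n, vp);    // likewise for mpn_addmul_2
 */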