/* Addmul_1 / mul_1 for IBM z13 and later
   Contributed by Marius Hillenbrand

Copyright 2021 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */

#include "gmp-impl.h"
#include "s390_64/z13/common-vec.h"

#undef FUNCNAME

#ifdef DO_INLINE
# ifdef OPERATION_addmul_1
#  define ADD
#  define FUNCNAME inline_addmul_1
# elif defined(OPERATION_mul_1)
#  define FUNCNAME inline_mul_1
# endif

#else
# ifdef OPERATION_addmul_1
#  define ADD
#  define FUNCNAME mpn_addmul_1
# elif defined(OPERATION_mul_1)
#  define FUNCNAME mpn_mul_1
# endif
#endif
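
/* Note: when DO_INLINE is defined, this file is intended to be #included by
   another source file, which then gets a static always-inline variant of the
   routine (e.g., as a building block for a larger primitive) instead of the
   exported mpn_ entry point.  */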

#ifdef DO_INLINE
static inline mp_limb_t
FUNCNAME (mp_ptr rp, mp_srcptr s1p, mp_size_t n, mp_limb_t s2limb)
    __attribute__ ((always_inline));

static inline
#endif
mp_limb_t
FUNCNAME (mp_ptr rp, mp_srcptr s1p, mp_size_t n, mp_limb_t s2limb)
{
  ASSERT (n >= 1);
  ASSERT (MPN_SAME_OR_INCR_P (rp, s1p, n));

  /* Combine 64x64 multiplication into GPR pairs (MLGR) with 128-bit adds in
     VRs (using each VR as a single 128-bit accumulator).
     The inner loop is unrolled to four limbs, with two blocks of two
     multiplications each.  Since MLGR operates on even/odd GPR pairs, pin
     the products to the appropriate register pairs.  */
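
  /* For reference, the value computed here is the plain mul_1 / addmul_1
     recurrence sketched below (illustration only; this scalar version is not
     the code path used here):

         mp_limb_t carry = 0;
         for (mp_size_t i = 0; i < n; i++)
           {
             mp_limb_t hi, lo;
             umul_ppmm (hi, lo, s1p[i], s2limb);
             lo += carry;
             hi += (lo < carry);
     #ifdef ADD
             lo += rp[i];          // addmul_1 also adds the old rp[i]
             hi += (lo < rp[i]);
     #endif
             rp[i] = lo;
             carry = hi;           // high half becomes the next carry limb
           }
         return carry;
  */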

  /* products as GPR pairs */
  register mp_limb_t p0_high asm("r0");
  register mp_limb_t p0_low asm("r1");

  register mp_limb_t p1_high asm("r8");
  register mp_limb_t p1_low asm("r9");

  register mp_limb_t p2_high asm("r6");
  register mp_limb_t p2_low asm("r7");

  register mp_limb_t p3_high asm("r10");
  register mp_limb_t p3_low asm("r11");
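
  /* MLGR reads one factor from the odd register of its even/odd pair and
     replaces the pair with the 128-bit product, high half in the even
     register and low half in the odd one.  Hence each p*_low above serves
     both as the loaded source limb and as the low product word.  */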

  /* carry flag for 128-bit add in VR for first carry chain */
  vec_t carry_vec0 = { .dw = vec_splat_u64 (0) };
  mp_limb_t carry_limb = 0;

#ifdef ADD
  /* 2nd carry flag for 2nd carry chain with addmul */
  vec_t carry_vec1 = { .dw = vec_splat_u64 (0) };
  vec_t sum0;
  vec_t rp0_addend, rp1_addend;
  rp0_addend.dw = vec_splat_u64 (0);
  rp1_addend.dw = vec_splat_u64 (0);
#endif
  vec_t sum1;

  vec_t carry_prod = { .dw = vec_splat_u64 (0) };

  /* The scalar multiplications compete with pointer and index increments for
   * issue ports.  Thus, increment the loop index in the middle of the loop so
   * that the operands for the next iteration's multiplications can be loaded
   * in time (looks horrible, yet helps performance), and make sure we use
   * addressing with base reg + index reg + immediate displacement (so that
   * only the single index needs incrementing, instead of multiple
   * pointers).  */
#undef LOOP_ADVANCE
#undef IDX_OFFSET

#define LOOP_ADVANCE 4 * sizeof (mp_limb_t)
#define IDX_OFFSET (LOOP_ADVANCE)
  register ssize_t idx = 0 - IDX_OFFSET;

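  /* Accesses below use base reg (s1p or rp) + index reg (idx) + an immediate
     displacement of the form byte_offset + IDX_OFFSET.  Since idx is biased
     by -IDX_OFFSET, the effective address is the element's plain byte offset
     into the operand, so only idx needs incrementing per iteration.  */
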
  /*
   * Branch-on-count implicitly hints to the branch predictor that the branch
   * will be taken, while compare-and-branch hints that it will not be taken.
   * Currently, branch-on-count has a performance advantage, but it is not
   * clear that it is generally the better choice (e.g., branch-on-count
   * requires decrementing a separate counter).  So, allow switching the loop
   * condition to enable either category of branch instructions:
   * - idx is less than an upper bound, for compare-and-branch
   * - iteration counter greater than zero, for branch-on-count
   */
#define BRCTG
#ifdef BRCTG
  ssize_t iterations = (size_t)n / 4;
#else
  ssize_t const idx_bound = n * sizeof (mp_limb_t) - IDX_OFFSET;
#endif
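
  /* Either way, the main loop runs only over full four-limb blocks; the
     n % 4 leftover limbs are peeled off first by the switch below, which
     also establishes idx and the initial carries.  */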

  /* Products will be transferred into VRs before adding up.
   * See the main loop below for comments on the accumulation scheme.  */
  vec_t product0, product1, product2;

  product0.dw = vec_splat_u64 (0);

  switch ((size_t)n % 4)
    {
    case 0:
      break;

    case 1:
      idx = 1 * sizeof (mp_limb_t) - IDX_OFFSET;

      p3_low = s1p[0];
      s390_umul_ppmm (p3_high, p3_low, s2limb);

#ifdef ADD
      rp0_addend.dw[1] = rp[0];
      product0.dw[1] = p3_low;

      sum0.sw = vec_add_u128 (product0.sw, rp0_addend.sw);
      carry_vec1.dw = vec_permi (sum0.dw, sum0.dw, 0);

      rp[0] = sum0.dw[1];
#else
      rp[0] = p3_low;
#endif

      carry_limb = p3_high;
      break;

    case 2:
      p0_low = s1p[0];
      p3_low = s1p[1];
      idx = 2 * sizeof (mp_limb_t) - IDX_OFFSET;

      s390_double_umul_ppmm (p0_high, p0_low, p3_high, p3_low, s2limb);

      carry_prod.dw[0] = p3_low;

      product0.dw = vec_load_2di_as_pair (p0_high, p0_low);

      carry_limb = p3_high;

#ifdef ADD
      rp0_addend = vec_load_elements_reversed (rp, 0);
      sum0.sw = vec_add_u128 (carry_prod.sw, rp0_addend.sw);
      carry_vec0.sw = vec_addc_u128 (carry_prod.sw, rp0_addend.sw);

      sum1.sw = vec_add_u128 (sum0.sw, product0.sw);
      carry_vec1.sw = vec_addc_u128 (sum0.sw, product0.sw);
#else
      sum1.sw = vec_add_u128 (carry_prod.sw, product0.sw);
      carry_vec0.sw = vec_addc_u128 (carry_prod.sw, product0.sw);
#endif

      vec_store_elements_reversed (rp, 0, sum1);

      break;

    case 3:
      idx = 3 * sizeof (mp_limb_t) - IDX_OFFSET;

      p0_low = s1p[0];
      s390_umul_ppmm (p0_high, p0_low, s2limb);

#ifdef ADD
      rp0_addend.dw[1] = rp[0];
      product0.dw[1] = p0_low;

      sum0.sw = vec_add_u128 (product0.sw, rp0_addend.sw);
      carry_vec1.dw = vec_permi (sum0.dw, sum0.dw, 0);

      rp[0] = sum0.dw[1];
#else
      rp[0] = p0_low;
#endif
      carry_limb = p0_high;

      p1_low = s1p[1];
      p3_low = s1p[2];

      s390_double_umul_ppmm (p1_high, p1_low, p3_high, p3_low, s2limb);

      carry_prod.dw = vec_load_2di_as_pair (p3_low, carry_limb);
      product1.dw = vec_load_2di_as_pair (p1_high, p1_low);
      carry_limb = p3_high;

#ifdef ADD
      rp0_addend = vec_load_elements_reversed (rp, 8);
      sum0.sw = vec_add_u128 (carry_prod.sw, rp0_addend.sw);
      carry_vec0.sw = vec_addc_u128 (carry_prod.sw, rp0_addend.sw);

      sum1.sw = vec_adde_u128 (sum0.sw, product1.sw, carry_vec1.sw);
      carry_vec1.sw = vec_addec_u128 (sum0.sw, product1.sw, carry_vec1.sw);
#else
      sum1.sw = vec_adde_u128 (carry_prod.sw, product1.sw, carry_vec0.sw);
      carry_vec0.sw
          = vec_addec_u128 (carry_prod.sw, product1.sw, carry_vec0.sw);
#endif
      vec_store_elements_reversed (rp, 8, sum1);
      break;
    }
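
  /* At this point, idx addresses the first remaining full block of four
     limbs, and carry_limb, carry_prod and the carry_vec flags hold the
     carries produced while handling the n % 4 leftover limbs above.  */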

#ifdef BRCTG
  for (; iterations > 0; iterations--)
    {
#else
  while (idx < idx_bound)
    {
#endif
      vec_t overlap_addend0;
      vec_t overlap_addend1;

      /* The 64x64->128 MLGR multiplies two factors in GPRs and stores the
       * result in a GPR pair.  One of the factors is taken from the GPR pair
       * and overwritten.
       * To reuse factors, it turned out cheaper to load limbs multiple times
       * than to copy GPR contents.  Enforce that, and the use of base reg +
       * index reg + immediate displacement addressing, via inline asm.
       */
      ASM_LOADGPR (p0_low, s1p, idx, 0 + IDX_OFFSET);
      ASM_LOADGPR (p1_low, s1p, idx, 8 + IDX_OFFSET);
      ASM_LOADGPR (p2_low, s1p, idx, 16 + IDX_OFFSET);
      ASM_LOADGPR (p3_low, s1p, idx, 24 + IDX_OFFSET);

      /*
       * accumulate products as follows (for addmul):
       *
       *               | rp[i+3] | rp[i+2] | rp[i+1] |  rp[i]  |
       *                                   | p0_high | p0_low  |
       *                         | p1_high | p1_low  |     carry-limb in
       *               | p2_high | p2_low  |
       * c-limb out <- | p3_high | p3_low  |
       *               |  <  128-bit VR  > |  <  128-bit VR  > |
       *
       *               |  <  rp1_addend  > |  <  rp0_addend  > |
       * carry-chain 0  <-       +          <-       +      <- carry_vec0[127]
       *               |  <  product1    > |  <  product0    > |
       * carry-chain 1  <-       +          <-       +      <- carry_vec1[127]
       *               | <overlap_addend1> | <overlap_addend0> |
       *
       * note that a 128-bit add with carry in + out is built from two insns
       * - vec_adde_u128 (vacq) provides sum
       * - vec_addec_u128 (vacccq) provides the new carry bit
       */

      s390_double_umul_ppmm (p0_high, p0_low, p1_high, p1_low, s2limb);

      /*
       * "barrier" to enforce scheduling loads for all limbs and first round
       * of MLGR before anything else.
       */
      asm volatile("");

      product0.dw = vec_load_2di_as_pair (p0_high, p0_low);

#ifdef ADD
      rp0_addend = vec_load_elements_reversed_idx (rp, idx, 0 + IDX_OFFSET);
      rp1_addend = vec_load_elements_reversed_idx (rp, idx, 16 + IDX_OFFSET);
#endif
      /* Increment the loop index to unblock dependent loads of limbs for the
       * next iteration (see above at #define LOOP_ADVANCE).  */
      idx += LOOP_ADVANCE;

      s390_double_umul_ppmm (p2_high, p2_low, p3_high, p3_low, s2limb);

      overlap_addend0.dw = vec_load_2di_as_pair (p1_low, carry_limb);
      asm volatile("");

#ifdef ADD
      sum0.sw = vec_adde_u128 (product0.sw, rp0_addend.sw, carry_vec0.sw);
      sum1.sw = vec_adde_u128 (sum0.sw, overlap_addend0.sw, carry_vec1.sw);

      carry_vec0.sw
          = vec_addec_u128 (product0.sw, rp0_addend.sw, carry_vec0.sw);
      carry_vec1.sw
          = vec_addec_u128 (sum0.sw, overlap_addend0.sw, carry_vec1.sw);
#else
      sum1.sw = vec_adde_u128 (product0.sw, overlap_addend0.sw, carry_vec0.sw);
      carry_vec0.sw
          = vec_addec_u128 (product0.sw, overlap_addend0.sw, carry_vec0.sw);
#endif

      asm volatile("");
      product2.dw = vec_load_2di_as_pair (p2_high, p2_low);
      overlap_addend1.dw = vec_load_2di_as_pair (p3_low, p1_high);

      vec_t sum4;

#ifdef ADD
      vec_t sum3;
      sum3.sw = vec_adde_u128 (product2.sw, rp1_addend.sw, carry_vec0.sw);
      sum4.sw = vec_adde_u128 (sum3.sw, overlap_addend1.sw, carry_vec1.sw);

      carry_vec0.sw
          = vec_addec_u128 (product2.sw, rp1_addend.sw, carry_vec0.sw);
      carry_vec1.sw
          = vec_addec_u128 (sum3.sw, overlap_addend1.sw, carry_vec1.sw);
#else
      sum4.sw = vec_adde_u128 (product2.sw, overlap_addend1.sw, carry_vec0.sw);
      carry_vec0.sw
          = vec_addec_u128 (product2.sw, overlap_addend1.sw, carry_vec0.sw);
#endif
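
      /* idx has already been advanced for the next iteration, so the store
         displacements subtract LOOP_ADVANCE to still address this
         iteration's result words.  */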
      vec_store_elements_reversed_idx (rp, idx, IDX_OFFSET - LOOP_ADVANCE,
                                       sum1);
      vec_store_elements_reversed_idx (rp, idx, 16 + IDX_OFFSET - LOOP_ADVANCE,
                                       sum4);

      carry_limb = p3_high;
    }

#ifdef ADD
  carry_vec0.dw += carry_vec1.dw;
  carry_limb += carry_vec0.dw[1];
#else
  carry_limb += carry_vec0.dw[1];
#endif
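
  /* Fold the pending carry bits of the 128-bit carry chains (0 or 1 each,
     kept in the low doubleword of the carry vectors) into the carry limb,
     which already holds the high half of the last product.  */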

  return carry_limb;
}

#undef OPERATION_addmul_1
#undef OPERATION_mul_1
#undef FUNCNAME
#undef ADD