/* Common vector helpers and macros for IBM z13 and later

Copyright 2021 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */

#ifndef __S390_64_Z13_COMMON_VEC_H
#define __S390_64_Z13_COMMON_VEC_H

#include <unistd.h>
#include <vecintrin.h>

/*
 * Vector intrinsics use vector element types that kind-of make sense for the
 * specific operation (e.g., vec_permi permutes doublewords). To use VRs
 * interchangeably with different intrinsics, typedef the two variants and wrap
 * them in a union.
 */
#define VLEN_BYTES 16
typedef unsigned long long v2di __attribute__ ((vector_size (VLEN_BYTES)));
typedef unsigned char v16qi __attribute__ ((vector_size (VLEN_BYTES)));

/*
 * The Z vector intrinsics use vectors with different element types (e.g.,
 * v16qi for the 128-bit adds and v2di for vec_permi).
 */
union vec
{
  v2di dw;
  v16qi sw;
};

typedef union vec vec_t;
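
/*
 * For illustration only: a value held in one VR can feed both kinds of
 * intrinsics through the two union members, e.g. (a, b and sum being
 * placeholder vec_t variables)
 *
 *   sum.sw = vec_add_u128 (a.sw, b.sw);      128-bit add wants v16qi
 *   sum.dw = vec_permi (sum.dw, sum.dw, 2);  vec_permi wants v2di
 */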

/*
 * single-instruction combine of two GPRs into a VR
 */
static inline v2di
vec_load_2di_as_pair (unsigned long a, unsigned long b)
{
  v2di res;
  __asm__("vlvgp\t%0,%1,%2" : "=v"(res) : "r"(a), "r"(b));
  return res;
}
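
/*
 * Typical use: combine two limbs that already sit in GPRs into one VR
 * (doubleword element 0 receives the first argument, element 1 the second)
 * without a round trip through memory.  The names below are placeholders:
 *
 *   vec_t carry;
 *   carry.dw = vec_load_2di_as_pair (hi_limb, lo_limb);
 */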

/*
 * 64x64 mult where caller needs to care about proper register allocation:
 * multiply xl with m1, treating both as unsigned, and place the result in
 * xh:xl.
 * mlgr operates on register pairs, so xh must be an even gpr followed by xl
 */
#define s390_umul_ppmm(xh, xl, m1) \
  do \
    { \
      asm("mlgr\t%0,%3" : "=r"(xh), "=r"(xl) : "%1"(xl), "r"(m1)); \
    } \
  while (0);
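
/*
 * Because mlgr writes an even/odd register pair, callers usually pin the
 * operands with explicit register variables; r10/r11 below are only an
 * example of such a pair:
 *
 *   register mp_limb_t hi asm ("r10");
 *   register mp_limb_t lo asm ("r11");
 *   lo = u;
 *   s390_umul_ppmm (hi, lo, v);    now hi:lo holds the 128-bit product u * v
 */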

/*
 * two 64x64 multiplications, scheduled so that they will dispatch and issue to
 * different sides: each mlgr is dispatched alone in an instruction group and
 * subsequent groups will issue on different execution sides.
 * there is a variant where both products use the same multiplicand and one
 * that uses two different multiplicands. constraints from s390_umul_ppmm apply
 * here.
 */
#define s390_double_umul_ppmm(X0H, X0L, X1H, X1L, MX) \
  do \
    { \
      asm("mlgr\t%[x0h],%[mx]\n\t" \
          "mlgr\t%[x1h],%[mx]" \
          : [x0h] "=&r"(X0H), [x0l] "=&r"(X0L), [x1h] "=r"(X1H), \
            [x1l] "=r"(X1L) \
          : "[x0l]"(X0L), "[x1l]"(X1L), [mx] "r"(MX)); \
    } \
  while (0);

#define s390_double_umul_ppmm_distinct(X0H, X0L, X1H, X1L, MX0, MX1) \
  do \
    { \
      asm("mlgr\t%[x0h],%[mx0]\n\t" \
          "mlgr\t%[x1h],%[mx1]" \
          : [x0h] "=&r"(X0H), [x0l] "=&r"(X0L), [x1h] "=r"(X1H), \
            [x1l] "=r"(X1L) \
          : "[x0l]"(X0L), "[x1l]"(X1L), [mx0] "r"(MX0), [mx1] "r"(MX1)); \
    } \
  while (0);
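
/*
 * Sketch of computing two independent products of the same multiplicand;
 * the register numbers are purely illustrative, but each low half must live
 * in the odd register that follows its high half:
 *
 *   register mp_limb_t p0h asm ("r10");
 *   register mp_limb_t p0l asm ("r11");
 *   register mp_limb_t p1h asm ("r12");
 *   register mp_limb_t p1l asm ("r13");
 *   p0l = up[0];
 *   p1l = up[1];
 *   s390_double_umul_ppmm (p0h, p0l, p1h, p1l, v);
 *   so that p0h:p0l = up[0] * v and p1h:p1l = up[1] * v.
 */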

/*
 * Load one 64-bit limb into a GPR with an explicit lg, either from
 * BASE + OFFSET (ASM_LOADGPR_BASE) or from BASE + INDEX + OFFSET
 * (ASM_LOADGPR).
 */
#define ASM_LOADGPR_BASE(DST, BASE, OFFSET) \
  asm volatile("lg\t%[r],%[off](%[b])" \
               : [r] "=r"(DST) \
               : [b] "a"(BASE), [off] "L"(OFFSET) \
               : "memory");

#define ASM_LOADGPR(DST, BASE, INDEX, OFFSET) \
  asm volatile("lg\t%[r],%[off](%[b],%[x])" \
               : [r] "=r"(DST) \
               : [b] "a"(BASE), [x] "a"(INDEX), [off] "L"(OFFSET) \
               : "memory");
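
/*
 * E.g., fetching two consecutive limbs of an operand with explicit loads
 * (up is an illustrative source pointer; offsets are in bytes):
 *
 *   mp_limb_t u0, u1;
 *   ASM_LOADGPR_BASE (u0, up, 0);
 *   ASM_LOADGPR_BASE (u1, up, 8);
 */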

/*
 * Load a vector register from memory and swap the two 64-bit doubleword
 * elements.
 */
static inline vec_t
vec_load_elements_reversed_idx (mp_limb_t const *base, ssize_t const index,
                                ssize_t const offset)
{
  vec_t res;
  char *ptr = (char *)base;

  res.sw = *(v16qi *)(ptr + index + offset);
  res.dw = vec_permi (res.dw, res.dw, 2);

  return res;
}

static inline vec_t
vec_load_elements_reversed (mp_limb_t const *base, ssize_t const offset)
{
  return vec_load_elements_reversed_idx (base, 0, offset);
}
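
/*
 * After such a load, the limb at the lower address sits in doubleword
 * element 1 and the following limb in element 0.  A hypothetical caller
 * reading the limb pair rp[0], rp[1]:
 *
 *   vec_t lo_pair = vec_load_elements_reversed (rp, 0);
 */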

/*
 * Store a vector register to memory and swap the two 64-bit doubleword
 * elements.
 */
static inline void
vec_store_elements_reversed_idx (mp_limb_t *base, ssize_t const index,
                                 ssize_t const offset, vec_t vec)
{
  char *ptr = (char *)base;

  vec.dw = vec_permi (vec.dw, vec.dw, 2);
  *(v16qi *)(ptr + index + offset) = vec.sw;
}

static inline void
vec_store_elements_reversed (mp_limb_t *base, ssize_t const offset, vec_t vec)
{
  vec_store_elements_reversed_idx (base, 0, offset, vec);
}
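
/*
 * Together with the reversed loads above this gives a read-modify-write of a
 * limb pair that keeps the element order consistent; rp and carry are
 * placeholders for a caller's pointer and 128-bit addend:
 *
 *   vec_t t = vec_load_elements_reversed (rp, 0);
 *   t.sw = vec_add_u128 (t.sw, carry.sw);
 *   vec_store_elements_reversed (rp, 0, t);
 */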

/*
 * Set VEC to all zeroes with a single vzero.
 */
#define ASM_VZERO(VEC) \
  do \
    { \
      asm("vzero\t%[vec]" : [vec] "=v"(VEC)); \
    } \
  while (0)

#endif