1 /* AMD GCN does not use 32-lane vectors.
2 { dg-skip-if "unsuitable dimensions" { openacc_radeon_accel_selected } { "*" } { "" } } */
3
4 /* { dg-additional-options "-fopenacc-dim=32" } */
5
6 /* { dg-additional-options "-Wopenacc-parallelism" } for testing/documenting
7 aspects of that functionality. */
8
9 #include <stdio.h>
10 #include <openacc.h>
11 #include <gomp-constants.h>
12
/* Verify how the placement values stored in ARY are distributed over the
   gang, worker and vector dimensions.

   Each of the SIZE elements of ARY is decoded as three 8-bit fields:
   vector index in bits 0-7, worker index in bits 8-15 and gang index in
   bits 16-23 (the layout produced by place ()).

   GP, WP and VP select the expectation for the gang, worker and vector
   dimension respectively:
     - nonzero: each of the 32 indices must occur exactly as often as
       index 0 does;
     - zero: no index other than 0 may occur at all.

   A decoded index of 32 or more cannot be counted and is reported as a
   failure instead of being used as an array subscript.  This also catches
   elements that still hold the -1 written by clear (), whose fields all
   decode to 255.

   Returns 0 when every expectation holds, 1 otherwise.  A diagnostic is
   printed for each mismatch (only the first out-of-range element is
   reported, to keep the output bounded).  */
int check (const int *ary, int size, int gp, int wp, int vp)
{
  enum { DIM = 32 };
  int failed = 0;
  int range_reported = 0;
  int gangs[DIM] = { 0 }, workers[DIM] = { 0 }, vectors[DIM] = { 0 };

  /* Build one histogram per dimension.  */
  for (int ix = 0; ix < size; ix++)
    {
      int v = ary[ix] & 0xff;
      int w = (ary[ix] >> 8) & 0xff;
      int g = (ary[ix] >> 16) & 0xff;

      /* The fields are 8 bits wide but the histograms only have DIM
	 slots; never index past them.  */
      if (v >= DIM || w >= DIM || g >= DIM)
	{
	  failed = 1;
	  if (!range_reported)
	    {
	      range_reported = 1;
	      printf ("element %d has out-of-range placement %d/%d/%d\n",
		      ix, g, w, v);
	    }
	  continue;
	}

      vectors[v]++;
      workers[w]++;
      gangs[g]++;
    }

  /* Compare every slot against the expectation for its dimension.  */
  for (int ix = 0; ix < DIM; ix++)
    {
      if (gp)
	{
	  int expect = gangs[0];
	  if (gangs[ix] != expect)
	    {
	      failed = 1;
	      printf ("gang %d not used %d times\n", ix, expect);
	    }
	}
      else if (ix && gangs[ix])
	{
	  failed = 1;
	  printf ("gang %d unexpectedly used\n", ix);
	}

      if (wp)
	{
	  int expect = workers[0];
	  if (workers[ix] != expect)
	    {
	      failed = 1;
	      printf ("worker %d not used %d times\n", ix, expect);
	    }
	}
      else if (ix && workers[ix])
	{
	  failed = 1;
	  printf ("worker %d unexpectedly used\n", ix);
	}

      if (vp)
	{
	  int expect = vectors[0];
	  if (vectors[ix] != expect)
	    {
	      failed = 1;
	      printf ("vector %d not used %d times\n", ix, expect);
	    }
	}
      else if (ix && vectors[ix])
	{
	  failed = 1;
	  printf ("vector %d unexpectedly used\n", ix);
	}
    }

  return failed;
}
79
80 #pragma acc routine seq
81 static int __attribute__((noinline)) place ()
82 {
83 int r = 0;
84
85 int g = 0, w = 0, v = 0;
86 g = __builtin_goacc_parlevel_id (GOMP_DIM_GANG);
87 w = __builtin_goacc_parlevel_id (GOMP_DIM_WORKER);
88 v = __builtin_goacc_parlevel_id (GOMP_DIM_VECTOR);
89 r = (g << 16) | (w << 8) | v;
90
91 return r;
92 }
93
/* Overwrite the first SIZE elements of ARY with -1.  Does nothing when
   SIZE is zero or negative.  */
static void clear (int *ary, int size)
{
  for (int left = size; left > 0; left--)
    ary[left - 1] = -1;
}
101
/* Record place () for each of the SIZE elements of ARY from an `auto'
   loop nested inside an explicit one-iteration gang loop, then validate
   the recorded placements.  Returns the result of check (): 0 on
   success, nonzero on failure.  */
int vector_1 (int *ary, int size)
{
  /* Reset every element to -1 before the offloaded region writes it.  */
  clear (ary, size);

#pragma acc parallel num_workers (32) vector_length(32) copy(ary[0:size]) firstprivate (size)
  {
    /* The gang level is named explicitly on the outer loop; the inner
       loop leaves the choice of partitioning to the compiler.  */
#pragma acc loop gang
    for (int jx = 0; jx < 1; jx++)
#pragma acc loop auto
      for (int ix = 0; ix < size; ix++)
	ary[ix] = place ();
  }

  /* Expect: no gang index other than 0, and each of the 32 worker and
     vector indices seen exactly as often as index 0.  */
  return check (ary, size, 0, 1, 1);
}
117
/* Record place () for the elements of ARY from an `auto' loop of 64
   iterations nested inside an explicit worker loop of SIZE / 64
   iterations, then validate the recorded placements.  Elements beyond
   the last full group of 64 are not written and keep the -1 stored by
   clear ().  Returns the result of check (): 0 on success, nonzero on
   failure.  */
int vector_2 (int *ary, int size)
{
  /* Reset every element to -1 before the offloaded region writes it.  */
  clear (ary, size);

#pragma acc parallel num_workers (32) vector_length(32) copy(ary[0:size]) firstprivate (size)
  {
    /* The worker level is named explicitly on the outer loop; the inner
       loop leaves the choice of partitioning to the compiler.  */
#pragma acc loop worker
    for (int jx = 0; jx < size / 64; jx++)
#pragma acc loop auto
      for (int ix = 0; ix < 64; ix++)
	ary[ix + jx * 64] = place ();
  }

  /* Expect: no gang index other than 0, and each of the 32 worker and
     vector indices seen exactly as often as index 0.  */
  return check (ary, size, 0, 1, 1);
}
133
/* Record place () for the elements of ARY from a three-deep loop nest:
   an explicit one-iteration gang loop, an `auto' loop of SIZE / 64
   iterations, and an explicit vector loop of 64 iterations.  Elements
   beyond the last full group of 64 are not written and keep the -1
   stored by clear ().  Returns the result of check (): 0 on success,
   nonzero on failure.  */
int worker_1 (int *ary, int size)
{
  /* Reset every element to -1 before the offloaded region writes it.  */
  clear (ary, size);

#pragma acc parallel num_workers (32) vector_length(32) copy(ary[0:size]) firstprivate (size)
  {
    /* Gang and vector levels are named explicitly on the outermost and
       innermost loops; only the middle loop is left to the compiler.  */
#pragma acc loop gang
    for (int kx = 0; kx < 1; kx++)
#pragma acc loop auto
      for (int jx = 0; jx < size / 64; jx++)
#pragma acc loop vector
	for (int ix = 0; ix < 64; ix++)
	  ary[ix + jx * 64] = place ();
  }

  /* Expect: no gang index other than 0, and each of the 32 worker and
     vector indices seen exactly as often as index 0.  */
  return check (ary, size, 0, 1, 1);
}
151
/* Record place () for the elements of ARY from an explicit worker loop
   of 64 iterations nested inside an `auto' loop of SIZE / 64 iterations,
   in a region that requests 32 gangs, 32 workers and a vector length of
   32.  Elements beyond the last full group of 64 are not written and
   keep the -1 stored by clear ().  Returns the result of check (): 0 on
   success, nonzero on failure.

   The directive comment that follows the parallel pragma expects a
   compiler warning on the pragma's line; it refers to that line
   relatively, so it must stay immediately after the pragma.  */
int gang_1 (int *ary, int size)
{
  /* Reset every element to -1 before the offloaded region writes it.  */
  clear (ary, size);

#pragma acc parallel num_gangs (32) num_workers (32) vector_length(32) copy(ary[0:size]) firstprivate (size)
  /* { dg-warning "region is vector partitioned but does not contain vector partitioned code" "" { target *-*-* } .-1 } */
  {
#pragma acc loop auto
    for (int jx = 0; jx < size / 64; jx++)
#pragma acc loop worker
      for (int ix = 0; ix < 64; ix++)
	ary[ix + jx * 64] = place ();
  }

  /* Expect: each of the 32 gang and worker indices seen exactly as often
     as index 0, and no vector index other than 0.  */
  return check (ary, size, 1, 1, 0);
}
168
/* Record place () for the elements of ARY from three nested `auto'
   loops (SIZE / 1024 outer iterations, then 32, then 32) in a region
   that requests 32 gangs, 32 workers and a vector length of 32.
   Elements beyond the last full group of 1024 are not written and keep
   the -1 stored by clear ().  Returns the result of check (): 0 on
   success, nonzero on failure.  */
int gang_2 (int *ary, int size)
{
  /* Reset every element to -1 before the offloaded region writes it.  */
  clear (ary, size);

#pragma acc parallel num_gangs (32) num_workers (32) vector_length(32) copy(ary[0:size]) firstprivate (size)
  {
    /* No loop names a partitioning level; all three are left to the
       compiler.  */
#pragma acc loop auto
    for (int kx = 0; kx < size / (32 * 32); kx++)
#pragma acc loop auto
      for (int jx = 0; jx < 32; jx++)
#pragma acc loop auto
	for (int ix = 0; ix < 32; ix++)
	  ary[ix + jx * 32 + kx * 32 * 32] = place ();
  }

  /* Expect: each of the 32 gang, worker and vector indices seen exactly
     as often as index 0.  */
  return check (ary, size, 1, 1, 1);
}
186
/* Record place () for the elements of ARY from two nested `auto' loops
   (SIZE / 64 outer iterations, 64 inner) in a region that requests 32
   workers and a vector length of 32 but has no num_gangs clause.
   Elements beyond the last full group of 64 are not written and keep
   the -1 stored by clear ().  Returns the result of check (): 0 on
   success, nonzero on failure.  */
int gang_3 (int *ary, int size)
{
  /* Reset every element to -1 before the offloaded region writes it.  */
  clear (ary, size);

#pragma acc parallel num_workers (32) vector_length(32) copy(ary[0:size]) firstprivate (size)
  {
    /* Neither loop names a partitioning level; both are left to the
       compiler.  */
#pragma acc loop auto
    for (int jx = 0; jx < size / 64; jx++)
#pragma acc loop auto
      for (int ix = 0; ix < 64; ix++)
	ary[ix + jx * 64] = place ();
  }

  /* Expect: each of the 32 gang, worker and vector indices seen exactly
     as often as index 0.
     NOTE(review): with no num_gangs clause this presumably relies on the
     default dimensions given on the command line -- confirm.  */
  return check (ary, size, 1, 1, 1);
}
202
/* Record place () for each of the SIZE elements of ARY from a single
   `auto' loop in a region whose only dimension clause is a vector
   length of 32, then validate the recorded placements.  Returns the
   result of check (): 0 on success, nonzero on failure.  */
int gang_4 (int *ary, int size)
{
  /* Reset every element to -1 before the offloaded region writes it.  */
  clear (ary, size);

#pragma acc parallel vector_length(32) copy(ary[0:size]) firstprivate (size)
  {
    /* The only loop leaves the choice of partitioning to the compiler.  */
#pragma acc loop auto
    for (int jx = 0; jx < size; jx++)
      ary[jx] = place ();
  }

  /* Expect: each of the 32 gang and vector indices seen exactly as often
     as index 0, and no worker index other than 0.  */
  return check (ary, size, 1, 0, 1);
}
216
217 #define N (32*32*32*2)
218 int main ()
219 {
220 int ondev = 0;
221
222 #pragma acc parallel copy(ondev)
223 {
224 ondev = acc_on_device (acc_device_not_host);
225 }
226 if (!ondev)
227 return 0;
228
229 int ary[N];
230
231 if (vector_1 (ary, N))
232 return 1;
233 if (vector_2 (ary, N))
234 return 1;
235
236 if (worker_1 (ary, N))
237 return 1;
238
239 if (gang_1 (ary, N))
240 return 1;
241 if (gang_2 (ary, N))
242 return 1;
243 if (gang_3 (ary, N))
244 return 1;
245 if (gang_4 (ary, N))
246 return 1;
247
248 return 0;
249 }