1 /* Tests of reduction on loop directive. */
2
3 /* { dg-additional-options "-Wopenacc-parallelism" } for testing/documenting
4 aspects of that functionality. */
5
6 #include <assert.h>
7
8
/* Test of reduction on loop directive (gangs, non-private reduction
   variable).  Both a '+' and a '*' reduction over a gang-partitioned loop
   are compared against the same computation done sequentially on the
   host.  */

void g_np_1()
{
  int i, arr[1024], res = 0, hres = 0;

  for (i = 0; i < 1024; i++)
    arr[i] = i;

  #pragma acc parallel num_gangs(32) num_workers(32) vector_length(32)
  /* { dg-warning "region is worker partitioned but does not contain worker partitioned code" "" { target *-*-* } .-1 } */
  /* { dg-warning "region is vector partitioned but does not contain vector partitioned code" "" { target *-*-* } .-2 } */
  {
    #pragma acc loop gang reduction(+:res)
    for (i = 0; i < 1024; i++)
      res += arr[i];
  }

  for (i = 0; i < 1024; i++)
    hres += arr[i];

  assert (res == hres);

  /* Multiplicative identity for the product reduction.  */
  res = hres = 1;

  /* The product runs over arr[1] .. arr[12] rather than starting at
     arr[0]: arr[0] is 0, which would force both the device and the host
     product to 0 and make the comparison below nearly meaningless.  The
     expected value is 12! = 479001600, which fits in a 32-bit int.  */
  #pragma acc parallel num_gangs(32) num_workers(32) vector_length(32)
  /* { dg-warning "region is worker partitioned but does not contain worker partitioned code" "" { target *-*-* } .-1 } */
  /* { dg-warning "region is vector partitioned but does not contain vector partitioned code" "" { target *-*-* } .-2 } */
  {
    #pragma acc loop gang reduction(*:res)
    for (i = 1; i < 13; i++)
      res *= arr[i];
  }

  for (i = 1; i < 13; i++)
    hres *= arr[i];

  assert (res == hres);
}
49
50
/* Sum reduction on a single loop partitioned over gangs and vectors.  The
   reduction variable is not privatized on the parallel construct, so its
   final value is checked on the host after the region.  */

void gv_np_1()
{
  enum { COUNT = 1024 };
  int data[COUNT];
  int sum = 0;
  int expected = 0;
  int i;

  for (i = 0; i < COUNT; i++)
    data[i] = i;

  #pragma acc parallel num_gangs(32) num_workers(32) vector_length(32)
  /* { dg-warning "region is worker partitioned but does not contain worker partitioned code" "" { target *-*-* } .-1 } */
  {
    #pragma acc loop gang vector reduction(+:sum)
    for (i = 0; i < COUNT; i++)
      sum += data[i];
  }

  /* Sequential host reference.  */
  for (i = 0; i < COUNT; i++)
    expected = expected + data[i];

  assert (sum == expected);
}
74
75
/* Sum reduction on a single loop partitioned over gangs and workers.  The
   reduction variable is not privatized on the parallel construct, so its
   final value is checked on the host after the region.  */

void gw_np_1()
{
  enum { COUNT = 1024 };
  int data[COUNT];
  int sum = 0;
  int expected = 0;
  int i;

  for (i = 0; i < COUNT; i++)
    data[i] = i;

  #pragma acc parallel num_gangs(32) num_workers(32) vector_length(32)
  /* { dg-warning "region is vector partitioned but does not contain vector partitioned code" "" { target *-*-* } .-1 } */
  {
    #pragma acc loop gang worker reduction(+:sum)
    for (i = 0; i < COUNT; i++)
      sum += data[i];
  }

  /* Sequential host reference.  */
  for (i = 0; i < COUNT; i++)
    expected = expected + data[i];

  assert (sum == expected);
}
99
100
/* Sum reduction on a single loop partitioned over gangs, workers and
   vectors at once.  The reduction variable is not privatized on the
   parallel construct, so its final value is checked on the host after the
   region.  */

void gwv_np_1()
{
  enum { COUNT = 1024 };
  int data[COUNT];
  int sum = 0;
  int expected = 0;
  int i;

  for (i = 0; i < COUNT; i++)
    data[i] = i;

  #pragma acc parallel num_gangs(32) num_workers(32) vector_length(32)
  {
    #pragma acc loop gang worker vector reduction(+:sum)
    for (i = 0; i < COUNT; i++)
      sum += data[i];
  }

  /* Sequential host reference.  */
  for (i = 0; i < COUNT; i++)
    expected = expected + data[i];

  assert (sum == expected);
}
123
124
/* Sum reduction split over two nested loops: the outer loop is gang
   partitioned, the inner one worker/vector partitioned, and both name the
   same non-private int reduction variable.  */

void gwv_np_2()
{
  enum { ROWS = 32, COLS = 1024, TOTAL = 32768 };
  int data[TOTAL];
  int sum = 0;
  int expected = 0;
  int i, j;

  for (i = 0; i < TOTAL; i++)
    data[i] = i;

  #pragma acc parallel num_gangs(32) num_workers(32) vector_length(32)
  {
    #pragma acc loop gang reduction(+:sum)
    for (j = 0; j < ROWS; j++)
      {
        #pragma acc loop worker vector reduction(+:sum)
        for (i = 0; i < COLS; i++)
          sum += data[j * COLS + i];
      }
    /* "sum" is non-private, and is not available until after the parallel
       region.  */
  }

  /* Sequential host reference over the whole array.  */
  for (i = 0; i < TOTAL; i++)
    expected = expected + data[i];

  assert (sum == expected);
}
153
154
/* Same nested gang / worker-vector sum reduction as the int variant, but
   with a double reduction variable and the input array passed with an
   explicit copyin clause.  */

void gwv_np_3()
{
  enum { ROWS = 32, COLS = 1024, TOTAL = 32768 };
  double data[TOTAL];
  double sum = 0;
  double expected = 0;
  int i, j;

  for (i = 0; i < TOTAL; i++)
    data[i] = i;

  #pragma acc parallel num_gangs(32) num_workers(32) vector_length(32) \
                       copyin(data)
  {
    #pragma acc loop gang reduction(+:sum)
    for (j = 0; j < ROWS; j++)
      {
        #pragma acc loop worker vector reduction(+:sum)
        for (i = 0; i < COLS; i++)
          sum += data[j * COLS + i];
      }
  }

  /* Sequential host reference over the whole array.  */
  for (i = 0; i < TOTAL; i++)
    expected = expected + data[i];

  assert (sum == expected);
}
183
#if ACC_DEVICE_TYPE_nvidia
/* To avoid 'libgomp: The Nvidia accelerator has insufficient resources'.  */
#define NUM_WORKERS 28
#else
#define NUM_WORKERS 32
#endif

/* Two float reductions ('+' and 'max') carried together through a gang
   loop that contains two consecutive worker/vector loops.  The second inner
   loop walks each 1024-element row backwards, so every element contributes
   twice to the sum.  Neither reduction variable is privatized.  */

void gwv_np_4()
{
  int i, j;
  float data[32768];
  float sum = 0, peak = 0;
  float ref_sum = 0, ref_peak = 0;

  for (i = 0; i < 32768; i++)
    data[i] = i % (32768 / 64);

  #pragma acc parallel num_gangs(32) num_workers(NUM_WORKERS) vector_length(32)
  {
    #pragma acc loop gang reduction(+:sum) reduction(max:peak)
    for (j = 0; j < 32; j++)
      {
        #pragma acc loop worker vector reduction(+:sum) reduction(max:peak)
        for (i = 0; i < 1024; i++)
          {
            sum += data[j * 1024 + i];
            if (data[j * 1024 + i] > peak)
              peak = data[j * 1024 + i];
          }

        #pragma acc loop worker vector reduction(+:sum) reduction(max:peak)
        for (i = 0; i < 1024; i++)
          {
            sum += data[j * 1024 + (1023 - i)];
            if (data[j * 1024 + (1023 - i)] > peak)
              peak = data[j * 1024 + (1023 - i)];
          }
      }
  }

  /* Host reference: visit each row element once forwards and once
     backwards, accumulating in the same order as the original check.  */
  for (j = 0; j < 32; j++)
    for (i = 0; i < 1024; i++)
      {
        float fwd = data[j * 1024 + i];
        float bwd = data[j * 1024 + (1023 - i)];

        ref_sum += fwd;
        ref_sum += bwd;
        if (fwd > ref_peak)
          ref_peak = fwd;
        if (bwd > ref_peak)
          ref_peak = bwd;
      }

  /* 16777216 is 2^24.  Presumably these guards keep the values in the range
     where float represents every integer exactly, so that the equality
     comparisons below are meaningful -- TODO confirm intent.  */
  assert (ref_sum <= 16777216);
  assert (sum == ref_sum);

  assert (ref_peak <= 16777216);
  assert (peak == ref_peak);
}

#undef NUM_WORKERS
245
/* Vector-partitioned sum reduction on a variable that is private on the
   parallel construct.  Each iteration of the enclosing gang loop reduces
   one 32-element slice and stores the result in its own output slot.  */

void v_p_1()
{
  enum { SLICES = 32, WIDTH = 32, COUNT = 1024 };
  int data[COUNT], out[SLICES];
  int part = 0;
  int expected = 0;
  int i, j;

  for (i = 0; i < COUNT; i++)
    data[i] = i;

  #pragma acc parallel num_gangs(32) num_workers(32) vector_length(32) \
                       private(part) copyout(out)
  /* { dg-warning "region is worker partitioned but does not contain worker partitioned code" "" { target *-*-* } .-2 } */
  {
    #pragma acc loop gang
    for (j = 0; j < SLICES; j++)
      {
        part = 0;

        #pragma acc loop vector reduction(+:part)
        for (i = 0; i < WIDTH; i++)
          part += data[j * WIDTH + i];

        out[j] = part;
      }
  }

  /* Recompute every slice sum on the host and compare slot by slot.  */
  for (j = 0; j < SLICES; j++)
    {
      expected = 0;

      for (i = 0; i < WIDTH; i++)
        expected = expected + data[j * WIDTH + i];

      assert (out[j] == expected);
    }
}
283
284
/* Vector reduction nested inside a gang/worker-partitioned loop, with a
   private double reduction variable.  The region computes a 32x32 matrix
   product where the left operand is twice the identity matrix, so every
   output element must equal twice the matching right-operand element.  */

void v_p_2()
{
  enum { DIM = 32 };
  int i, j, k;
  double lhs[1024], rhs[1024], prod[1024], dot;

  for (j = 0; j < DIM; j++)
    for (i = 0; i < DIM; i++)
      {
        /* lhs is 2 on the diagonal and 0 elsewhere; rhs holds i + j.  */
        if (i == j)
          lhs[j * DIM + i] = 2.0;
        else
          lhs[j * DIM + i] = 0.0;
        rhs[j * DIM + i] = (double) (i + j);
      }

  #pragma acc parallel num_gangs(32) num_workers(32) vector_length(32) \
                       private(dot) copyin(lhs, rhs) copyout(prod)
  {
    #pragma acc loop gang worker
    for (k = 0; k < DIM; k++)
      for (j = 0; j < DIM; j++)
        {
          dot = 0;

          #pragma acc loop vector reduction(+:dot)
          for (i = 0; i < DIM; i++)
            dot += lhs[k * DIM + i] * rhs[i * DIM + j];

          prod[k * DIM + j] = dot;
        }
  }

  for (j = 0; j < DIM; j++)
    for (i = 0; i < DIM; i++)
      assert (prod[j * DIM + i] == (i + j) * 2);
}
321
322
/* Worker-partitioned sum reduction on a variable that is private on the
   parallel construct.  Each iteration of the enclosing gang loop reduces
   one 32-element slice and stores the result in its own output slot.  */

void w_p_1()
{
  enum { SLICES = 32, WIDTH = 32, COUNT = 1024 };
  int data[COUNT], out[SLICES];
  int part = 0;
  int expected = 0;
  int i, j;

  for (i = 0; i < COUNT; i++)
    data[i] = i;

  #pragma acc parallel num_gangs(32) num_workers(32) vector_length(32) \
                       private(part) copyout(out)
  /* { dg-warning "region is vector partitioned but does not contain vector partitioned code" "" { target *-*-* } .-2 } */
  {
    #pragma acc loop gang
    for (j = 0; j < SLICES; j++)
      {
        part = 0;

        #pragma acc loop worker reduction(+:part)
        for (i = 0; i < WIDTH; i++)
          part += data[j * WIDTH + i];

        out[j] = part;
      }
  }

  /* Recompute every slice sum on the host and compare slot by slot.  */
  for (j = 0; j < SLICES; j++)
    {
      expected = 0;

      for (i = 0; i < WIDTH; i++)
        expected = expected + data[j * WIDTH + i];

      assert (out[j] == expected);
    }
}
360
361
/* Sum reduction on one loop partitioned over workers and vectors together,
   using a variable that is private on the parallel construct.  Each
   iteration of the enclosing gang loop reduces one 32-element slice.  */

void wv_p_1()
{
  enum { SLICES = 32, WIDTH = 32, COUNT = 1024 };
  int data[COUNT], out[SLICES];
  int part = 0;
  int expected = 0;
  int i, j;

  for (i = 0; i < COUNT; i++)
    data[i] = i;

  #pragma acc parallel num_gangs(32) num_workers(32) vector_length(32) \
                       private(part) copyout(out)
  {
    #pragma acc loop gang
    for (j = 0; j < SLICES; j++)
      {
        part = 0;

        #pragma acc loop worker vector reduction(+:part)
        for (i = 0; i < WIDTH; i++)
          part += data[j * WIDTH + i];

        out[j] = part;
      }
  }

  /* Recompute every slice sum on the host and compare slot by slot.  */
  for (j = 0; j < SLICES; j++)
    {
      expected = 0;

      for (i = 0; i < WIDTH; i++)
        expected = expected + data[j * WIDTH + i];

      assert (out[j] == expected);
    }
}
398
399
/* Private reduction variable fed by two separate inner loops in sequence:
   a worker-partitioned loop running upwards and a vector-partitioned loop
   running downwards over the same 1024-element slice.  The variable starts
   at the gang-loop index rather than 0, so the initial value must survive
   both reductions.  */

void wv_p_2()
{
  enum { SLICES = 32, WIDTH = 1024, COUNT = 32768 };
  int data[COUNT], out[SLICES];
  int part = 0;
  int expected = 0;
  int i, j;

  for (i = 0; i < COUNT; i++)
    data[i] = i;

  #pragma acc parallel num_gangs(32) num_workers(32) vector_length(32) \
                       private(part) copyout(out)
  {
    #pragma acc loop gang
    for (j = 0; j < SLICES; j++)
      {
        part = j;

        #pragma acc loop worker reduction(+:part)
        for (i = 0; i < WIDTH; i++)
          part += data[j * WIDTH + i];

        #pragma acc loop vector reduction(+:part)
        for (i = WIDTH - 1; i >= 0; i--)
          part += data[j * WIDTH + i];

        out[j] = part;
      }
  }

  /* Each slice element was added twice on the device, so the host
     reference adds it doubled on top of the initial value j.  */
  for (j = 0; j < SLICES; j++)
    {
      expected = j;

      for (i = 0; i < WIDTH; i++)
        expected = expected + data[j * WIDTH + i] * 2;

      assert (out[j] == expected);
    }
}
440
441
/* Worker/vector sum reduction on a private variable, executed outside any
   gang loop (gang-redundant mode).  A following gang loop then copies the
   reduced value into one output slot per iteration, and the host checks
   that all slots hold the full sum.  */

void wv_p_3()
{
  enum { SLOTS = 32, COUNT = 1024 };
  int data[COUNT], out[SLOTS];
  int total = 0;
  int expected = 0;
  int i;

  for (i = 0; i < COUNT; i++)
    data[i] = i ^ 33;

  #pragma acc parallel num_gangs(32) num_workers(32) vector_length(32) \
                       private(total) copyin(data) copyout(out)
  {
    /* Private variables aren't initialized by default in openacc.  */
    total = 0;

    /* "total" should be available at the end of the following loop (and
       should have the same value redundantly in each gang).  */
    #pragma acc loop worker vector reduction(+:total)
    for (i = 0; i < COUNT; i++)
      total += data[i];

    #pragma acc loop gang (static: 1)
    for (i = 0; i < SLOTS; i++)
      out[i] = total;
  }

  /* Sequential host reference.  */
  for (i = 0; i < COUNT; i++)
    expected = expected + data[i];

  for (i = 0; i < SLOTS; i++)
    assert (out[i] == expected);
}
475
476
/* Run every reduction test in turn.  Each test checks its own results with
   assert, so reaching the final return means all of them passed.  */
int main()
{
  /* Non-private reduction variable.  */
  g_np_1();
  gv_np_1();
  gw_np_1();
  gwv_np_1();
  gwv_np_2();
  gwv_np_3();
  gwv_np_4();
  /* Reduction variable private on the parallel construct.  */
  v_p_1();
  v_p_2();
  w_p_1();
  wv_p_1();
  wv_p_2();
  wv_p_3();

  return 0;
}