1 /* { dg-additional-options "-Wopenacc-parallelism" } for testing/documenting
2 aspects of that functionality. */
3
4 /* Miscellaneous test cases for gang/worker/vector mode transitions. */
5
6 #include <assert.h>
7 #include <stdbool.h>
8 #include <stdlib.h>
9 #include <math.h>
10 #include <openacc.h>
11
12
13 /* Test basic vector-partitioned mode transitions. */
14
void t1()
{
  /* A scalar that is bumped twice in vector-single mode and an array whose
     elements are each bumped once by the vector-partitioned loop.  */
  int arr[32];
  int n = 0;

  for (int idx = 0; idx < 32; idx++)
    arr[idx] = 0;

#pragma acc parallel copy(n, arr) \
                     num_gangs(1) num_workers(1) vector_length(32)
  {
    int lane;

    /* Vector-single before the loop...  */
    n++;

#pragma acc loop vector
    for (lane = 0; lane < 32; lane++)
      arr[lane]++;

    /* ...and vector-single again once the loop is done.  */
    n++;
  }

  assert (n == 2);

  for (int idx = 0; idx < 32; idx++)
    assert (arr[idx] == 1);
}
38
39
40 /* Test vector-partitioned, gang-partitioned mode. */
41
void t2()
{
  int n[32], arr[1024];
  int pos;

  /* n has one counter per gang-loop iteration; arr has one 32-element row
     per gang-loop iteration.  */
  for (pos = 0; pos < 32; pos++)
    n[pos] = 0;

  for (pos = 0; pos < 1024; pos++)
    arr[pos] = 0;

#pragma acc parallel copy(n, arr) \
                     num_gangs(32) num_workers(1) vector_length(32)
  {
    int row, lane;

#pragma acc loop gang(static:*)
    for (row = 0; row < 32; row++)
      n[row]++;

#pragma acc loop gang
    for (row = 0; row < 32; row++)
#pragma acc loop vector
      for (lane = 0; lane < 32; lane++)
        arr[row * 32 + lane]++;

#pragma acc loop gang(static:*)
    for (row = 0; row < 32; row++)
      n[row]++;
  }

  /* Each counter was bumped by both gang loops.  */
  for (pos = 0; pos < 32; pos++)
    assert (n[pos] == 2);

  /* Each array element was bumped exactly once.  */
  for (pos = 0; pos < 1024; pos++)
    assert (arr[pos] == 1);
}
78
79
80 /* Test conditional vector-partitioned loops. */
81
void t3()
{
  int n[32], arr[1024];
  int pos;

  for (pos = 0; pos < 32; pos++)
    n[pos] = 0;

  for (pos = 0; pos < 1024; pos++)
    arr[pos] = 0;

#pragma acc parallel copy(n, arr) \
                     num_gangs(32) num_workers(1) vector_length(32)
  {
    int row, lane;

#pragma acc loop gang(static:*)
    for (row = 0; row < 32; row++)
      n[row]++;

    /* Even rows are incremented, odd rows are decremented; which vector
       loop runs depends on the gang-loop index.  */
#pragma acc loop gang
    for (row = 0; row < 32; row++)
      {
        if ((row % 2) == 0)
          {
#pragma acc loop vector
            for (lane = 0; lane < 32; lane++)
              arr[row * 32 + lane]++;
          }
        else
          {
#pragma acc loop vector
            for (lane = 0; lane < 32; lane++)
              arr[row * 32 + lane]--;
          }
      }

#pragma acc loop gang(static:*)
    for (row = 0; row < 32; row++)
      n[row]++;
  }

  for (pos = 0; pos < 32; pos++)
    assert (n[pos] == 2);

  for (pos = 0; pos < 1024; pos++)
    {
      /* Row number is pos / 32; even rows hold 1, odd rows hold -1.  */
      int expected = (((pos / 32) % 2) == 0) ? 1 : -1;
      assert (arr[pos] == expected);
    }
}
129
130
131 /* Test conditions inside vector-partitioned loops. */
132
void t4()
{
  int n[32], arr[1024];
  int pos;

  for (pos = 0; pos < 32; pos++)
    n[pos] = 0;

  /* Every element starts out holding its own index.  */
  for (pos = 0; pos < 1024; pos++)
    arr[pos] = pos;

#pragma acc parallel copy(n, arr) \
                     num_gangs(32) num_workers(1) vector_length(32)
  {
    int row, lane;

#pragma acc loop gang(static:*)
    for (row = 0; row < 32; row++)
      n[row]++;

    /* Only the odd-valued elements are doubled, so the condition sits
       inside the vector-partitioned loop.  */
#pragma acc loop gang
    for (row = 0; row < 32; row++)
      {
#pragma acc loop vector
        for (lane = 0; lane < 32; lane++)
          if ((arr[row * 32 + lane] % 2) != 0)
            arr[row * 32 + lane] *= 2;
      }

#pragma acc loop gang(static:*)
    for (row = 0; row < 32; row++)
      n[row]++;
  }

  for (pos = 0; pos < 32; pos++)
    assert (n[pos] == 2);

  for (pos = 0; pos < 1024; pos++)
    {
      int expected = ((pos % 2) != 0) ? pos * 2 : pos;
      assert (arr[pos] == expected);
    }
}
172
173
174 /* Test conditions inside gang-partitioned/vector-partitioned loops. */
175
void t5()
{
  int n[32], arr[1024];
  int pos;

  for (pos = 0; pos < 32; pos++)
    n[pos] = 0;

  /* Every element starts out holding its own index.  */
  for (pos = 0; pos < 1024; pos++)
    arr[pos] = pos;

#pragma acc parallel copy(n, arr) \
                     num_gangs(32) num_workers(1) vector_length(32)
  {
    int it;

#pragma acc loop gang(static:*)
    for (it = 0; it < 32; it++)
      n[it]++;

    /* A single loop partitioned across both gangs and vectors, with the
       condition in its body.  */
#pragma acc loop gang vector
    for (it = 0; it < 1024; it++)
      if ((arr[it] % 2) != 0)
        arr[it] *= 2;

#pragma acc loop gang(static:*)
    for (it = 0; it < 32; it++)
      n[it]++;
  }

  for (pos = 0; pos < 32; pos++)
    assert (n[pos] == 2);

  for (pos = 0; pos < 1024; pos++)
    {
      int expected = ((pos % 2) != 0) ? pos * 2 : pos;
      assert (arr[pos] == expected);
    }
}
211
212
213 /* Test switch containing vector-partitioned loops inside gang-partitioned
214 loops. */
215
void t6()
{
  int n[32], arr[1024];
  int pos;

  /* Selector values 0..4; the first gang loop below turns them into 1..5.  */
  for (pos = 0; pos < 32; pos++)
    n[pos] = pos % 5;

  for (pos = 0; pos < 1024; pos++)
    arr[pos] = 0;

#pragma acc parallel copy(n, arr) \
                     num_gangs(32) num_workers(1) vector_length(32)
  {
    int row, lane;

#pragma acc loop gang(static:*)
    for (row = 0; row < 32; row++)
      n[row]++;

    /* Each row adds its own selector value to all of its elements; every
       case carries its own vector-partitioned loop.  */
#pragma acc loop gang(static:*)
    for (row = 0; row < 32; row++)
      switch (n[row])
        {
        case 1:
#pragma acc loop vector
          for (lane = 0; lane < 32; lane++)
            arr[row * 32 + lane] += 1;
          break;

        case 2:
#pragma acc loop vector
          for (lane = 0; lane < 32; lane++)
            arr[row * 32 + lane] += 2;
          break;

        case 3:
#pragma acc loop vector
          for (lane = 0; lane < 32; lane++)
            arr[row * 32 + lane] += 3;
          break;

        case 4:
#pragma acc loop vector
          for (lane = 0; lane < 32; lane++)
            arr[row * 32 + lane] += 4;
          break;

        case 5:
#pragma acc loop vector
          for (lane = 0; lane < 32; lane++)
            arr[row * 32 + lane] += 5;
          break;

        default:
          abort ();
        }

#pragma acc loop gang(static:*)
    for (row = 0; row < 32; row++)
      n[row]++;
  }

  /* Selectors started at pos % 5 and were bumped twice.  */
  for (pos = 0; pos < 32; pos++)
    assert (n[pos] == (pos % 5) + 2);

  /* Each element holds the selector value its row had during the switch.  */
  for (pos = 0; pos < 1024; pos++)
    {
      int row = pos / 32;
      assert (arr[pos] == (row % 5) + 1);
    }
}
284
285
286 /* Test trivial operation of vector-single mode. */
287
void t7()
{
  /* A lone scalar update: the region has a vector length but contains no
     vector-partitioned code, which is what the expected warning is about.  */
  int n = 0;

#pragma acc parallel copy(n) \
                     num_gangs(1) num_workers(1) vector_length(32)
  /* { dg-warning "region is vector partitioned but does not contain vector partitioned code" "" { target *-*-* } .-2 } */
  {
    n += 1;
  }

  assert (n == 1);
}
299
300
301 /* Test vector-single, gang-partitioned mode. */
302
void t8()
{
  int arr[1024];

  /* Repeat the test for every power-of-two gang count from 1 to 1024.  */
  for (int gangs = 1; gangs <= 1024; gangs *= 2)
    {
      int pos;

      for (pos = 0; pos < 1024; pos++)
        arr[pos] = 0;

#pragma acc parallel copy(arr) \
                     num_gangs(gangs) num_workers(1) vector_length(32)
      /* { dg-warning "region is vector partitioned but does not contain vector partitioned code" "" { target *-*-* } .-2 } */
      {
        int it;
#pragma acc loop gang
        for (it = 0; it < 1024; it++)
          arr[it]++;
      }

      /* Every element must have been bumped exactly once, regardless of
         how many gangs shared the work.  */
      for (pos = 0; pos < 1024; pos++)
        assert (arr[pos] == 1);
    }
}
329
330
331 /* Test conditions in vector-single mode. */
332
void t9()
{
  int arr[1024];

  /* Repeat the test for every power-of-two gang count from 1 to 1024.  */
  for (int gangs = 1; gangs <= 1024; gangs *= 2)
    {
      int pos;

      for (pos = 0; pos < 1024; pos++)
        arr[pos] = 0;

#pragma acc parallel copy(arr) \
                     num_gangs(gangs) num_workers(1) vector_length(32)
      /* { dg-warning "region is vector partitioned but does not contain vector partitioned code" "" { target *-*-* } .-2 } */
      {
        int it;
#pragma acc loop gang
        for (it = 0; it < 1024; it++)
          if ((it % 3) == 0)
            arr[it]++;
          else
            arr[it] += 2;
      }

      /* Multiples of three got +1, everything else got +2.  */
      for (pos = 0; pos < 1024; pos++)
        {
          int expected = ((pos % 3) == 0) ? 1 : 2;
          assert (arr[pos] == expected);
        }
    }
}
362
363
364 /* Test switch in vector-single mode. */
365
void t10()
{
  int arr[1024];

  /* Repeat the test for every power-of-two gang count from 1 to 1024.  */
  for (int gangs = 1; gangs <= 1024; gangs *= 2)
    {
      int pos;

      for (pos = 0; pos < 1024; pos++)
        arr[pos] = 0;

#pragma acc parallel copy(arr) \
                     num_gangs(gangs) num_workers(1) vector_length(32)
      /* { dg-warning "region is vector partitioned but does not contain vector partitioned code" "" { target *-*-* } .-2 } */
      {
        int it;
#pragma acc loop gang
        for (it = 0; it < 1024; it++)
          switch (it % 5)
            {
            case 0:
              arr[it] += 1;
              break;
            case 1:
              arr[it] += 2;
              break;
            case 2:
              arr[it] += 3;
              break;
            case 3:
              arr[it] += 4;
              break;
            case 4:
              arr[it] += 5;
              break;
            default:
              /* Not reachable for a non-negative index; a value of 99
                 would trip the check below.  */
              arr[it] += 99;
            }
      }

      for (pos = 0; pos < 1024; pos++)
        assert (arr[pos] == (pos % 5) + 1);
    }
}
400
401
402 /* Test switch in vector-single mode, initialise array on device. */
403
void t11()
{
  int arr[1024];
  int pos;

  /* Fill with a value that no passing element can end up with, so that the
     device-side initialisation below is actually exercised.  */
  for (pos = 0; pos < 1024; pos++)
    arr[pos] = 99;

#pragma acc parallel copy(arr) \
                     num_gangs(1024) num_workers(1) vector_length(32)
  /* { dg-warning "region is vector partitioned but does not contain vector partitioned code" "" { target *-*-* } .-2 } */
  {
    int it;

    /* Both loops use the same "static" gang scheduling: the second loop
       reads what the first one wrote, so each iteration has to land on the
       same gang in both loops.  */
#pragma acc loop gang(static:*)
    for (it = 0; it < 1024; it++)
      arr[it] = 0;

#pragma acc loop gang(static:*)
    for (it = 0; it < 1024; it++)
      switch (it % 5)
        {
        case 0:
          arr[it] += 1;
          break;
        case 1:
          arr[it] += 2;
          break;
        case 2:
          arr[it] += 3;
          break;
        case 3:
          arr[it] += 4;
          break;
        case 4:
          arr[it] += 5;
          break;
        default:
          arr[it] += 99;
        }
  }

  for (pos = 0; pos < 1024; pos++)
    assert (arr[pos] == (pos % 5) + 1);
}
441
442
443 /* Test multiple conditions in vector-single mode. */
444
#define NUM_GANGS 4096
void t12()
{
  /* One flag per index in each category: divisible by 3 only, by 5 only,
     or by both.  All three arrays are written entirely on the device.  */
  bool fizz[NUM_GANGS], buzz[NUM_GANGS], fizzbuzz[NUM_GANGS];
  int pos;

#pragma acc parallel copyout(fizz, buzz, fizzbuzz) \
                     num_gangs(NUM_GANGS) num_workers(1) vector_length(32)
  /* { dg-warning "region is vector partitioned but does not contain vector partitioned code" "" { target *-*-* } .-2 } */
  {
    int it;

    /* Both loops use the same "static" gang scheduling: the second loop
       relies on the flags the first one cleared, so each iteration has to
       land on the same gang in both loops.  */
#pragma acc loop gang(static:*)
    for (it = 0; it < NUM_GANGS; it++)
      fizz[it] = buzz[it] = fizzbuzz[it] = 0;

#pragma acc loop gang(static:*)
    for (it = 0; it < NUM_GANGS; it++)
      {
        if ((it % 3) == 0 && (it % 5) == 0)
          fizzbuzz[it] = 1;
        else if ((it % 3) == 0)
          fizz[it] = 1;
        else if ((it % 5) == 0)
          buzz[it] = 1;
      }
  }

  for (pos = 0; pos < NUM_GANGS; pos++)
    {
      assert (fizzbuzz[pos] == ((pos % 3) == 0 && (pos % 5) == 0));
      assert (fizz[pos] == ((pos % 3) == 0 && (pos % 5) != 0));
      assert (buzz[pos] == ((pos % 3) != 0 && (pos % 5) == 0));
    }
}
#undef NUM_GANGS
487
488
489 /* Test worker-partitioned/vector-single mode. */
490
void t13()
{
  /* 32 gang-loop iterations, each owning 8 consecutive elements that are
     handed out to workers.  */
  int arr[32 * 8];
  int pos;

  for (pos = 0; pos < 32 * 8; pos++)
    arr[pos] = 0;

#pragma acc parallel copy(arr) \
                     num_gangs(8) num_workers(8) vector_length(32)
  /* { dg-warning "region is vector partitioned but does not contain vector partitioned code" "" { target *-*-* } .-2 } */
  {
    int row;
#pragma acc loop gang
    for (row = 0; row < 32; row++)
      {
        int w;
#pragma acc loop worker
        for (w = 0; w < 8; w++)
          arr[row * 8 + w] += row * 8 + w;
      }
  }

  /* Every element started at zero and had its own index added once.  */
  for (pos = 0; pos < 32 * 8; pos++)
    assert (arr[pos] == pos);
}
516
517
518 /* Test condition in worker-partitioned mode. */
519
void t14()
{
  /* Laid out as 32 gang rows x 8 worker slots x 32 vector lanes.  */
  int arr[32 * 32 * 8];
  int pos;

  for (pos = 0; pos < 32 * 32 * 8; pos++)
    arr[pos] = pos;

#pragma acc parallel copy(arr) \
                     num_gangs(8) num_workers(8) vector_length(32)
  {
    int row;
#pragma acc loop gang
    for (row = 0; row < 32; row++)
      {
        int w;
#pragma acc loop worker
        for (w = 0; w < 8; w++)
          {
            int lane;

            /* Even worker slots add 1, odd worker slots add 2.  */
            if ((w % 2) == 0)
              {
#pragma acc loop vector
                for (lane = 0; lane < 32; lane++)
                  arr[row * 32 * 8 + w * 32 + lane]++;
              }
            else
              {
#pragma acc loop vector
                for (lane = 0; lane < 32; lane++)
                  arr[row * 32 * 8 + w * 32 + lane] += 2;
              }
          }
      }
  }

  for (pos = 0; pos < 32 * 32 * 8; pos++)
    {
      /* pos / 32 identifies the 32-lane chunk; its parity picks the bump.  */
      int bump = ((pos / 32) % 2) + 1;
      assert (arr[pos] == pos + bump);
    }
}
558
559
560 /* Test switch in worker-partitioned mode. */
561
void t15()
{
  /* Laid out as 32 gang rows x 8 worker slots x 32 vector lanes.  */
  int arr[32 * 32 * 8];
  int pos;

  for (pos = 0; pos < 32 * 32 * 8; pos++)
    arr[pos] = pos;

#pragma acc parallel copy(arr) \
                     num_gangs(8) num_workers(8) vector_length(32)
  {
    int row;
#pragma acc loop gang
    for (row = 0; row < 32; row++)
      {
        int w;
#pragma acc loop worker
        for (w = 0; w < 8; w++)
          {
            int lane;

            /* The selector picks how much is added to the whole 32-lane
               chunk owned by this (row, w) pair.  */
            switch ((row * 32 + w) % 3)
              {
              case 0:
#pragma acc loop vector
                for (lane = 0; lane < 32; lane++)
                  arr[row * 32 * 8 + w * 32 + lane]++;
                break;

              case 1:
#pragma acc loop vector
                for (lane = 0; lane < 32; lane++)
                  arr[row * 32 * 8 + w * 32 + lane] += 2;
                break;

              case 2:
#pragma acc loop vector
                for (lane = 0; lane < 32; lane++)
                  arr[row * 32 * 8 + w * 32 + lane] += 3;
                break;

              default: ;
              }
          }
      }
  }

  for (pos = 0; pos < 32 * 32 * 8; pos++)
    {
      int bump = ((pos / 32) % 3) + 1;
      assert (arr[pos] == pos + bump);
    }
}
610
611
612 /* Test worker-single/worker-partitioned transitions. */
613
void t16()
{
  int n[32], arr[32 * 32];
  int pos;

  for (pos = 0; pos < 32; pos++)
    n[pos] = 0;

  for (pos = 0; pos < 32 * 32; pos++)
    arr[pos] = 0;

#pragma acc parallel copy(n, arr) \
                     num_gangs(8) num_workers(16) vector_length(32)
  /* { dg-warning "region is vector partitioned but does not contain vector partitioned code" "" { target *-*-* } .-2 } */
  {
    int row;
#pragma acc loop gang
    for (row = 0; row < 32; row++)
      {
        int w;

        /* Worker-single updates of n[row] are interleaved with three
           worker-partitioned loops over the row.  */
        n[row]++;

#pragma acc loop worker
        for (w = 0; w < 32; w++)
          arr[row * 32 + w]++;

        n[row]++;

#pragma acc loop worker
        for (w = 0; w < 32; w++)
          arr[row * 32 + w]++;

        n[row]++;

#pragma acc loop worker
        for (w = 0; w < 32; w++)
          arr[row * 32 + w]++;

        n[row]++;
      }
  }

  /* Four worker-single bumps per row...  */
  for (pos = 0; pos < 32; pos++)
    assert (n[pos] == 4);

  /* ...and three worker-partitioned bumps per element.  */
  for (pos = 0; pos < 32 * 32; pos++)
    assert (arr[pos] == 3);
}
662
663
664 /* Test correct synchronisation between worker-partitioned loops. */
665
void t17()
{
  int arr_a[32 * 32], arr_b[32 * 32];
  int workers, gangs, pos;

  /* Try every power-of-two combination of 1..32 workers and 1..32 gangs.  */
  for (workers = 1; workers <= 32; workers *= 2)
    {
      for (gangs = 1; gangs <= 32; gangs *= 2)
        {
          for (pos = 0; pos < 32 * 32; pos++)
            arr_a[pos] = pos;

#pragma acc parallel copyin(arr_a) copyout(arr_b) \
                     num_gangs(gangs) num_workers(workers) vector_length(32)
          /* { dg-warning "region is vector partitioned but does not contain vector partitioned code" "" { target *-*-* } .-2 } */
          {
            int row;
#pragma acc loop gang
            for (row = 0; row < 32; row++)
              {
                int w;

                /* Each loop doubles the values while mirroring them
                   within the row, reading what the previous loop wrote
                   at the mirrored position.  */
#pragma acc loop worker
                for (w = 0; w < 32; w++)
                  arr_b[row * 32 + (31 - w)] = arr_a[row * 32 + w] * 2;

#pragma acc loop worker
                for (w = 0; w < 32; w++)
                  arr_a[row * 32 + (31 - w)] = arr_b[row * 32 + w] * 2;

#pragma acc loop worker
                for (w = 0; w < 32; w++)
                  arr_b[row * 32 + (31 - w)] = arr_a[row * 32 + w] * 2;
              }
          }

          /* Three doublings and an odd number of mirrorings: each slot
             holds 8x the index mirrored within its row (pos ^ 31).  */
          for (pos = 0; pos < 32 * 32; pos++)
            assert (arr_b[pos] == (pos ^ 31) * 8);
        }
    }
}
705
706
707 /* Test correct synchronisation between worker+vector-partitioned loops. */
708
void t18()
{
  int arr_a[32 * 32 * 32], arr_b[32 * 32 * 32];
  int workers, gangs, pos;

  /* Try every power-of-two combination of 1..32 workers and 1..32 gangs.  */
  for (workers = 1; workers <= 32; workers *= 2)
    {
      for (gangs = 1; gangs <= 32; gangs *= 2)
        {
          for (pos = 0; pos < 32 * 32 * 32; pos++)
            arr_a[pos] = pos;

#pragma acc parallel copyin(arr_a) copyout(arr_b) \
                     num_gangs(gangs) num_workers(workers) vector_length(32)
          {
            int row;
#pragma acc loop gang
            for (row = 0; row < 32; row++)
              {
                int e;

                /* Each loop doubles the values while mirroring them
                   within the 1024-element row, reading what the previous
                   loop wrote at the mirrored position.  */
#pragma acc loop worker vector
                for (e = 0; e < 32 * 32; e++)
                  arr_b[row * 32 * 32 + (1023 - e)]
                    = arr_a[row * 32 * 32 + e] * 2;

#pragma acc loop worker vector
                for (e = 0; e < 32 * 32; e++)
                  arr_a[row * 32 * 32 + (1023 - e)]
                    = arr_b[row * 32 * 32 + e] * 2;

#pragma acc loop worker vector
                for (e = 0; e < 32 * 32; e++)
                  arr_b[row * 32 * 32 + (1023 - e)]
                    = arr_a[row * 32 * 32 + e] * 2;
              }
          }

          /* Three doublings and an odd number of mirrorings: each slot
             holds 8x the index mirrored within its row (pos ^ 1023).  */
          for (pos = 0; pos < 32 * 32 * 32; pos++)
            assert (arr_b[pos] == (pos ^ 1023) * 8);
        }
    }
}
747
748
749 /* Test correct synchronisation between vector-partitioned loops in
750 worker-partitioned mode. */
751
void t19()
{
  int n[32 * 32], arr_a[32 * 32 * 32], arr_b[32 * 32 * 32];
  int workers, gangs, pos;

  /* Try every power-of-two combination of 1..32 workers and 1..32 gangs.  */
  for (workers = 1; workers <= 32; workers *= 2)
    {
      for (gangs = 1; gangs <= 32; gangs *= 2)
        {
          for (pos = 0; pos < 32 * 32; pos++)
            n[pos] = 0;

          for (pos = 0; pos < 32 * 32 * 32; pos++)
            arr_a[pos] = pos;

#pragma acc parallel copy (n) copyin(arr_a) copyout(arr_b) \
                     num_gangs(gangs) num_workers(workers) vector_length(32)
          {
            int row;
#pragma acc loop gang
            for (row = 0; row < 32; row++)
              {
                int w;

#pragma acc loop worker
                for (w = 0; w < 32; w++)
                  {
                    int lane;

                    /* Vector-single bump before the first vector loop.  */
                    n[row * 32 + w]++;

                    /* Pass 1: a -> b, mirrored within the 32-lane chunk,
                       scaled by 2 or 3 depending on the source index
                       parity.  */
#pragma acc loop vector
                    for (lane = 0; lane < 32; lane++)
                      {
                        if (((row * 1024 + w * 32 + lane) % 2) == 0)
                          arr_b[row * 1024 + w * 32 + (31 - lane)]
                            = arr_a[row * 1024 + w * 32 + lane] * 2;
                        else
                          arr_b[row * 1024 + w * 32 + (31 - lane)]
                            = arr_a[row * 1024 + w * 32 + lane] * 3;
                      }

                    /* Drop back to vector-single mode between loops...  */
                    n[row * 32 + w]++;

                    /* Pass 2: b -> a, mirrored, scaled by 5 or 7 depending
                       on whether the source index is a multiple of 3.  */
#pragma acc loop vector
                    for (lane = 0; lane < 32; lane++)
                      {
                        if (((row * 1024 + w * 32 + lane) % 3) == 0)
                          arr_a[row * 1024 + w * 32 + (31 - lane)]
                            = arr_b[row * 1024 + w * 32 + lane] * 5;
                        else
                          arr_a[row * 1024 + w * 32 + (31 - lane)]
                            = arr_b[row * 1024 + w * 32 + lane] * 7;
                      }

                    /* ...and then run two vector loops back to back.  */

                    /* Pass 3: a -> b, mirrored, scaled by 3 or 2 (the
                       opposite pairing from pass 1).  */
#pragma acc loop vector
                    for (lane = 0; lane < 32; lane++)
                      {
                        if (((row * 1024 + w * 32 + lane) % 2) == 0)
                          arr_b[row * 1024 + w * 32 + (31 - lane)]
                            = arr_a[row * 1024 + w * 32 + lane] * 3;
                        else
                          arr_b[row * 1024 + w * 32 + (31 - lane)]
                            = arr_a[row * 1024 + w * 32 + lane] * 2;
                      }
                  }
              }
          }

          for (pos = 0; pos < 32 * 32; pos++)
            assert (n[pos] == 2);

          for (pos = 0; pos < 32 * 32 * 32; pos++)
            {
              /* Passes 1 and 3 together always contribute 2 * 3 = 6;
                 pass 2 contributes 5 or 7.  */
              int scale = 6 * (((pos % 3) == 0) ? 5 : 7);
              assert (arr_b[pos] == (pos ^ 31) * scale);
            }
        }
    }
}
833
834
835 /* With -O0, variables are on the stack, not in registers. Check that worker
836 state propagation handles the stack frame. */
837
void t20()
{
  /* One result slot per worker-loop iteration.  */
  int w0 = 0, w1 = 0, w2 = 0, w3 = 0;
  int w4 = 0, w5 = 0, w6 = 0, w7 = 0;

  int slot;

#pragma acc parallel copy (w0, w1, w2, w3, w4, w5, w6, w7) \
                     num_gangs (1) num_workers (8)
  {
    /* Set in worker-single mode and then read inside the worker loop.  */
    int internal = 100;

#pragma acc loop worker
    for (slot = 0; slot < 8; slot++)
      {
        switch (slot)
          {
          case 0:
            w0 = internal;
            break;
          case 1:
            w1 = internal;
            break;
          case 2:
            w2 = internal;
            break;
          case 3:
            w3 = internal;
            break;
          case 4:
            w4 = internal;
            break;
          case 5:
            w5 = internal;
            break;
          case 6:
            w6 = internal;
            break;
          case 7:
            w7 = internal;
            break;
          default:
            break;
          }
      }
  }

  /* Every slot must have received the value of 'internal'.  */
  if (!(w0 == 100 && w1 == 100 && w2 == 100 && w3 == 100
        && w4 == 100 && w5 == 100 && w6 == 100 && w7 == 100))
    __builtin_abort ();
}
884
885
886 /* Test worker-single/vector-single mode. */
887
void t21()
{
  int arr[32];
  int pos;

  for (pos = 0; pos < 32; pos++)
    arr[pos] = 0;

  /* Only a gang loop: workers and vectors are requested but never used,
     hence both expected warnings.  */
#pragma acc parallel copy(arr) \
                     num_gangs(8) num_workers(8) vector_length(32)
  /* { dg-warning "region is worker partitioned but does not contain worker partitioned code" "" { target *-*-* } .-2 } */
  /* { dg-warning "region is vector partitioned but does not contain vector partitioned code" "" { target *-*-* } .-3 } */
  {
    int it;
#pragma acc loop gang
    for (it = 0; it < 32; it++)
      arr[it]++;
  }

  for (pos = 0; pos < 32; pos++)
    assert (arr[pos] == 1);
}
909
910
911 /* Test worker-single/vector-single mode. */
912
void t22()
{
  int arr[32];
  int pos;

  for (pos = 0; pos < 32; pos++)
    arr[pos] = 0;

  /* Same shape as a plain gang loop, but the element update is done with
     an atomic construct.  */
#pragma acc parallel copy(arr) \
                     num_gangs(8) num_workers(8) vector_length(32)
  /* { dg-warning "region is worker partitioned but does not contain worker partitioned code" "" { target *-*-* } .-2 } */
  /* { dg-warning "region is vector partitioned but does not contain vector partitioned code" "" { target *-*-* } .-3 } */
  {
    int it;
#pragma acc loop gang
    for (it = 0; it < 32; it++)
      {
#pragma acc atomic
        arr[it]++;
      }
  }

  for (pos = 0; pos < 32; pos++)
    assert (arr[pos] == 1);
}
937
938
939 /* Test condition in worker-single/vector-single mode. */
940
void t23()
{
  int arr[32];
  int pos;

  /* Every element starts out holding its own index.  */
  for (pos = 0; pos < 32; pos++)
    arr[pos] = pos;

#pragma acc parallel copy(arr) \
                     num_gangs(8) num_workers(8) vector_length(32)
  /* { dg-warning "region is worker partitioned but does not contain worker partitioned code" "" { target *-*-* } .-2 } */
  /* { dg-warning "region is vector partitioned but does not contain vector partitioned code" "" { target *-*-* } .-3 } */
  {
    int it;
#pragma acc loop gang
    for (it = 0; it < 32; it++)
      if ((arr[it] % 2) != 0)
        arr[it]++;
      else
        arr[it] += 2;
  }

  /* Odd values got +1, even values got +2.  */
  for (pos = 0; pos < 32; pos++)
    {
      int expected = ((pos % 2) != 0) ? pos + 1 : pos + 2;
      assert (arr[pos] == expected);
    }
}
965
966
967 /* Test switch in worker-single/vector-single mode. */
968
void t24()
{
  int arr[32];
  int pos;

  /* Every element starts out holding its own index.  */
  for (pos = 0; pos < 32; pos++)
    arr[pos] = pos;

#pragma acc parallel copy(arr) \
                     num_gangs(8) num_workers(8) vector_length(32)
  /* { dg-warning "region is worker partitioned but does not contain worker partitioned code" "" { target *-*-* } .-2 } */
  /* { dg-warning "region is vector partitioned but does not contain vector partitioned code" "" { target *-*-* } .-3 } */
  {
    int it;
#pragma acc loop gang
    for (it = 0; it < 32; it++)
      switch (arr[it] % 5)
        {
        case 0:
          arr[it] += 1;
          break;
        case 1:
          arr[it] += 2;
          break;
        case 2:
          arr[it] += 3;
          break;
        case 3:
          arr[it] += 4;
          break;
        case 4:
          arr[it] += 5;
          break;
        default:
          arr[it] += 99;
        }
  }

  /* The switch selector was the element's initial value, i.e. its index.  */
  for (pos = 0; pos < 32; pos++)
    assert (arr[pos] == pos + (pos % 5) + 1);
}
998
999
1000 /* Test worker-single/vector-partitioned mode. */
1001
void t25()
{
  /* 32 gang rows of 32 vector lanes each.  */
  int arr[32 * 32];
  int pos;

  for (pos = 0; pos < 32 * 32; pos++)
    arr[pos] = pos;

#pragma acc parallel copy(arr) \
                     num_gangs(8) num_workers(8) vector_length(32)
  /* { dg-warning "region is worker partitioned but does not contain worker partitioned code" "" { target *-*-* } .-2 } */
  {
    int row;
#pragma acc loop gang
    for (row = 0; row < 32; row++)
      {
        int lane;
#pragma acc loop vector
        for (lane = 0; lane < 32; lane++)
          {
#pragma acc atomic
            arr[row * 32 + lane]++;
          }
      }
  }

  for (pos = 0; pos < 32 * 32; pos++)
    assert (arr[pos] == pos + 1);
}
1030
1031
1032 /* Test multiple conditional vector-partitioned loops in worker-single
1033 mode. */
1034
void t26()
{
  /* Amount added to a row's elements, indexed by row % 3; rows with a
     remainder of 2 are left untouched.  */
  static const int bump_by_residue[3] = { 3, 7, 0 };

  /* 32 gang rows of 32 vector lanes each.  */
  int arr[32 * 32];
  int pos;

  for (pos = 0; pos < 32 * 32; pos++)
    arr[pos] = pos;

#pragma acc parallel copy(arr) \
                     num_gangs(8) num_workers(8) vector_length(32)
  /* { dg-warning "region is worker partitioned but does not contain worker partitioned code" "" { target *-*-* } .-2 } */
  {
    int row;
#pragma acc loop gang
    for (row = 0; row < 32; row++)
      {
        int lane;
        if ((row % 3) == 0)
          {
#pragma acc loop vector
            for (lane = 0; lane < 32; lane++)
              {
#pragma acc atomic
                arr[row * 32 + lane] += 3;
              }
          }
        else if ((row % 3) == 1)
          {
#pragma acc loop vector
            for (lane = 0; lane < 32; lane++)
              {
#pragma acc atomic
                arr[row * 32 + lane] += 7;
              }
          }
      }
  }

  for (pos = 0; pos < 32 * 32; pos++)
    {
      int residue = (pos / 32) % 3;
      assert (arr[pos] == pos + bump_by_residue[residue]);
    }
}
1078
1079
1080 /* Test worker-single, vector-partitioned, gang-redundant mode. */
1081
1082 #define ACTUAL_GANGS 8
1083 void t27()
1084 {
1085 int n, arr[32], i;
1086 int ondev;
1087
1088 for (i = 0; i < 32; i++)
1089 arr[i] = 0;
1090
1091 n = 0;
1092
1093 #pragma acc parallel copy(n, arr) copyout(ondev) \
1094 num_gangs(ACTUAL_GANGS) num_workers(8) vector_length(32)
1095 /* { dg-bogus "warning: region is gang partitioned but does not contain gang partitioned code" "TODO 'atomic'" { xfail *-*-* } .-2 } */
1096 /* { dg-warning "region is worker partitioned but does not contain worker partitioned code" "" { target *-*-* } .-3 } */
1097 {
1098 int j;
1099
1100 ondev = acc_on_device (acc_device_not_host);
1101
1102 #pragma acc atomic
1103 n++;
1104
1105 #pragma acc loop vector
1106 for (j = 0; j < 32; j++)
1107 {
1108 #pragma acc atomic
1109 arr[j] += 1;
1110 }
1111
1112 #pragma acc atomic
1113 n++;
1114 }
1115
1116 int m = ondev ? ACTUAL_GANGS : 1;
1117
1118 assert (n == m * 2);
1119
1120 for (i = 0; i < 32; i++)
1121 assert (arr[i] == m);
1122 }
1123 #undef ACTUAL_GANGS
1124
1125
/* Check if worker-single variables get broadcast to vectors.  */
1127
/* Device-callable helper returning a fixed constant; t28 calls it both
   inside the parallel region and on the host when checking results.  */
#pragma acc routine
float t28_routine ()
{
  const float result = 2.71;

  return result;
}
1133
#define N 32
void t28()
{
  float v1 = 3.14;
  float threads[N];

  /* Pre-fill with a value that no passing element can keep.  */
  for (int lane = 0; lane < N; lane++)
    threads[lane] = -1;

#pragma acc parallel num_gangs (1) vector_length (32) copy (v1)
  {
    /* Computed once before the vector loop, then read by every iteration
       of it.  */
    float val = t28_routine ();

#pragma acc loop vector
    for (int lane = 0; lane < N; lane++)
      threads[lane] = val + v1*lane;
  }

  /* Recompute the expected value on the host and compare with a small
     tolerance.  */
  for (int lane = 0; lane < N; lane++)
    {
      double err = fabs (threads[lane] - (t28_routine () + v1*lane));
      assert (err < 0.0001);
    }
}
#undef N
1155
1156
1157 int main()
1158 {
1159 t1();
1160 t2();
1161 t3();
1162 t4();
1163 t5();
1164 t6();
1165 t7();
1166 t8();
1167 t9();
1168 t10();
1169 t11();
1170 t12();
1171 t13();
1172 t14();
1173 t15();
1174 t16();
1175 t17();
1176 t18();
1177 t19();
1178 t20();
1179 t21();
1180 t22();
1181 t23();
1182 t24();
1183 t25();
1184 t26();
1185 t27();
1186 t28();
1187
1188 return 0;
1189 }