1 /* "Function scope" (top-level block scope) 'static' variables
2
3 ... inside OpenACC compute construct regions as well as OpenACC 'routine'.
4
5 This is to document/verify aspects of GCC's observed behavior, not
6 necessarily as it's (intended to be?) restricted by the OpenACC
7 specification. See also PR84991, PR84992, PR90779 etc., and
8 <https://github.com/OpenACC/openacc-spec/issues/372> "C/C++ 'static'
9 variables" (only visible to members of the GitHub OpenACC organization).
10 */
11
12 /* { dg-additional-options "-fopt-info-note-omp" }
13 { dg-additional-options "--param=openacc-privatization=noisy" }
14 { dg-additional-options "-foffload=-fopt-info-note-omp" }
15 { dg-additional-options "-foffload=--param=openacc-privatization=noisy" }
16 for testing/documenting aspects of that functionality. */
17
18 /* { dg-additional-options "-Wopenacc-parallelism" } for testing/documenting
19 aspects of that functionality. */
20
21
22 #undef NDEBUG
23 #include <assert.h>
24 #include <string.h>
25 #include <openacc.h>
26 #include <gomp-constants.h>
27
28
29 #define IF_DEBUG if (0)
30
31
32 /* Without explicit 'num_gangs'. */
33
34 static void t0_c(void)
35 {
36 IF_DEBUG
37 __builtin_printf ("%s\n", __FUNCTION__);
38
39 const int i_limit = 11;
40 const int var_init = 16;
41
42 for (int i = 0; i < i_limit; ++i)
43 {
44 int result = 0;
45 int num_gangs_actual = -1;
46 #pragma acc parallel \
47 reduction(max:num_gangs_actual) \
48 reduction(max:result)
49 /* { dg-note {variable 'var' declared in block isn't candidate for adjusting OpenACC privatization level: static} "" { target *-*-* } .-3 } */
50 {
51 num_gangs_actual = 1 + __builtin_goacc_parlevel_id(GOMP_DIM_GANG);
52
53 static int var = var_init;
54
55 #pragma acc atomic capture
56 result = ++var;
57
58 /* Irrespective of the order in which the gang-redundant threads
59 execute, 'var' has now been incremented 'num_gangs_actual' times, and
60 the final value captured as 'result'. */
61 }
62 /* Without an explicit 'num_gangs' clause GCC assigns 'num_gangs(1)'
63 because it doesn't see any use of gang-level parallelism inside the
64 region. */
65 assert(num_gangs_actual == 1);
66 assert(result == var_init + num_gangs_actual * (1 + i));
67 }
68 }
69
70
71 /* Call a gang-level routine. */
72
73 static const int t0_r_var_init = 61;
74
75 #pragma acc routine gang
76 /* { dg-bogus "warning: region is gang partitioned but does not contain gang partitioned code" "TODO 'atomic'" { xfail *-*-* } .+4 } */
77 /* { dg-warning "region is worker partitioned but does not contain worker partitioned code" "" { target *-*-* } .+3 } */
78 /* { dg-warning "region is vector partitioned but does not contain vector partitioned code" "" { target *-*-* } .+2 } */
79 __attribute__((noinline))
80 static int t0_r_r(void)
81 {
82 static int var = t0_r_var_init;
83
84 int tmp;
85 #pragma acc atomic capture
86 tmp = ++var;
87
88 return tmp;
89 }
90
91 static void t0_r(void)
92 {
93 IF_DEBUG
94 __builtin_printf ("%s\n", __FUNCTION__);
95
96 const int i_limit = 11;
97
98 for (int i = 0; i < i_limit; ++i)
99 {
100 int result = 0;
101 int num_gangs_actual = -1;
102 #pragma acc parallel \
103 reduction(max:num_gangs_actual) \
104 reduction(max:result)
105 {
106 num_gangs_actual = 1 + __builtin_goacc_parlevel_id(GOMP_DIM_GANG);
107
108 result = t0_r_r();
109
110 /* Irrespective of the order in which the gang-redundant threads
111 execute, 'var' has now been incremented 'num_gangs_actual' times, and
112 the final value captured as 'result'. */
113 }
114 /* The number of gangs selected by the implemention ought to but must not
115 be bigger than one. */
116 IF_DEBUG
117 __builtin_printf ("%d: num_gangs_actual: %d\n", i, num_gangs_actual);
118 assert(num_gangs_actual >= 1);
119 assert(result == t0_r_var_init + num_gangs_actual * (1 + i));
120 }
121 }
122
123
124 /* Explicit 'num_gangs'. */
125
126 static void t1_c(void)
127 {
128 IF_DEBUG
129 __builtin_printf ("%s\n", __FUNCTION__);
130
131 const int i_limit = 22;
132 const int num_gangs_request = 444;
133 const int var_init = 5;
134
135 for (int i = 0; i < i_limit; ++i)
136 {
137 int result = 0;
138 int num_gangs_actual = -1;
139 /* { dg-bogus "warning: region is gang partitioned but does not contain gang partitioned code" "TODO 'atomic'" { xfail *-*-* } .+1 } */
140 #pragma acc parallel \
141 num_gangs(num_gangs_request) \
142 reduction(max:num_gangs_actual) \
143 reduction(max:result)
144 /* { dg-note {variable 'var' declared in block isn't candidate for adjusting OpenACC privatization level: static} "" { target *-*-* } .-4 } */
145 {
146 num_gangs_actual = 1 + __builtin_goacc_parlevel_id(GOMP_DIM_GANG);
147
148 static int var = var_init;
149
150 #pragma acc atomic capture
151 result = ++var;
152
153 /* Irrespective of the order in which the gang-redundant threads
154 execute, 'var' has now been incremented 'num_gangs_actual' times, and
155 the final value captured as 'result'. */
156 }
157 if (acc_get_device_type() == acc_device_host)
158 assert(num_gangs_actual == 1);
159 else
160 assert(num_gangs_actual == num_gangs_request);
161 assert(result == var_init + num_gangs_actual * (1 + i));
162 }
163 }
164
165
166 /* Check the same routine called from two compute constructs. */
167
168 static const int t1_r2_var_init = 166;
169
170 #pragma acc routine gang
171 /* { dg-bogus "warning: region is gang partitioned but does not contain gang partitioned code" "TODO 'atomic'" { xfail *-*-* } .+4 } */
172 /* { dg-warning "region is worker partitioned but does not contain worker partitioned code" "" { target *-*-* } .+3 } */
173 /* { dg-warning "region is vector partitioned but does not contain vector partitioned code" "" { target *-*-* } .+2 } */
174 __attribute__((noinline))
175 static int t1_r2_r(void)
176 {
177 static int var = t1_r2_var_init;
178
179 int tmp;
180 #pragma acc atomic capture
181 tmp = ++var;
182
183 return tmp;
184 }
185
186 static void t1_r2(void)
187 {
188 IF_DEBUG
189 __builtin_printf ("%s\n", __FUNCTION__);
190
191 const int i_limit = 71;
192 /* The checking assumes the same 'num_gangs' for all compute constructs. */
193 const int num_gangs_request = 333;
194 int num_gangs_actual = -1;
195 if (acc_get_device_type() == acc_device_host)
196 num_gangs_actual = 1;
197 else
198 {
199 /* We're assuming that the implementation is able to accomodate the
200 'num_gangs' requested (which really ought to be true for
201 'num_gangs'). */
202 num_gangs_actual = num_gangs_request;
203 }
204
205 for (int i = 0; i < i_limit; ++i)
206 {
207 int result_1 = 0;
208 #pragma acc parallel \
209 num_gangs(num_gangs_request) \
210 reduction(max:result_1)
211 {
212 result_1 = t1_r2_r();
213
214 /* Irrespective of the order in which the gang-redundant threads
215 execute, 'var' has now been incremented 'num_gangs_actual' times, and
216 the final value captured as 'result_1'. */
217 }
218 IF_DEBUG
219 __builtin_printf ("%d: result_1: %d\n", i, result_1);
220 assert(result_1 == t1_r2_var_init + num_gangs_actual * (1 + (i * 3 + 0)));
221
222 int result_2 = 0;
223 #pragma acc parallel \
224 num_gangs(num_gangs_request) \
225 reduction(max:result_2)
226 {
227 result_2 = t1_r2_r() + t1_r2_r();
228
229 /* Irrespective of the order in which the gang-redundant threads
230 execute, 'var' has now been incremented '2 * num_gangs_actual' times.
231 However, the order of the two 't1_r2_r' function calls is not
232 synchronized (between different gang-redundant threads). We thus
233 cannot verify the actual 'result_2' values in this case. */
234 }
235 IF_DEBUG
236 __builtin_printf ("%d: result_2: %d\n", i, result_2);
237 if (num_gangs_actual == 1)
238 /* Per the rationale above, only in this case we can check the actual
239 result. */
240 assert(result_2 == (t1_r2_var_init + num_gangs_actual * (1 + (i * 3 + 1))
241 + t1_r2_var_init + num_gangs_actual * (1 + (i * 3 + 2))));
242 /* But we can generally check low and high limits. */
243 {
244 /* Must be bigger than '2 * result_1'. */
245 int c = 2 * result_1;
246 IF_DEBUG
247 __builtin_printf (" > %d\n", c);
248 assert(result_2 > c);
249 }
250 {
251 /* ..., but limited by the base value for next 'i'. */
252 int c = 2 * (t1_r2_var_init + num_gangs_actual * (0 + ((i + 1) * 3 + 0)));
253 IF_DEBUG
254 __builtin_printf (" < %d\n", c);
255 assert(result_2 < c);
256 }
257 }
258 }
259
260
261 /* Asynchronous execution. */
262
263 static const int t2_var_init_2 = -55;
264
265 #pragma acc routine gang
266 /* { dg-bogus "warning: region is gang partitioned but does not contain gang partitioned code" "TODO 'atomic'" { xfail *-*-* } .+4 } */
267 /* { dg-warning "region is worker partitioned but does not contain worker partitioned code" "" { target *-*-* } .+3 } */
268 /* { dg-warning "region is vector partitioned but does not contain vector partitioned code" "" { target *-*-* } .+2 } */
269 __attribute__((noinline))
270 static int t2_r(void)
271 {
272 static int var = t2_var_init_2;
273
274 int tmp;
275 #pragma acc atomic capture
276 tmp = ++var;
277
278 return tmp;
279 }
280
281 static void t2(void)
282 {
283 IF_DEBUG
284 __builtin_printf ("%s\n", __FUNCTION__);
285
286 const int i_limit = 12;
287 const int num_gangs_request_1 = 14;
288 const int var_init_1 = 5;
289 int results_1[i_limit][num_gangs_request_1];
290 memset (results_1, 0, sizeof results_1);
291 const int num_gangs_request_2 = 5;
292 int results_2[i_limit][num_gangs_request_2];
293 memset (results_2, 0, sizeof results_2);
294 const int num_gangs_request_3 = 34;
295 const int var_init_3 = 1250;
296 int results_3[i_limit][num_gangs_request_3];
297 memset (results_3, 0, sizeof results_3);
298
299 #pragma acc data \
300 copy(results_1, results_2, results_3)
301 /* { dg-note {variable 'num_gangs_request_1\.[0-9]+' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} "" { target { c && { ! __OPTIMIZE__ } } } .-2 } */
302 /* { dg-note {variable 'num_gangs_request_2\.[0-9]+' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} "" { target { c && { ! __OPTIMIZE__ } } } .-3 } */
303 /* { dg-note {variable 'num_gangs_request_3\.[0-9]+' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} "" { target { c && { ! __OPTIMIZE__ } } } .-4 } */
304 /* { dg-note {variable 'i' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} "" { target *-*-* } .-5 } */
305 {
306 for (int i = 0; i < i_limit; ++i)
307 {
308 /* The following 'async' clauses effect asynchronous execution, but
309 using the same async-argument for each compute construct implies that
310 the respective compute constructs' execution is synchronized with
311 itself, meaning that all 'i = 0' execution has finished (on the
312 device) before 'i = 1' is started (on the device), etc. */
313
314 /* { dg-bogus "warning: region is gang partitioned but does not contain gang partitioned code" "TODO 'atomic'" { xfail *-*-* } .+1 } */
315 #pragma acc parallel \
316 present(results_1) \
317 num_gangs(num_gangs_request_1) \
318 async(1)
319 /* { dg-note {variable 'var' declared in block isn't candidate for adjusting OpenACC privatization level: static} "" { target *-*-* } .-4 } */
320 /* { dg-note {variable 'tmp' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} "" { target *-*-* } .-5 } */
321 {
322 static int var = var_init_1;
323
324 int tmp;
325 #pragma acc atomic capture
326 tmp = ++var;
327
328 results_1[i][__builtin_goacc_parlevel_id(GOMP_DIM_GANG)] += tmp;
329 }
330
331 #pragma acc parallel \
332 present(results_2) \
333 num_gangs(num_gangs_request_2) \
334 async(2)
335 {
336 results_2[i][__builtin_goacc_parlevel_id(GOMP_DIM_GANG)] += t2_r();
337 }
338
339 /* { dg-bogus "warning: region is gang partitioned but does not contain gang partitioned code" "TODO 'atomic'" { xfail *-*-* } .+1 } */
340 #pragma acc parallel \
341 present(results_3) \
342 num_gangs(num_gangs_request_3) \
343 async(3)
344 /* { dg-note {variable 'var' declared in block isn't candidate for adjusting OpenACC privatization level: static} "" { target *-*-* } .-4 } */
345 /* { dg-note {variable 'tmp' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} "" { target *-*-* } .-5 } */
346 {
347 static int var = var_init_3;
348
349 int tmp;
350 #pragma acc atomic capture
351 tmp = ++var;
352
353 results_3[i][__builtin_goacc_parlevel_id(GOMP_DIM_GANG)] += tmp;
354 }
355 }
356 #pragma acc wait
357 }
358 int num_gangs_actual_1;
359 int num_gangs_actual_2;
360 int num_gangs_actual_3;
361 if (acc_get_device_type() == acc_device_host)
362 {
363 num_gangs_actual_1 = 1;
364 num_gangs_actual_2 = 1;
365 num_gangs_actual_3 = 1;
366 }
367 else
368 {
369 /* We're assuming that the implementation is able to accomodate the
370 'num_gangs' requested (which really ought to be true for
371 'num_gangs'). */
372 num_gangs_actual_1 = num_gangs_request_1;
373 num_gangs_actual_2 = num_gangs_request_2;
374 num_gangs_actual_3 = num_gangs_request_3;
375 }
376
377 /* For 'i = 0', 'results_*[i][0..num_gangs_actual_*]' are expected to each
378 contain one value of '(1 + var_init_*)..(var_init_* + num_gangs_actual_*)',
379 and so on for increasing 'i'. Their order however is unspecified due to
380 the gang-redundant execution. (Thus checking that their sums match.) */
381
382 int result_1 = 0;
383 int result_2 = 0;
384 int result_3 = 0;
385 for (int i = 0; i < i_limit; ++i)
386 {
387 int result_1_ = 0;
388 for (int g = 0; g < num_gangs_actual_1; ++g)
389 {
390 IF_DEBUG
391 __builtin_printf ("results_1[%d][%d]: %d\n", i, g, results_1[i][g]);
392 result_1_ += results_1[i][g];
393 }
394 IF_DEBUG
395 __builtin_printf ("%d result_1_: %d\n", i, result_1_);
396 assert (result_1_ == (((var_init_1 + num_gangs_actual_1 * (1 + i)) * (1 + var_init_1 + num_gangs_actual_1 * (1 + i)) / 2)
397 - ((var_init_1 + num_gangs_actual_1 * (0 + i)) * (1 + var_init_1 + num_gangs_actual_1 * (0 + i)) / 2)));
398 result_1 += result_1_;
399
400 int result_2_ = 0;
401 for (int g = 0; g < num_gangs_actual_2; ++g)
402 {
403 IF_DEBUG
404 __builtin_printf ("results_2[%d][%d]: %d\n", i, g, results_2[i][g]);
405 result_2_ += results_2[i][g];
406 }
407 IF_DEBUG
408 __builtin_printf ("%d result_2_: %d\n", i, result_2_);
409 assert (result_2_ == (((t2_var_init_2 + num_gangs_actual_2 * (1 + i)) * (1 + t2_var_init_2 + num_gangs_actual_2 * (1 + i)) / 2)
410 - ((t2_var_init_2 + num_gangs_actual_2 * (0 + i)) * (1 + t2_var_init_2 + num_gangs_actual_2 * (0 + i)) / 2)));
411 result_2 += result_2_;
412
413 int result_3_ = 0;
414 for (int g = 0; g < num_gangs_actual_3; ++g)
415 {
416 IF_DEBUG
417 __builtin_printf ("results_3[%d][%d]: %d\n", i, g, results_3[i][g]);
418 result_3_ += results_3[i][g];
419 }
420 IF_DEBUG
421 __builtin_printf ("%d result_3_: %d\n", i, result_3_);
422 assert (result_3_ == (((var_init_3 + num_gangs_actual_3 * (1 + i)) * (1 + var_init_3 + num_gangs_actual_3 * (1 + i)) / 2)
423 - ((var_init_3 + num_gangs_actual_3 * (0 + i)) * (1 + var_init_3 + num_gangs_actual_3 * (0 + i)) / 2)));
424 result_3 += result_3_;
425 }
426 IF_DEBUG
427 __builtin_printf ("result_1: %d\n", result_1);
428 assert (result_1 == (((var_init_1 + num_gangs_actual_1 * i_limit) * (1 + var_init_1 + num_gangs_actual_1 * i_limit) / 2)
429 - (var_init_1 * (var_init_1 + 1) / 2)));
430 IF_DEBUG
431 __builtin_printf ("result_2: %d\n", result_2);
432 assert (result_2 == (((t2_var_init_2 + num_gangs_actual_2 * i_limit) * (1 + t2_var_init_2 + num_gangs_actual_2 * i_limit) / 2)
433 - (t2_var_init_2 * (t2_var_init_2 + 1) / 2)));
434 IF_DEBUG
435 __builtin_printf ("result_3: %d\n", result_3);
436 assert (result_3 == (((var_init_3 + num_gangs_actual_3 * i_limit) * (1 + var_init_3 + num_gangs_actual_3 * i_limit) / 2)
437 - (var_init_3 * (var_init_3 + 1) / 2)));
438 }
439
440
441 #pragma acc routine seq
442 __attribute__((noinline))
443 static int pr84991_1_r_s(int n)
444 {
445 static const int test[] = {1,2,3,4};
446 return test[n];
447 }
448
449 static void pr84991_1(void)
450 {
451 int n[1];
452 n[0] = 3;
453 #pragma acc parallel copy(n)
454 {
455 n[0] = pr84991_1_r_s(n[0]);
456 }
457 assert(n[0] == 4);
458 }
459
460
461 static void pr84992_1(void)
462 {
463 int n[1];
464 n[0] = 3;
465 #pragma acc parallel copy(n)
466 /* { dg-note {variable 'test' declared in block isn't candidate for adjusting OpenACC privatization level: static} "" { target *-*-* } .-1 } */
467 {
468 static const int test[] = {1,2,3,4};
469 n[0] = test[n[0]];
470 }
471 assert(n[0] == 4);
472 }
473
474
475 int main(void)
476 {
477 t0_c();
478
479 t0_r();
480
481 t1_c();
482
483 t1_r2();
484
485 t2();
486
487 pr84991_1();
488
489 pr84992_1();
490
491 return 0;
492 }