(root)/
gcc-13.2.0/
libgomp/
testsuite/
libgomp.oacc-c-c++-common/
static-variable-1.c
       1  /* "Function scope" (top-level block scope) 'static' variables
       2  
       3     ... inside OpenACC compute construct regions as well as OpenACC 'routine'.
       4  
       5     This is to document/verify aspects of GCC's observed behavior, not
       6     necessarily as it's (intended to be?) restricted by the OpenACC
       7     specification.  See also PR84991, PR84992, PR90779 etc., and
       8     <https://github.com/OpenACC/openacc-spec/issues/372> "C/C++ 'static'
       9     variables" (only visible to members of the GitHub OpenACC organization).
      10  */
      11  
      12  /* { dg-additional-options "-fopt-info-note-omp" }
      13     { dg-additional-options "--param=openacc-privatization=noisy" }
      14     { dg-additional-options "-foffload=-fopt-info-note-omp" }
      15     { dg-additional-options "-foffload=--param=openacc-privatization=noisy" }
      16     for testing/documenting aspects of that functionality.  */
      17  
      18  /* { dg-additional-options "-Wopenacc-parallelism" } for testing/documenting
      19     aspects of that functionality.  */
      20  
      21  
      22  #undef NDEBUG
      23  #include <assert.h>
      24  #include <string.h>
      25  #include <openacc.h>
      26  #include <gomp-constants.h>
      27  
      28  
      29  #define IF_DEBUG if (0)
      30  
      31  
      32  /* Without explicit 'num_gangs'.  */
      33  
      34  static void t0_c(void)
      35  {
      36    IF_DEBUG
      37      __builtin_printf ("%s\n", __FUNCTION__);
      38  
      39    const int i_limit = 11;
      40    const int var_init = 16;
      41  
      42    for (int i = 0; i < i_limit; ++i)
      43      {
      44        int result = 0;
      45        int num_gangs_actual = -1;
      46  #pragma acc parallel \
      47    reduction(max:num_gangs_actual) \
      48    reduction(max:result)
      49        /* { dg-note {variable 'var' declared in block isn't candidate for adjusting OpenACC privatization level: static} "" { target *-*-* } .-3 } */
      50        {
      51  	num_gangs_actual = 1 + __builtin_goacc_parlevel_id(GOMP_DIM_GANG);
      52  
      53  	static int var = var_init;
      54  
      55  #pragma acc atomic capture
      56  	result = ++var;
      57  
      58  	/* Irrespective of the order in which the gang-redundant threads
      59  	   execute, 'var' has now been incremented 'num_gangs_actual' times, and
      60  	   the final value captured as 'result'.  */
      61        }
      62        /* Without an explicit 'num_gangs' clause GCC assigns 'num_gangs(1)'
      63  	 because it doesn't see any use of gang-level parallelism inside the
      64  	 region.  */
      65        assert(num_gangs_actual == 1);
      66        assert(result == var_init + num_gangs_actual * (1 + i));
      67      }
      68  }
      69  
      70  
      71  /* Call a gang-level routine.  */
      72  
      73  static const int t0_r_var_init = 61;
      74  
      75  #pragma acc routine gang
      76  /* { dg-bogus "warning: region is gang partitioned but does not contain gang partitioned code" "TODO 'atomic'" { xfail *-*-* } .+4 } */
      77  /* { dg-warning "region is worker partitioned but does not contain worker partitioned code" "" { target *-*-* } .+3 } */
      78  /* { dg-warning "region is vector partitioned but does not contain vector partitioned code" "" { target *-*-* } .+2 } */
      79  __attribute__((noinline))
      80  static int t0_r_r(void)
      81  {
      82    static int var = t0_r_var_init;
      83  
      84    int tmp;
      85  #pragma acc atomic capture
      86    tmp = ++var;
      87  
      88    return tmp;
      89  }
      90  
      91  static void t0_r(void)
      92  {
      93    IF_DEBUG
      94      __builtin_printf ("%s\n", __FUNCTION__);
      95  
      96    const int i_limit = 11;
      97  
      98    for (int i = 0; i < i_limit; ++i)
      99      {
     100        int result = 0;
     101        int num_gangs_actual = -1;
     102  #pragma acc parallel \
     103    reduction(max:num_gangs_actual) \
     104    reduction(max:result)
     105        {
     106  	num_gangs_actual = 1 + __builtin_goacc_parlevel_id(GOMP_DIM_GANG);
     107  
     108  	result = t0_r_r();
     109  
     110  	/* Irrespective of the order in which the gang-redundant threads
     111  	   execute, 'var' has now been incremented 'num_gangs_actual' times, and
     112  	   the final value captured as 'result'.  */
     113        }
     114        /* The number of gangs selected by the implemention ought to but must not
     115  	 be bigger than one.  */
     116        IF_DEBUG
     117  	__builtin_printf ("%d: num_gangs_actual: %d\n", i, num_gangs_actual);
     118        assert(num_gangs_actual >= 1);
     119        assert(result == t0_r_var_init + num_gangs_actual * (1 + i));
     120      }
     121  }
     122  
     123  
     124  /* Explicit 'num_gangs'.  */
     125  
     126  static void t1_c(void)
     127  {
     128    IF_DEBUG
     129      __builtin_printf ("%s\n", __FUNCTION__);
     130  
     131    const int i_limit = 22;
     132    const int num_gangs_request = 444;
     133    const int var_init = 5;
     134  
     135    for (int i = 0; i < i_limit; ++i)
     136      {
     137        int result = 0;
     138        int num_gangs_actual = -1;
     139        /* { dg-bogus "warning: region is gang partitioned but does not contain gang partitioned code" "TODO 'atomic'" { xfail *-*-* } .+1 } */
     140  #pragma acc parallel \
     141    num_gangs(num_gangs_request) \
     142    reduction(max:num_gangs_actual) \
     143    reduction(max:result)
     144        /* { dg-note {variable 'var' declared in block isn't candidate for adjusting OpenACC privatization level: static} "" { target *-*-* } .-4 } */
     145        {
     146  	num_gangs_actual = 1 + __builtin_goacc_parlevel_id(GOMP_DIM_GANG);
     147  
     148  	static int var = var_init;
     149  
     150  #pragma acc atomic capture
     151  	result = ++var;
     152  
     153  	/* Irrespective of the order in which the gang-redundant threads
     154  	   execute, 'var' has now been incremented 'num_gangs_actual' times, and
     155  	   the final value captured as 'result'.  */
     156        }
     157        if (acc_get_device_type() == acc_device_host)
     158  	assert(num_gangs_actual == 1);
     159        else
     160  	assert(num_gangs_actual == num_gangs_request);
     161        assert(result == var_init + num_gangs_actual * (1 + i));
     162      }
     163  }
     164  
     165  
     166  /* Check the same routine called from two compute constructs.  */
     167  
     168  static const int t1_r2_var_init = 166;
     169  
     170  #pragma acc routine gang
     171  /* { dg-bogus "warning: region is gang partitioned but does not contain gang partitioned code" "TODO 'atomic'" { xfail *-*-* } .+4 } */
     172  /* { dg-warning "region is worker partitioned but does not contain worker partitioned code" "" { target *-*-* } .+3 } */
     173  /* { dg-warning "region is vector partitioned but does not contain vector partitioned code" "" { target *-*-* } .+2 } */
     174  __attribute__((noinline))
     175  static int t1_r2_r(void)
     176  {
     177    static int var = t1_r2_var_init;
     178  
     179    int tmp;
     180  #pragma acc atomic capture
     181    tmp = ++var;
     182  
     183    return tmp;
     184  }
     185  
     186  static void t1_r2(void)
     187  {
     188    IF_DEBUG
     189      __builtin_printf ("%s\n", __FUNCTION__);
     190  
     191    const int i_limit = 71;
     192    /* The checking assumes the same 'num_gangs' for all compute constructs.  */
     193    const int num_gangs_request = 333;
     194    int num_gangs_actual = -1;
     195    if (acc_get_device_type() == acc_device_host)
     196      num_gangs_actual = 1;
     197    else
     198      {
     199        /* We're assuming that the implementation is able to accomodate the
     200  	 'num_gangs' requested (which really ought to be true for
     201  	 'num_gangs').  */
     202        num_gangs_actual = num_gangs_request;
     203      }
     204  
     205    for (int i = 0; i < i_limit; ++i)
     206      {
     207        int result_1 = 0;
     208  #pragma acc parallel \
     209    num_gangs(num_gangs_request) \
     210    reduction(max:result_1)
     211        {
     212  	result_1 = t1_r2_r();
     213  
     214  	/* Irrespective of the order in which the gang-redundant threads
     215  	   execute, 'var' has now been incremented 'num_gangs_actual' times, and
     216  	   the final value captured as 'result_1'.  */
     217        }
     218        IF_DEBUG
     219  	__builtin_printf ("%d: result_1: %d\n", i, result_1);
     220        assert(result_1 == t1_r2_var_init + num_gangs_actual * (1 + (i * 3 + 0)));
     221  
     222        int result_2 = 0;
     223  #pragma acc parallel \
     224    num_gangs(num_gangs_request) \
     225    reduction(max:result_2)
     226        {
     227  	result_2 = t1_r2_r() + t1_r2_r();
     228  
     229  	/* Irrespective of the order in which the gang-redundant threads
     230  	   execute, 'var' has now been incremented '2 * num_gangs_actual' times.
     231  	   However, the order of the two 't1_r2_r' function calls is not
     232  	   synchronized (between different gang-redundant threads).  We thus
     233  	   cannot verify the actual 'result_2' values in this case.  */
     234        }
     235        IF_DEBUG
     236  	__builtin_printf ("%d: result_2: %d\n", i, result_2);
     237        if (num_gangs_actual == 1)
     238  	/* Per the rationale above, only in this case we can check the actual
     239  	   result.  */
     240  	assert(result_2 == (t1_r2_var_init + num_gangs_actual * (1 + (i * 3 + 1))
     241  			    + t1_r2_var_init + num_gangs_actual * (1 + (i * 3 + 2))));
     242        /* But we can generally check low and high limits.  */
     243        {
     244  	/* Must be bigger than '2 * result_1'.  */
     245  	int c = 2 * result_1;
     246  	IF_DEBUG
     247  	  __builtin_printf ("  > %d\n", c);
     248  	assert(result_2 > c);
     249        }
     250        {
     251  	/* ..., but limited by the base value for next 'i'.  */
     252  	int c = 2 * (t1_r2_var_init + num_gangs_actual * (0 + ((i + 1) * 3 + 0)));
     253  	IF_DEBUG
     254  	  __builtin_printf ("  < %d\n", c);
     255  	assert(result_2 < c);
     256        }
     257      }
     258  }
     259  
     260  
     261  /* Asynchronous execution.  */
     262  
     263  static const int t2_var_init_2 = -55;
     264  
     265  #pragma acc routine gang
     266  /* { dg-bogus "warning: region is gang partitioned but does not contain gang partitioned code" "TODO 'atomic'" { xfail *-*-* } .+4 } */
     267  /* { dg-warning "region is worker partitioned but does not contain worker partitioned code" "" { target *-*-* } .+3 } */
     268  /* { dg-warning "region is vector partitioned but does not contain vector partitioned code" "" { target *-*-* } .+2 } */
     269  __attribute__((noinline))
     270  static int t2_r(void)
     271  {
     272    static int var = t2_var_init_2;
     273  
     274    int tmp;
     275  #pragma acc atomic capture
     276    tmp = ++var;
     277  
     278    return tmp;
     279  }
     280  
     281  static void t2(void)
     282  {
     283    IF_DEBUG
     284      __builtin_printf ("%s\n", __FUNCTION__);
     285  
     286    const int i_limit = 12;
     287    const int num_gangs_request_1 = 14;
     288    const int var_init_1 = 5;
     289    int results_1[i_limit][num_gangs_request_1];
     290    memset (results_1, 0, sizeof results_1);
     291    const int num_gangs_request_2 = 5;
     292    int results_2[i_limit][num_gangs_request_2];
     293    memset (results_2, 0, sizeof results_2);
     294    const int num_gangs_request_3 = 34;
     295    const int var_init_3 = 1250;
     296    int results_3[i_limit][num_gangs_request_3];
     297    memset (results_3, 0, sizeof results_3);
     298  
     299  #pragma acc data \
     300    copy(results_1, results_2, results_3)
     301    /* { dg-note {variable 'num_gangs_request_1\.[0-9]+' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} "" { target { c && { ! __OPTIMIZE__ } } } .-2 } */
     302    /* { dg-note {variable 'num_gangs_request_2\.[0-9]+' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} "" { target { c && { ! __OPTIMIZE__ } } } .-3 } */
     303    /* { dg-note {variable 'num_gangs_request_3\.[0-9]+' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} "" { target { c && { ! __OPTIMIZE__ } } } .-4 } */
     304    /* { dg-note {variable 'i' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} "" { target *-*-* } .-5 } */
     305    {
     306      for (int i = 0; i < i_limit; ++i)
     307        {
     308  	/* The following 'async' clauses effect asynchronous execution, but
     309  	   using the same async-argument for each compute construct implies that
     310  	   the respective compute constructs' execution is synchronized with
     311  	   itself, meaning that all 'i = 0' execution has finished (on the
     312  	   device) before 'i = 1' is started (on the device), etc.  */
     313  
     314  	/* { dg-bogus "warning: region is gang partitioned but does not contain gang partitioned code" "TODO 'atomic'" { xfail *-*-* } .+1 } */
     315  #pragma acc parallel \
     316    present(results_1) \
     317    num_gangs(num_gangs_request_1) \
     318    async(1)
     319  	/* { dg-note {variable 'var' declared in block isn't candidate for adjusting OpenACC privatization level: static} "" { target *-*-* } .-4 } */
     320  	/* { dg-note {variable 'tmp' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} "" { target *-*-* } .-5 } */
     321  	{
     322  	  static int var = var_init_1;
     323  
     324  	  int tmp;
     325  #pragma acc atomic capture
     326  	  tmp = ++var;
     327  
     328  	  results_1[i][__builtin_goacc_parlevel_id(GOMP_DIM_GANG)] += tmp;
     329  	}
     330  
     331  #pragma acc parallel \
     332    present(results_2) \
     333    num_gangs(num_gangs_request_2) \
     334    async(2)
     335  	{
     336  	  results_2[i][__builtin_goacc_parlevel_id(GOMP_DIM_GANG)] += t2_r();
     337  	}
     338  
     339  	/* { dg-bogus "warning: region is gang partitioned but does not contain gang partitioned code" "TODO 'atomic'" { xfail *-*-* } .+1 } */
     340  #pragma acc parallel \
     341    present(results_3) \
     342    num_gangs(num_gangs_request_3) \
     343    async(3)
     344  	/* { dg-note {variable 'var' declared in block isn't candidate for adjusting OpenACC privatization level: static} "" { target *-*-* } .-4 } */
     345  	/* { dg-note {variable 'tmp' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} "" { target *-*-* } .-5 } */
     346  	{
     347  	  static int var = var_init_3;
     348  
     349  	  int tmp;
     350  #pragma acc atomic capture
     351  	  tmp = ++var;
     352  
     353  	  results_3[i][__builtin_goacc_parlevel_id(GOMP_DIM_GANG)] += tmp;
     354  	}
     355        }
     356  #pragma acc wait
     357    }
     358    int num_gangs_actual_1;
     359    int num_gangs_actual_2;
     360    int num_gangs_actual_3;
     361    if (acc_get_device_type() == acc_device_host)
     362      {
     363        num_gangs_actual_1 = 1;
     364        num_gangs_actual_2 = 1;
     365        num_gangs_actual_3 = 1;
     366      }
     367    else
     368      {
     369        /* We're assuming that the implementation is able to accomodate the
     370  	 'num_gangs' requested (which really ought to be true for
     371  	 'num_gangs').  */
     372        num_gangs_actual_1 = num_gangs_request_1;
     373        num_gangs_actual_2 = num_gangs_request_2;
     374        num_gangs_actual_3 = num_gangs_request_3;
     375      }
     376  
     377    /* For 'i = 0', 'results_*[i][0..num_gangs_actual_*]' are expected to each
     378       contain one value of '(1 + var_init_*)..(var_init_* + num_gangs_actual_*)',
     379       and so on for increasing 'i'.  Their order however is unspecified due to
     380       the gang-redundant execution.  (Thus checking that their sums match.)  */
     381  
     382    int result_1 = 0;
     383    int result_2 = 0;
     384    int result_3 = 0;
     385    for (int i = 0; i < i_limit; ++i)
     386      {
     387        int result_1_ = 0;
     388        for (int g = 0; g < num_gangs_actual_1; ++g)
     389  	{
     390  	  IF_DEBUG
     391  	    __builtin_printf ("results_1[%d][%d]: %d\n", i, g, results_1[i][g]);
     392  	  result_1_ += results_1[i][g];
     393  	}
     394        IF_DEBUG
     395  	__builtin_printf ("%d result_1_: %d\n", i, result_1_);
     396        assert (result_1_ == (((var_init_1 + num_gangs_actual_1 * (1 + i)) * (1 + var_init_1 + num_gangs_actual_1 * (1 + i)) / 2)
     397  			    - ((var_init_1 + num_gangs_actual_1 * (0 + i)) * (1 + var_init_1 + num_gangs_actual_1 * (0 + i)) / 2)));
     398        result_1 += result_1_;
     399  
     400        int result_2_ = 0;
     401        for (int g = 0; g < num_gangs_actual_2; ++g)
     402  	{
     403  	  IF_DEBUG
     404  	    __builtin_printf ("results_2[%d][%d]: %d\n", i, g, results_2[i][g]);
     405  	  result_2_ += results_2[i][g];
     406  	}
     407        IF_DEBUG
     408  	__builtin_printf ("%d result_2_: %d\n", i, result_2_);
     409        assert (result_2_ == (((t2_var_init_2 + num_gangs_actual_2 * (1 + i)) * (1 + t2_var_init_2 + num_gangs_actual_2 * (1 + i)) / 2)
     410  			    - ((t2_var_init_2 + num_gangs_actual_2 * (0 + i)) * (1 + t2_var_init_2 + num_gangs_actual_2 * (0 + i)) / 2)));
     411        result_2 += result_2_;
     412  
     413        int result_3_ = 0;
     414        for (int g = 0; g < num_gangs_actual_3; ++g)
     415  	{
     416  	  IF_DEBUG
     417  	    __builtin_printf ("results_3[%d][%d]: %d\n", i, g, results_3[i][g]);
     418  	  result_3_ += results_3[i][g];
     419  	}
     420        IF_DEBUG
     421  	__builtin_printf ("%d result_3_: %d\n", i, result_3_);
     422        assert (result_3_ == (((var_init_3 + num_gangs_actual_3 * (1 + i)) * (1 + var_init_3 + num_gangs_actual_3 * (1 + i)) / 2)
     423  			    - ((var_init_3 + num_gangs_actual_3 * (0 + i)) * (1 + var_init_3 + num_gangs_actual_3 * (0 + i)) / 2)));
     424        result_3 += result_3_;
     425      }
     426    IF_DEBUG
     427      __builtin_printf ("result_1: %d\n", result_1);
     428    assert (result_1 == (((var_init_1 + num_gangs_actual_1 * i_limit) * (1 + var_init_1 + num_gangs_actual_1 * i_limit) / 2)
     429  		       - (var_init_1 * (var_init_1 + 1) / 2)));
     430    IF_DEBUG
     431      __builtin_printf ("result_2: %d\n", result_2);
     432    assert (result_2 == (((t2_var_init_2 + num_gangs_actual_2 * i_limit) * (1 + t2_var_init_2 + num_gangs_actual_2 * i_limit) / 2)
     433  		       - (t2_var_init_2 * (t2_var_init_2 + 1) / 2)));
     434    IF_DEBUG
     435      __builtin_printf ("result_3: %d\n", result_3);
     436    assert (result_3 == (((var_init_3 + num_gangs_actual_3 * i_limit) * (1 + var_init_3 + num_gangs_actual_3 * i_limit) / 2)
     437  		       - (var_init_3 * (var_init_3 + 1) / 2)));
     438  }
     439  
     440  
     441  #pragma acc routine seq
     442  __attribute__((noinline))
     443  static int pr84991_1_r_s(int n)
     444  {
     445    static const int test[] = {1,2,3,4};
     446    return test[n];
     447  }
     448  
     449  static void pr84991_1(void)
     450  {
     451    int n[1];
     452    n[0] = 3;
     453  #pragma acc parallel copy(n)
     454    {
     455      n[0] = pr84991_1_r_s(n[0]);
     456    }
     457    assert(n[0] == 4);
     458  }
     459  
     460  
     461  static void pr84992_1(void)
     462  {
     463    int n[1];
     464    n[0] = 3;
     465  #pragma acc parallel copy(n)
     466    /* { dg-note {variable 'test' declared in block isn't candidate for adjusting OpenACC privatization level: static} "" { target *-*-* } .-1 } */
     467    {
     468      static const int test[] = {1,2,3,4};
     469      n[0] = test[n[0]];
     470    }
     471    assert(n[0] == 4);
     472  }
     473  
     474  
     475  int main(void)
     476  {
     477    t0_c();
     478  
     479    t0_r();
     480  
     481    t1_c();
     482  
     483    t1_r2();
     484  
     485    t2();
     486  
     487    pr84991_1();
     488  
     489    pr84992_1();
     490  
     491    return 0;
     492  }