(root)/
gcc-13.2.0/
libgomp/
testsuite/
libgomp.oacc-c-c++-common/
reduction-7.c
       1  /* Tests of reduction on loop directive.  */
       2  
       3  /* { dg-additional-options "-Wopenacc-parallelism" } for testing/documenting
       4     aspects of that functionality.  */
       5  
       6  #include <assert.h>
       7  
       8  
       9  /* Test of reduction on loop directive (gangs, non-private reduction
      10     variable).  */
      11  
      12  void g_np_1()
      13  {
      14    int i, arr[1024], res = 0, hres = 0;
      15  
      16    for (i = 0; i < 1024; i++)
      17      arr[i] = i;
      18  
      19    #pragma acc parallel num_gangs(32) num_workers(32) vector_length(32)
      20    /* { dg-warning "region is worker partitioned but does not contain worker partitioned code" "" { target *-*-* } .-1 } */
      21    /* { dg-warning "region is vector partitioned but does not contain vector partitioned code" "" { target *-*-* } .-2 } */
      22    {
      23      #pragma acc loop gang reduction(+:res)
      24      for (i = 0; i < 1024; i++)
      25        res += arr[i];
      26    }
      27  
      28    for (i = 0; i < 1024; i++)
      29      hres += arr[i];
      30  
      31    assert (res == hres);
      32  
      33    res = hres = 1;
      34  
      35    #pragma acc parallel num_gangs(32) num_workers(32) vector_length(32)
      36    /* { dg-warning "region is worker partitioned but does not contain worker partitioned code" "" { target *-*-* } .-1 } */
      37    /* { dg-warning "region is vector partitioned but does not contain vector partitioned code" "" { target *-*-* } .-2 } */
      38    {
      39      #pragma acc loop gang reduction(*:res)
      40      for (i = 0; i < 12; i++)
      41        res *= arr[i];
      42    }
      43  
      44    for (i = 0; i < 12; i++)
      45      hres *= arr[i];
      46  
      47    assert (res == hres);
      48  }
      49  
      50  
      51  /* Test of reduction on loop directive (gangs and vectors, non-private
      52     reduction variable).  */
      53  
      54  void gv_np_1()
      55  {
      56    int i, arr[1024], res = 0, hres = 0;
      57  
      58    for (i = 0; i < 1024; i++)
      59      arr[i] = i;
      60  
      61    #pragma acc parallel num_gangs(32) num_workers(32) vector_length(32)
      62    /* { dg-warning "region is worker partitioned but does not contain worker partitioned code" "" { target *-*-* } .-1 } */
      63    {
      64      #pragma acc loop gang vector reduction(+:res)
      65      for (i = 0; i < 1024; i++)
      66        res += arr[i];
      67    }
      68  
      69    for (i = 0; i < 1024; i++)
      70      hres += arr[i];
      71  
      72    assert (res == hres);
      73  }
      74  
      75  
      76  /* Test of reduction on loop directive (gangs and workers, non-private
      77     reduction variable).  */
      78  
      79  void gw_np_1()
      80  {
      81    int i, arr[1024], res = 0, hres = 0;
      82  
      83    for (i = 0; i < 1024; i++)
      84      arr[i] = i;
      85  
      86    #pragma acc parallel num_gangs(32) num_workers(32) vector_length(32)
      87    /* { dg-warning "region is vector partitioned but does not contain vector partitioned code" "" { target *-*-* } .-1 } */
      88    {
      89      #pragma acc loop gang worker reduction(+:res)
      90      for (i = 0; i < 1024; i++)
      91        res += arr[i];
      92    }
      93  
      94    for (i = 0; i < 1024; i++)
      95      hres += arr[i];
      96  
      97    assert (res == hres);
      98  }
      99  
     100  
     101  /* Test of reduction on loop directive (gangs, workers and vectors, non-private
     102     reduction variable).  */
     103  
     104  void gwv_np_1()
     105  {
     106    int i, arr[1024], res = 0, hres = 0;
     107  
     108    for (i = 0; i < 1024; i++)
     109      arr[i] = i;
     110  
     111    #pragma acc parallel num_gangs(32) num_workers(32) vector_length(32)
     112    {
     113      #pragma acc loop gang worker vector reduction(+:res)
     114      for (i = 0; i < 1024; i++)
     115        res += arr[i];
     116    }
     117  
     118    for (i = 0; i < 1024; i++)
     119      hres += arr[i];
     120  
     121    assert (res == hres);
     122  }
     123  
     124  
     125  /* Test of reduction on loop directive (gangs, workers and vectors, non-private
     126     reduction variable: separate gang and worker/vector loops).  */
     127  
     128  void gwv_np_2()
     129  {
     130    int i, j, arr[32768], res = 0, hres = 0;
     131  
     132    for (i = 0; i < 32768; i++)
     133      arr[i] = i;
     134  
     135    #pragma acc parallel num_gangs(32) num_workers(32) vector_length(32)
     136    {
     137      #pragma acc loop gang reduction(+:res)
     138      for (j = 0; j < 32; j++)
     139        {
     140          #pragma acc loop worker vector reduction(+:res)
     141          for (i = 0; i < 1024; i++)
     142  	  res += arr[j * 1024 + i];
     143        }
     144      /* "res" is non-private, and is not available until after the parallel
     145         region.  */
     146    }
     147  
     148    for (i = 0; i < 32768; i++)
     149      hres += arr[i];
     150  
     151    assert (res == hres);
     152  }
     153  
     154  
     155  /* Test of reduction on loop directive (gangs, workers and vectors, non-private
     156     reduction variable: separate gang and worker/vector loops).  */
     157  
     158  void gwv_np_3()
     159  {
     160    int i, j;
     161    double arr[32768], res = 0, hres = 0;
     162  
     163    for (i = 0; i < 32768; i++)
     164      arr[i] = i;
     165  
     166    #pragma acc parallel num_gangs(32) num_workers(32) vector_length(32) \
     167  		       copyin(arr)
     168    {
     169      #pragma acc loop gang reduction(+:res)
     170      for (j = 0; j < 32; j++)
     171        {
     172          #pragma acc loop worker vector reduction(+:res)
     173          for (i = 0; i < 1024; i++)
     174  	  res += arr[j * 1024 + i];
     175        }
     176    }
     177  
     178    for (i = 0; i < 32768; i++)
     179      hres += arr[i];
     180  
     181    assert (res == hres);
     182  }
     183  
     184  #if ACC_DEVICE_TYPE_nvidia
     185  /* To avoid 'libgomp: The Nvidia accelerator has insufficient resources'.  */
     186  #define NUM_WORKERS 28
     187  #else
     188  #define NUM_WORKERS 32
     189  #endif
     190  
     191  /* Test of reduction on loop directive (gangs, workers and vectors, multiple
     192     non-private reduction variables, float type).  */
     193  
     194  void gwv_np_4()
     195  {
     196    int i, j;
     197    float arr[32768];
     198    float res = 0, mres = 0, hres = 0, hmres = 0;
     199  
     200    for (i = 0; i < 32768; i++)
     201      arr[i] = i % (32768 / 64);
     202  
     203    #pragma acc parallel num_gangs(32) num_workers(NUM_WORKERS) vector_length(32)
     204    {
     205      #pragma acc loop gang reduction(+:res) reduction(max:mres)
     206      for (j = 0; j < 32; j++)
     207        {
     208  	#pragma acc loop worker vector reduction(+:res) reduction(max:mres)
     209  	for (i = 0; i < 1024; i++)
     210  	  {
     211  	    res += arr[j * 1024 + i];
     212  	    if (arr[j * 1024 + i] > mres)
     213  	      mres = arr[j * 1024 + i];
     214  	  }
     215  
     216  	#pragma acc loop worker vector reduction(+:res) reduction(max:mres)
     217  	for (i = 0; i < 1024; i++)
     218  	  {
     219  	    res += arr[j * 1024 + (1023 - i)];
     220  	    if (arr[j * 1024 + (1023 - i)] > mres)
     221  	      mres = arr[j * 1024 + (1023 - i)];
     222  	  }
     223        }
     224    }
     225  
     226    for (j = 0; j < 32; j++)
     227      for (i = 0; i < 1024; i++)
     228        {
     229          hres += arr[j * 1024 + i];
     230  	hres += arr[j * 1024 + (1023 - i)];
     231  	if (arr[j * 1024 + i] > hmres)
     232  	  hmres = arr[j * 1024 + i];
     233  	if (arr[j * 1024 + (1023 - i)] > hmres)
     234  	  hmres = arr[j * 1024 + (1023 - i)];
     235        }
     236  
     237    assert (hres <= 16777216);
     238    assert (res == hres);
     239  
     240    assert (hmres <= 16777216);
     241    assert (mres == hmres);
     242  }
     243  
     244  #undef NUM_WORKERS
     245  
     246  /* Test of reduction on loop directive (vectors, private reduction
     247     variable).  */
     248  
     249  void v_p_1()
     250  {
     251    int i, j, arr[1024], out[32], res = 0, hres = 0;
     252  
     253    for (i = 0; i < 1024; i++)
     254      arr[i] = i;
     255  
     256    #pragma acc parallel num_gangs(32) num_workers(32) vector_length(32) \
     257  		       private(res) copyout(out)
     258    /* { dg-warning "region is worker partitioned but does not contain worker partitioned code" "" { target *-*-* } .-2 } */
     259    {
     260      #pragma acc loop gang
     261      for (j = 0; j < 32; j++)
     262        {
     263          res = 0;
     264  
     265  	#pragma acc loop vector reduction(+:res)
     266  	for (i = 0; i < 32; i++)
     267  	  res += arr[j * 32 + i];
     268  
     269  	out[j] = res;
     270        }
     271    }
     272  
     273    for (j = 0; j < 32; j++)
     274      {
     275        hres = 0;
     276  
     277        for (i = 0; i < 32; i++)
     278  	hres += arr[j * 32 + i];
     279  
     280        assert (out[j] == hres);
     281      }
     282  }
     283  
     284  
     285  /* Test of reduction on loop directive (vector reduction in
     286     gang-partitioned/worker-partitioned mode, private reduction variable).  */
     287  
     288  void v_p_2()
     289  {
     290    int i, j, k;
     291    double ina[1024], inb[1024], out[1024], acc;
     292  
     293    for (j = 0; j < 32; j++)
     294      for (i = 0; i < 32; i++)
     295        {
     296          ina[j * 32 + i] = (i == j) ? 2.0 : 0.0;
     297  	inb[j * 32 + i] = (double) (i + j);
     298        }
     299  
     300    #pragma acc parallel num_gangs(32) num_workers(32) vector_length(32) \
     301  		       private(acc) copyin(ina, inb) copyout(out)
     302    {
     303      #pragma acc loop gang worker
     304      for (k = 0; k < 32; k++)
     305        for (j = 0; j < 32; j++)
     306          {
     307  	  acc = 0;
     308  
     309  	  #pragma acc loop vector reduction(+:acc)
     310  	  for (i = 0; i < 32; i++)
     311  	    acc += ina[k * 32 + i] * inb[i * 32 + j];
     312  
     313  	  out[k * 32 + j] = acc;
     314  	}
     315    }
     316  
     317    for (j = 0; j < 32; j++)
     318      for (i = 0; i < 32; i++)
     319        assert (out[j * 32 + i] == (i + j) * 2);
     320  }
     321  
     322  
     323  /* Test of reduction on loop directive (workers, private reduction
     324     variable).  */
     325  
     326  void w_p_1()
     327  {
     328    int i, j, arr[1024], out[32], res = 0, hres = 0;
     329  
     330    for (i = 0; i < 1024; i++)
     331      arr[i] = i;
     332  
     333    #pragma acc parallel num_gangs(32) num_workers(32) vector_length(32) \
     334  		       private(res) copyout(out)
     335    /* { dg-warning "region is vector partitioned but does not contain vector partitioned code" "" { target *-*-* } .-2 } */
     336    {
     337      #pragma acc loop gang
     338      for (j = 0; j < 32; j++)
     339        {
     340          res = 0;
     341  
     342  	#pragma acc loop worker reduction(+:res)
     343  	for (i = 0; i < 32; i++)
     344  	  res += arr[j * 32 + i];
     345  
     346  	out[j] = res;
     347        }
     348    }
     349  
     350    for (j = 0; j < 32; j++)
     351      {
     352        hres = 0;
     353  
     354        for (i = 0; i < 32; i++)
     355  	hres += arr[j * 32 + i];
     356  
     357        assert (out[j] == hres);
     358      }
     359  }
     360  
     361  
     362  /* Test of reduction on loop directive (workers and vectors, private reduction
     363     variable).  */
     364  
     365  void wv_p_1()
     366  {
     367    int i, j, arr[1024], out[32], res = 0, hres = 0;
     368  
     369    for (i = 0; i < 1024; i++)
     370      arr[i] = i;
     371  
     372    #pragma acc parallel num_gangs(32) num_workers(32) vector_length(32) \
     373  		       private(res) copyout(out)
     374    {
     375      #pragma acc loop gang
     376      for (j = 0; j < 32; j++)
     377        {
     378          res = 0;
     379  
     380  	#pragma acc loop worker vector reduction(+:res)
     381  	for (i = 0; i < 32; i++)
     382  	  res += arr[j * 32 + i];
     383  
     384  	out[j] = res;
     385        }
     386    }
     387  
     388    for (j = 0; j < 32; j++)
     389      {
     390        hres = 0;
     391  
     392        for (i = 0; i < 32; i++)
     393  	hres += arr[j * 32 + i];
     394  
     395        assert (out[j] == hres);
     396      }
     397  }
     398  
     399  
     400  /* Test of reduction on loop directive (workers and vectors, private reduction
     401     variable).  */
     402  
     403  void wv_p_2()
     404  {
     405    int i, j, arr[32768], out[32], res = 0, hres = 0;
     406  
     407    for (i = 0; i < 32768; i++)
     408      arr[i] = i;
     409  
     410    #pragma acc parallel num_gangs(32) num_workers(32) vector_length(32) \
     411  		       private(res) copyout(out)
     412    {
     413      #pragma acc loop gang
     414      for (j = 0; j < 32; j++)
     415        {
     416          res = j;
     417  
     418  	#pragma acc loop worker reduction(+:res)
     419  	for (i = 0; i < 1024; i++)
     420  	  res += arr[j * 1024 + i];
     421  
     422  	#pragma acc loop vector reduction(+:res)
     423  	for (i = 1023; i >= 0; i--)
     424  	  res += arr[j * 1024 + i];
     425  
     426  	out[j] = res;
     427        }
     428    }
     429  
     430    for (j = 0; j < 32; j++)
     431      {
     432        hres = j;
     433  
     434        for (i = 0; i < 1024; i++)
     435  	hres += arr[j * 1024 + i] * 2;
     436  
     437        assert (out[j] == hres);
     438      }
     439  }
     440  
     441  
     442  /* Test of reduction on loop directive (workers and vectors, private reduction
     443     variable: gang-redundant mode).  */
     444  
     445  void wv_p_3()
     446  {
     447    int i, arr[1024], out[32], res = 0, hres = 0;
     448  
     449    for (i = 0; i < 1024; i++)
     450      arr[i] = i ^ 33;
     451  
     452    #pragma acc parallel num_gangs(32) num_workers(32) vector_length(32) \
     453  		       private(res) copyin(arr) copyout(out)
     454    {
     455      /* Private variables aren't initialized by default in openacc.  */
     456      res = 0;
     457  
     458      /* "res" should be available at the end of the following loop (and should
     459         have the same value redundantly in each gang).  */
     460      #pragma acc loop worker vector reduction(+:res)
     461      for (i = 0; i < 1024; i++)
     462        res += arr[i];
     463  
     464      #pragma acc loop gang (static: 1)
     465      for (i = 0; i < 32; i++)
     466        out[i] = res;
     467    }
     468  
     469    for (i = 0; i < 1024; i++)
     470      hres += arr[i];
     471  
     472    for (i = 0; i < 32; i++)
     473      assert (out[i] == hres);
     474  }
     475  
     476  
     477  int main()
     478  {
     479    g_np_1();
     480    gv_np_1();
     481    gw_np_1();
     482    gwv_np_1();
     483    gwv_np_2();
     484    gwv_np_3();
     485    gwv_np_4();
     486    v_p_1();
     487    v_p_2();
     488    w_p_1();
     489    wv_p_1();
     490    wv_p_2();
     491    wv_p_3();
     492  
     493    return 0;
     494  }