1  /* { dg-do run { target openacc_nvidia_accel_selected } } */
       2  /* { dg-additional-options "-lcuda" } */
       3  /* { dg-require-effective-target openacc_cuda } */
       4  
       5  #include <stdio.h>
       6  #include <stdlib.h>
       7  #include <unistd.h>
       8  #include <openacc.h>
       9  #include <cuda.h>
      10  #include "timer.h"
      11  
      12  int
      13  main (int argc, char **argv)
      14  {
      15    CUdevice dev;
      16    CUfunction delay;
      17    CUmodule module;
      18    CUresult r;
      19    int N;
      20    int i;
      21    CUstream *streams;
      22    unsigned long *a, *d_a, dticks;
      23    int nbytes;
      24    float atime, dtime, hitime, lotime;
      25    void *kargs[2];
      26    int clkrate;
      27    int devnum, nprocs;
      28  
      29    acc_init (acc_device_nvidia);
      30  
      31    devnum = acc_get_device_num (acc_device_nvidia);
      32  
      33    r = cuDeviceGet (&dev, devnum);
      34    if (r != CUDA_SUCCESS)
      35      {
      36        fprintf (stderr, "cuDeviceGet failed: %d\n", r);
      37        abort ();
      38      }
      39  
      40    r =
      41      cuDeviceGetAttribute (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
      42  			  dev);
      43    if (r != CUDA_SUCCESS)
      44      {
      45        fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
      46        abort ();
      47      }
      48  
      49    r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
      50    if (r != CUDA_SUCCESS)
      51      {
      52        fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
      53        abort ();
      54      }
      55  
      56    r = cuModuleLoad (&module, "subr.ptx");
      57    if (r != CUDA_SUCCESS)
      58      {
      59        fprintf (stderr, "cuModuleLoad failed: %d\n", r);
      60        abort ();
      61      }
      62  
      63    r = cuModuleGetFunction (&delay, module, "delay");
      64    if (r != CUDA_SUCCESS)
      65      {
      66        fprintf (stderr, "cuModuleGetFunction failed: %d\n", r);
      67        abort ();
      68      }
      69  
      70    nbytes = nprocs * sizeof (unsigned long);
      71  
      72    dtime = 200.0;
      73  
      74    dticks = (unsigned long) (dtime * clkrate);
      75  
      76    N = nprocs;
      77  
      78    a = (unsigned long *) malloc (nbytes);
      79    d_a = (unsigned long *) acc_malloc (nbytes);
      80  
      81    acc_map_data (a, d_a, nbytes);
      82  
      83    streams = (CUstream *) malloc (N * sizeof (void *));
      84  
      85    for (i = 0; i < N; i++)
      86      {
      87        streams[i] = (CUstream) acc_get_cuda_stream (i);
      88        if (streams[i] != NULL)
      89  	abort ();
      90  
      91        r = cuStreamCreate (&streams[i], CU_STREAM_DEFAULT);
      92        if (r != CUDA_SUCCESS)
      93  	{
      94  	  fprintf (stderr, "cuStreamCreate failed: %d\n", r);
      95  	  abort ();
      96  	}
      97  
      98          if (!acc_set_cuda_stream (i, streams[i]))
      99  	  abort ();
     100      }
     101  
     102    init_timers (1);
     103  
     104    kargs[0] = (void *) &d_a;
     105    kargs[1] = (void *) &dticks;
     106  
     107    start_timer (0);
     108  
     109    for (i = 0; i < N; i++)
     110      {
     111        r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, streams[i], kargs, 0);
     112        if (r != CUDA_SUCCESS)
     113  	{
     114  	  fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
     115  	  abort ();
     116  	}
     117  
     118        acc_wait (i);
     119      }
     120  
     121    atime = stop_timer (0);
     122  
     123    hitime = dtime * N;
     124    hitime += hitime * 0.02;
     125  
     126    lotime = dtime * N;
     127    lotime -= lotime * 0.02;
     128  
     129    if (atime > hitime || atime < lotime)
     130      {
     131        fprintf (stderr, "actual time < delay time\n");
     132        abort ();
     133      }
     134  
     135    acc_unmap_data (a);
     136  
     137    fini_timers ();
     138  
     139    free (streams);
     140    free (a);
     141    acc_free (d_a);
     142  
     143    acc_shutdown (acc_device_nvidia);
     144  
     145    exit (0);
     146  }
     147  
     148  /* { dg-output "" } */