1  /* Plugin for NVPTX execution.
       2  
       3     Copyright (C) 2013-2023 Free Software Foundation, Inc.
       4  
       5     Contributed by Mentor Embedded.
       6  
       7     This file is part of the GNU Offloading and Multi Processing Library
       8     (libgomp).
       9  
      10     Libgomp is free software; you can redistribute it and/or modify it
      11     under the terms of the GNU General Public License as published by
      12     the Free Software Foundation; either version 3, or (at your option)
      13     any later version.
      14  
      15     Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
      16     WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
      17     FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
      18     more details.
      19  
      20     Under Section 7 of GPL version 3, you are granted additional
      21     permissions described in the GCC Runtime Library Exception, version
      22     3.1, as published by the Free Software Foundation.
      23  
      24     You should have received a copy of the GNU General Public License and
      25     a copy of the GCC Runtime Library Exception along with this program;
      26     see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
      27     <http://www.gnu.org/licenses/>.  */
      28  
/* Nvidia PTX-specific parts of OpenACC support.  The CUDA driver
   library appears to hold some implicit state, but the documentation
   is not clear as to what that state might be, or how one might
   propagate it from one thread to another.  */
      33  
      34  #define _GNU_SOURCE
      35  #include "openacc.h"
      36  #include "config.h"
      37  #include "symcat.h"
      38  #include "libgomp-plugin.h"
      39  #include "oacc-plugin.h"
      40  #include "gomp-constants.h"
      41  #include "oacc-int.h"
      42  
      43  /* For struct rev_offload + GOMP_REV_OFFLOAD_VAR. */
      44  #include "config/nvptx/libgomp-nvptx.h"
      45  
      46  #include <pthread.h>
      47  #ifndef PLUGIN_NVPTX_INCLUDE_SYSTEM_CUDA_H
      48  # include "cuda/cuda.h"
      49  #else
      50  # include <cuda.h>
      51  #endif
      52  #include <stdbool.h>
      53  #include <limits.h>
      54  #include <string.h>
      55  #include <stdio.h>
      56  #include <unistd.h>
      57  #include <assert.h>
      58  #include <errno.h>
      59  
      60  /* An arbitrary fixed limit (128MB) for the size of the OpenMP soft stacks
      61     block to cache between kernel invocations.  For soft-stacks blocks bigger
      62     than this, we will free the block before attempting another GPU memory
      63     allocation (i.e. in GOMP_OFFLOAD_alloc).  Otherwise, if an allocation fails,
      64     we will free the cached soft-stacks block anyway then retry the
      65     allocation.  If that fails too, we lose.  */
      66  
      67  #define SOFTSTACK_CACHE_LIMIT 134217728
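/* 134217728 bytes == 128 * 1024 * 1024, i.e. the 128MB limit mentioned
   above.  */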
      68  
      69  #if CUDA_VERSION < 6000
      70  extern CUresult cuGetErrorString (CUresult, const char **);
      71  #define CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR 82
      72  #endif
      73  
      74  #if CUDA_VERSION >= 6050
      75  #undef cuLinkCreate
      76  #undef cuLinkAddData
      77  CUresult cuLinkAddData (CUlinkState, CUjitInputType, void *, size_t,
      78  			const char *, unsigned, CUjit_option *, void **);
      79  CUresult cuLinkCreate (unsigned, CUjit_option *, void **, CUlinkState *);
      80  #else
      81  typedef size_t (*CUoccupancyB2DSize)(int);
      82  CUresult cuLinkAddData_v2 (CUlinkState, CUjitInputType, void *, size_t,
      83  			   const char *, unsigned, CUjit_option *, void **);
      84  CUresult cuLinkCreate_v2 (unsigned, CUjit_option *, void **, CUlinkState *);
      85  CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction,
      86  					  CUoccupancyB2DSize, size_t, int);
      87  #endif
      88  
      89  #define DO_PRAGMA(x) _Pragma (#x)
      90  
      91  #ifndef PLUGIN_NVPTX_LINK_LIBCUDA
      92  # include <dlfcn.h>
      93  
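/* Table of pointers to the CUDA driver entry points listed in cuda-lib.def.
   It is filled in by init_cuda_lib below via dlopen/dlsym when the plugin is
   not linked against libcuda directly.  */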
      94  struct cuda_lib_s {
      95  
      96  # define CUDA_ONE_CALL(call)			\
      97    __typeof (call) *call;
      98  # define CUDA_ONE_CALL_MAYBE_NULL(call)		\
      99    CUDA_ONE_CALL (call)
     100  #include "cuda-lib.def"
     101  # undef CUDA_ONE_CALL
     102  # undef CUDA_ONE_CALL_MAYBE_NULL
     103  
     104  } cuda_lib;
     105  
     106  /* -1 if init_cuda_lib has not been called yet, false
     107     if it has been and failed, true if it has been and succeeded.  */
     108  static signed char cuda_lib_inited = -1;
     109  
/* Dynamically load the CUDA driver library (libcuda.so.1) and initialize
   its function pointers; return true if successful, false otherwise.  */
     112  static bool
     113  init_cuda_lib (void)
     114  {
     115    if (cuda_lib_inited != -1)
     116      return cuda_lib_inited;
     117    const char *cuda_runtime_lib = "libcuda.so.1";
     118    void *h = dlopen (cuda_runtime_lib, RTLD_LAZY);
     119    cuda_lib_inited = false;
     120    if (h == NULL)
     121      return false;
     122  
     123  # define CUDA_ONE_CALL(call) CUDA_ONE_CALL_1 (call, false)
     124  # define CUDA_ONE_CALL_MAYBE_NULL(call) CUDA_ONE_CALL_1 (call, true)
     125  # define CUDA_ONE_CALL_1(call, allow_null)		\
     126    cuda_lib.call = dlsym (h, #call);	\
     127    if (!allow_null && cuda_lib.call == NULL)		\
     128      return false;
     129  #include "cuda-lib.def"
     130  # undef CUDA_ONE_CALL
     131  # undef CUDA_ONE_CALL_1
     132  # undef CUDA_ONE_CALL_MAYBE_NULL
     133  
     134    cuda_lib_inited = true;
     135    return true;
     136  }
     137  # define CUDA_CALL_PREFIX cuda_lib.
     138  #else
     139  
     140  # define CUDA_ONE_CALL(call)
     141  # define CUDA_ONE_CALL_MAYBE_NULL(call) DO_PRAGMA (weak call)
     142  #include "cuda-lib.def"
     143  #undef CUDA_ONE_CALL_MAYBE_NULL
     144  #undef CUDA_ONE_CALL
     145  
     146  # define CUDA_CALL_PREFIX
     147  # define init_cuda_lib() true
     148  #endif
     149  
     150  #include "secure_getenv.h"
     151  
     152  #undef MIN
     153  #undef MAX
     154  #define MIN(X,Y) ((X) < (Y) ? (X) : (Y))
     155  #define MAX(X,Y) ((X) > (Y) ? (X) : (Y))
     156  
/* Convenience macros for the frequently used sequence of a CUDA library
   call plus error handling, as well as for CUDA library calls that do the
   error checking themselves or do not do it at all.  */
     160  
     161  #define CUDA_CALL_ERET(ERET, FN, ...)		\
     162    do {						\
     163      unsigned __r				\
     164        = CUDA_CALL_PREFIX FN (__VA_ARGS__);	\
     165      if (__r != CUDA_SUCCESS)			\
     166        {						\
     167  	GOMP_PLUGIN_error (#FN " error: %s",	\
     168  			   cuda_error (__r));	\
     169  	return ERET;				\
     170        }						\
     171    } while (0)
     172  
     173  #define CUDA_CALL(FN, ...)			\
     174    CUDA_CALL_ERET (false, FN, __VA_ARGS__)
     175  
     176  #define CUDA_CALL_ASSERT(FN, ...)		\
     177    do {						\
     178      unsigned __r				\
     179        = CUDA_CALL_PREFIX FN (__VA_ARGS__);	\
     180      if (__r != CUDA_SUCCESS)			\
     181        {						\
     182  	GOMP_PLUGIN_fatal (#FN " error: %s",	\
     183  			   cuda_error (__r));	\
     184        }						\
     185    } while (0)
     186  
     187  #define CUDA_CALL_NOCHECK(FN, ...)		\
     188    CUDA_CALL_PREFIX FN (__VA_ARGS__)
     189  
     190  #define CUDA_CALL_EXISTS(FN)			\
     191    CUDA_CALL_PREFIX FN
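
/* Illustrative uses of the macros above, mirroring calls made later in this
   file:

     CUDA_CALL (cuMemFree, ptr);                  // error -> return false
     CUDA_CALL_ERET (NULL, cuDeviceGet, &dev, n); // error -> return NULL
     CUDA_CALL_ASSERT (cuLaunchKernel, ...);      // error -> fatal
     CUDA_CALL_NOCHECK (cuInit, 0);               // caller checks CUresult
     if (CUDA_CALL_EXISTS (cuGetErrorString)) ... // optional entry point  */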
     192  
     193  static const char *
     194  cuda_error (CUresult r)
     195  {
     196    const char *fallback = "unknown cuda error";
     197    const char *desc;
     198  
     199    if (!CUDA_CALL_EXISTS (cuGetErrorString))
     200      return fallback;
     201  
     202    r = CUDA_CALL_NOCHECK (cuGetErrorString, r, &desc);
     203    if (r == CUDA_SUCCESS)
     204      return desc;
     205  
     206    return fallback;
     207  }
     208  
/* Version of the CUDA driver in the same MAJOR.MINOR format that is used by
   Nvidia, such as in the 'deviceQuery' program (Nvidia's CUDA samples).  */
     211  static char cuda_driver_version_s[30];
     212  
     213  static unsigned int instantiated_devices = 0;
     214  static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER;
     215  
     216  /* NVPTX/CUDA specific definition of asynchronous queues.  */
     217  struct goacc_asyncqueue
     218  {
     219    CUstream cuda_stream;
     220  };
     221  
     222  struct nvptx_callback
     223  {
     224    void (*fn) (void *);
     225    void *ptr;
     226    struct goacc_asyncqueue *aq;
     227    struct nvptx_callback *next;
     228  };
     229  
     230  /* Thread-specific data for PTX.  */
     231  
     232  struct nvptx_thread
     233  {
     234    /* We currently have this embedded inside the plugin because libgomp manages
     235       devices through integer target_ids.  This might be better if using an
     236       opaque target-specific pointer directly from gomp_device_descr.  */
     237    struct ptx_device *ptx_dev;
     238  };
     239  
     240  /* Target data function launch information.  */
     241  
     242  struct targ_fn_launch
     243  {
     244    const char *fn;
     245    unsigned short dim[GOMP_DIM_MAX];
     246  };
     247  
     248  /* Target PTX object information.  */
     249  
     250  struct targ_ptx_obj
     251  {
     252    const char *code;
     253    size_t size;
     254  };
     255  
     256  /* Target data image information.  */
     257  
     258  typedef struct nvptx_tdata
     259  {
     260    const struct targ_ptx_obj *ptx_objs;
     261    unsigned ptx_num;
     262  
     263    const char *const *var_names;
     264    unsigned var_num;
     265  
     266    const struct targ_fn_launch *fn_descs;
     267    unsigned fn_num;
     268  } nvptx_tdata_t;
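
/* This layout matches the descriptor that the mkoffload utility emits at the
   start of each offload image (see GOMP_OFFLOAD_load_image below).  */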
     269  
     270  /* Descriptor of a loaded function.  */
     271  
     272  struct targ_fn_descriptor
     273  {
     274    CUfunction fn;
     275    const struct targ_fn_launch *launch;
     276    int regs_per_thread;
     277    int max_threads_per_block;
     278  };
     279  
     280  /* A loaded PTX image.  */
     281  struct ptx_image_data
     282  {
     283    const void *target_data;
     284    CUmodule module;
     285  
     286    struct targ_fn_descriptor *fns;  /* Array of functions.  */
     287    
     288    struct ptx_image_data *next;
     289  };
     290  
     291  struct ptx_free_block
     292  {
     293    void *ptr;
     294    struct ptx_free_block *next;
     295  };
     296  
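/* Per-device state.  Most of the scalar fields cache CU_DEVICE_ATTRIBUTE_*
   values queried in nvptx_open_device; 'images' and 'free_blocks' are linked
   lists protected by their respective locks.  */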
     297  struct ptx_device
     298  {
     299    CUcontext ctx;
     300    bool ctx_shared;
     301    CUdevice dev;
     302  
     303    int ord;
     304    bool overlap;
     305    bool map;
     306    bool concur;
     307    bool mkern;
     308    int mode;
     309    int clock_khz;
     310    int num_sms;
     311    int regs_per_block;
     312    int regs_per_sm;
     313    int warp_size;
     314    int max_threads_per_block;
     315    int max_threads_per_multiprocessor;
     316    int default_dims[GOMP_DIM_MAX];
     317  
     318    /* Length as used by the CUDA Runtime API ('struct cudaDeviceProp').  */
     319    char name[256];
     320  
     321    struct ptx_image_data *images;  /* Images loaded on device.  */
     322    pthread_mutex_t image_lock;     /* Lock for above list.  */
     323  
     324    struct ptx_free_block *free_blocks;
     325    pthread_mutex_t free_blocks_lock;
     326  
     327    /* OpenMP stacks, cached between kernel invocations.  */
     328    struct
     329      {
     330        CUdeviceptr ptr;
     331        size_t size;
     332        pthread_mutex_t lock;
     333      } omp_stacks;
     334  
     335    struct rev_offload *rev_data;
     336    struct ptx_device *next;
     337  };
     338  
     339  static struct ptx_device **ptx_devices;
     340  
     341  static inline struct nvptx_thread *
     342  nvptx_thread (void)
     343  {
     344    return (struct nvptx_thread *) GOMP_PLUGIN_acc_thread ();
     345  }
     346  
/* Initialize the CUDA library and the PTX device array.  Return TRUE on
   success, else FALSE.  PTX_DEV_LOCK should be locked on entry and remains
   locked on exit.  */
     349  
     350  static bool
     351  nvptx_init (void)
     352  {
     353    int ndevs;
     354  
     355    if (instantiated_devices != 0)
     356      return true;
     357  
     358    if (!init_cuda_lib ())
     359      return false;
     360  
     361    CUDA_CALL (cuInit, 0);
     362  
     363    int cuda_driver_version;
     364    CUDA_CALL_ERET (NULL, cuDriverGetVersion, &cuda_driver_version);
     365    snprintf (cuda_driver_version_s, sizeof cuda_driver_version_s,
     366  	    "CUDA Driver %u.%u",
     367  	    cuda_driver_version / 1000, cuda_driver_version % 1000 / 10);
     368  
     369    CUDA_CALL (cuDeviceGetCount, &ndevs);
     370    ptx_devices = GOMP_PLUGIN_malloc_cleared (sizeof (struct ptx_device *)
     371  					    * ndevs);
     372  
     373    return true;
     374  }
     375  
/* Select the N'th PTX device for the current host thread.  The device must
   have been opened before calling this function.  */
     378  
     379  static bool
     380  nvptx_attach_host_thread_to_device (int n)
     381  {
     382    CUdevice dev;
     383    CUresult r;
     384    struct ptx_device *ptx_dev;
     385    CUcontext thd_ctx;
     386  
     387    r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &dev);
     388    if (r == CUDA_ERROR_NOT_PERMITTED)
     389      {
     390        /* Assume we're in a CUDA callback, just return true.  */
     391        return true;
     392      }
     393    if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
     394      {
     395        GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
     396        return false;
     397      }
     398  
     399    if (r != CUDA_ERROR_INVALID_CONTEXT && dev == n)
     400      return true;
     401    else
     402      {
     403        CUcontext old_ctx;
     404  
     405        ptx_dev = ptx_devices[n];
     406        if (!ptx_dev)
     407  	{
     408  	  GOMP_PLUGIN_error ("device %d not found", n);
     409  	  return false;
     410  	}
     411  
     412        CUDA_CALL (cuCtxGetCurrent, &thd_ctx);
     413  
      /* We don't necessarily have a current context (e.g. if it has been
	 destroyed).  Pop it if we do, though.  */
     416        if (thd_ctx != NULL)
     417  	CUDA_CALL (cuCtxPopCurrent, &old_ctx);
     418  
     419        CUDA_CALL (cuCtxPushCurrent, ptx_dev->ctx);
     420      }
     421    return true;
     422  }
     423  
     424  static struct ptx_device *
     425  nvptx_open_device (int n)
     426  {
     427    struct ptx_device *ptx_dev;
     428    CUdevice dev, ctx_dev;
     429    CUresult r;
     430    int pi;
     431  
     432    CUDA_CALL_ERET (NULL, cuDeviceGet, &dev, n);
     433  
     434    ptx_dev = GOMP_PLUGIN_malloc (sizeof (struct ptx_device));
     435  
     436    ptx_dev->ord = n;
     437    ptx_dev->dev = dev;
     438    ptx_dev->ctx_shared = false;
     439  
     440    r = CUDA_CALL_NOCHECK (cuCtxGetDevice, &ctx_dev);
     441    if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
     442      {
     443        GOMP_PLUGIN_error ("cuCtxGetDevice error: %s", cuda_error (r));
     444        return NULL;
     445      }
     446    
     447    if (r != CUDA_ERROR_INVALID_CONTEXT && ctx_dev != dev)
     448      {
     449        /* The current host thread has an active context for a different device.
     450           Detach it.  */
     451        CUcontext old_ctx;
     452        CUDA_CALL_ERET (NULL, cuCtxPopCurrent, &old_ctx);
     453      }
     454  
     455    CUDA_CALL_ERET (NULL, cuCtxGetCurrent, &ptx_dev->ctx);
     456  
     457    if (!ptx_dev->ctx)
     458      CUDA_CALL_ERET (NULL, cuCtxCreate, &ptx_dev->ctx, CU_CTX_SCHED_AUTO, dev);
     459    else
     460      ptx_dev->ctx_shared = true;
     461  
     462    CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
     463  		  &pi, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);
     464    ptx_dev->overlap = pi;
     465  
     466    CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
     467  		  &pi, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
     468    ptx_dev->map = pi;
     469  
     470    CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
     471  		  &pi, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
     472    ptx_dev->concur = pi;
     473  
     474    CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
     475  		  &pi, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
     476    ptx_dev->mode = pi;
     477  
     478    CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
     479  		  &pi, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
     480    ptx_dev->mkern = pi;
     481  
     482    CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
     483  		  &pi, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
     484    ptx_dev->clock_khz = pi;
     485  
     486    CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
     487  		  &pi, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);
     488    ptx_dev->num_sms = pi;
     489  
     490    CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
     491  		  &pi, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, dev);
     492    ptx_dev->regs_per_block = pi;
     493  
     494    /* CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR is defined only
     495       in CUDA 6.0 and newer.  */
     496    r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &pi,
     497  			 CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR,
     498  			 dev);
     499    /* Fallback: use limit of registers per block, which is usually equal.  */
     500    if (r == CUDA_ERROR_INVALID_VALUE)
     501      pi = ptx_dev->regs_per_block;
     502    else if (r != CUDA_SUCCESS)
     503      {
     504        GOMP_PLUGIN_error ("cuDeviceGetAttribute error: %s", cuda_error (r));
     505        return NULL;
     506      }
     507    ptx_dev->regs_per_sm = pi;
     508  
     509    CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
     510  		  &pi, CU_DEVICE_ATTRIBUTE_WARP_SIZE, dev);
     511    if (pi != 32)
     512      {
     513        GOMP_PLUGIN_error ("Only warp size 32 is supported");
     514        return NULL;
     515      }
     516    ptx_dev->warp_size = pi;
     517  
     518    CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, &pi,
     519  		  CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev);
     520    ptx_dev->max_threads_per_block = pi;
     521  
     522    CUDA_CALL_ERET (NULL, cuDeviceGetAttribute, &pi,
     523  		  CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, dev);
     524    ptx_dev->max_threads_per_multiprocessor = pi;
     525  
  /* Required below for reverse offload as implemented, but with compute
     capability >= 2.0 and 64-bit device processes, this should universally be
     the case; hence, an assert.  */
     529    r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &pi,
     530  			 CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, dev);
     531    assert (r == CUDA_SUCCESS && pi);
     532  
     533    for (int i = 0; i != GOMP_DIM_MAX; i++)
     534      ptx_dev->default_dims[i] = 0;
     535  
     536    CUDA_CALL_ERET (NULL, cuDeviceGetName, ptx_dev->name, sizeof ptx_dev->name,
     537  		  dev);
     538  
     539    ptx_dev->images = NULL;
     540    pthread_mutex_init (&ptx_dev->image_lock, NULL);
     541  
     542    ptx_dev->free_blocks = NULL;
     543    pthread_mutex_init (&ptx_dev->free_blocks_lock, NULL);
     544  
     545    ptx_dev->omp_stacks.ptr = 0;
     546    ptx_dev->omp_stacks.size = 0;
     547    pthread_mutex_init (&ptx_dev->omp_stacks.lock, NULL);
     548  
     549    ptx_dev->rev_data = NULL;
     550  
     551    return ptx_dev;
     552  }
     553  
     554  static bool
     555  nvptx_close_device (struct ptx_device *ptx_dev)
     556  {
     557    if (!ptx_dev)
     558      return true;
     559  
     560    for (struct ptx_free_block *b = ptx_dev->free_blocks; b;)
     561      {
     562        struct ptx_free_block *b_next = b->next;
     563        CUDA_CALL (cuMemFree, (CUdeviceptr) b->ptr);
     564        free (b);
     565        b = b_next;
     566      }
     567  
     568    pthread_mutex_destroy (&ptx_dev->free_blocks_lock);
     569    pthread_mutex_destroy (&ptx_dev->image_lock);
     570  
     571    pthread_mutex_destroy (&ptx_dev->omp_stacks.lock);
     572  
     573    if (ptx_dev->omp_stacks.ptr)
     574      CUDA_CALL (cuMemFree, ptx_dev->omp_stacks.ptr);
     575  
     576    if (!ptx_dev->ctx_shared)
     577      CUDA_CALL (cuCtxDestroy, ptx_dev->ctx);
     578  
     579    free (ptx_dev);
     580    return true;
     581  }
     582  
     583  static int
     584  nvptx_get_num_devices (void)
     585  {
     586    int n;
     587  
     588    /* This function will be called before the plugin has been initialized in
     589       order to enumerate available devices, but CUDA API routines can't be used
     590       until cuInit has been called.  Just call it now (but don't yet do any
     591       further initialization).  */
     592    if (instantiated_devices == 0)
     593      {
     594        if (!init_cuda_lib ())
     595  	return 0;
     596        CUresult r = CUDA_CALL_NOCHECK (cuInit, 0);
     597        /* This is not an error: e.g. we may have CUDA libraries installed but
     598           no devices available.  */
     599        if (r != CUDA_SUCCESS)
     600  	{
     601  	  GOMP_PLUGIN_debug (0, "Disabling nvptx offloading; cuInit: %s\n",
     602  			     cuda_error (r));
     603  	  return 0;
     604  	}
     605      }
     606  
     607    CUDA_CALL_ERET (-1, cuDeviceGetCount, &n);
     608    return n;
     609  }
     610  
     611  static void
     612  notify_var (const char *var_name, const char *env_var)
     613  {
     614    if (env_var == NULL)
     615      GOMP_PLUGIN_debug (0, "%s: <Not defined>\n", var_name);
     616    else
     617      GOMP_PLUGIN_debug (0, "%s: '%s'\n", var_name, env_var);
     618  }
     619  
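/* Parse the GOMP_NVPTX_JIT environment variable and store the requested PTX
   JIT optimization level in *GOMP_NVPTX_O.  The accepted form is a
   space-separated list of options, of which only '-O0' ... '-O4' is currently
   recognized, e.g. GOMP_NVPTX_JIT=-O3.  *GOMP_NVPTX_O is left untouched if
   the variable is not set.  */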
     620  static void
     621  process_GOMP_NVPTX_JIT (intptr_t *gomp_nvptx_o)
     622  {
     623    const char *var_name = "GOMP_NVPTX_JIT";
     624    const char *env_var = secure_getenv (var_name);
     625    notify_var (var_name, env_var);
     626  
     627    if (env_var == NULL)
     628      return;
     629  
     630    const char *c = env_var;
     631    while (*c != '\0')
     632      {
     633        while (*c == ' ')
     634  	c++;
     635  
     636        if (c[0] == '-' && c[1] == 'O'
     637  	  && '0' <= c[2] && c[2] <= '4'
     638  	  && (c[3] == '\0' || c[3] == ' '))
     639  	{
     640  	  *gomp_nvptx_o = c[2] - '0';
     641  	  c += 3;
     642  	  continue;
     643  	}
     644  
     645        GOMP_PLUGIN_error ("Error parsing %s", var_name);
     646        break;
     647      }
     648  }
     649  
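/* JIT-compile and link the NUM_OBJS PTX objects in PTX_OBJS into a single
   CUDA module, returned in *MODULE.  Return true on success, false on
   failure.  */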
     650  static bool
     651  link_ptx (CUmodule *module, const struct targ_ptx_obj *ptx_objs,
     652  	  unsigned num_objs)
     653  {
     654    CUjit_option opts[7];
     655    void *optvals[7];
     656    float elapsed = 0.0;
     657    char elog[1024];
     658    char ilog[16384];
     659    CUlinkState linkstate;
     660    CUresult r;
     661    void *linkout;
     662    size_t linkoutsize __attribute__ ((unused));
     663  
     664    opts[0] = CU_JIT_WALL_TIME;
     665    optvals[0] = &elapsed;
     666  
     667    opts[1] = CU_JIT_INFO_LOG_BUFFER;
     668    optvals[1] = &ilog[0];
     669  
     670    opts[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
     671    optvals[2] = (void *) sizeof ilog;
     672  
     673    opts[3] = CU_JIT_ERROR_LOG_BUFFER;
     674    optvals[3] = &elog[0];
     675  
     676    opts[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
     677    optvals[4] = (void *) sizeof elog;
     678  
     679    opts[5] = CU_JIT_LOG_VERBOSE;
     680    optvals[5] = (void *) 1;
     681  
     682    static intptr_t gomp_nvptx_o = -1;
     683  
     684    static bool init_done = false;
     685    if (!init_done)
     686      {
     687        process_GOMP_NVPTX_JIT (&gomp_nvptx_o);
     688        init_done = true;
    }
     690  
     691    int nopts = 6;
     692    if (gomp_nvptx_o != -1)
     693      {
     694        opts[nopts] = CU_JIT_OPTIMIZATION_LEVEL;
     695        optvals[nopts] = (void *) gomp_nvptx_o;
     696        nopts++;
     697      }
     698  
     699    if (CUDA_CALL_EXISTS (cuLinkCreate_v2))
     700      CUDA_CALL (cuLinkCreate_v2, nopts, opts, optvals, &linkstate);
     701    else
     702      CUDA_CALL (cuLinkCreate, nopts, opts, optvals, &linkstate);
     703  
     704    for (; num_objs--; ptx_objs++)
     705      {
     706        /* cuLinkAddData's 'data' argument erroneously omits the const
     707  	 qualifier.  */
     708        GOMP_PLUGIN_debug (0, "Loading:\n---\n%s\n---\n", ptx_objs->code);
     709        if (CUDA_CALL_EXISTS (cuLinkAddData_v2))
     710  	r = CUDA_CALL_NOCHECK (cuLinkAddData_v2, linkstate, CU_JIT_INPUT_PTX,
     711  			       (char *) ptx_objs->code, ptx_objs->size,
     712  			       0, 0, 0, 0);
     713        else
     714  	r = CUDA_CALL_NOCHECK (cuLinkAddData, linkstate, CU_JIT_INPUT_PTX,
     715  			       (char *) ptx_objs->code, ptx_objs->size,
     716  			       0, 0, 0, 0);
     717        if (r != CUDA_SUCCESS)
     718  	{
     719  	  GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
     720  	  GOMP_PLUGIN_error ("cuLinkAddData (ptx_code) error: %s",
     721  			     cuda_error (r));
     722  	  return false;
     723  	}
     724      }
     725  
     726    GOMP_PLUGIN_debug (0, "Linking\n");
     727    r = CUDA_CALL_NOCHECK (cuLinkComplete, linkstate, &linkout, &linkoutsize);
     728  
     729    GOMP_PLUGIN_debug (0, "Link complete: %fms\n", elapsed);
     730    GOMP_PLUGIN_debug (0, "Link log %s\n", &ilog[0]);
     731  
     732    if (r != CUDA_SUCCESS)
     733      {
     734        GOMP_PLUGIN_error ("Link error log %s\n", &elog[0]);
     735        GOMP_PLUGIN_error ("cuLinkComplete error: %s", cuda_error (r));
     736        return false;
     737      }
     738  
     739    CUDA_CALL (cuModuleLoadData, module, linkout);
     740    CUDA_CALL (cuLinkDestroy, linkstate);
     741    return true;
     742  }
     743  
     744  static void
     745  nvptx_exec (void (*fn), unsigned *dims, void *targ_mem_desc,
     746  	    CUdeviceptr dp, CUstream stream)
     747  {
     748    struct targ_fn_descriptor *targ_fn = (struct targ_fn_descriptor *) fn;
     749    CUfunction function;
     750    int i;
     751    void *kargs[1];
     752    struct nvptx_thread *nvthd = nvptx_thread ();
     753    int warp_size = nvthd->ptx_dev->warp_size;
     754  
     755    function = targ_fn->fn;
     756  
     757    /* Initialize the launch dimensions.  Typically this is constant,
     758       provided by the device compiler, but we must permit runtime
     759       values.  */
     760    int seen_zero = 0;
     761    for (i = 0; i != GOMP_DIM_MAX; i++)
     762      {
     763        if (targ_fn->launch->dim[i])
     764         dims[i] = targ_fn->launch->dim[i];
     765        if (!dims[i])
     766         seen_zero = 1;
     767      }
     768  
     769    if (seen_zero)
     770      {
     771        pthread_mutex_lock (&ptx_dev_lock);
     772  
     773        static int gomp_openacc_dims[GOMP_DIM_MAX];
     774        if (!gomp_openacc_dims[0])
     775  	{
     776  	  /* See if the user provided GOMP_OPENACC_DIM environment
     777  	     variable to specify runtime defaults.  */
     778  	  for (int i = 0; i < GOMP_DIM_MAX; ++i)
     779  	    gomp_openacc_dims[i] = GOMP_PLUGIN_acc_default_dim (i);
     780  	}
     781  
     782        if (!nvthd->ptx_dev->default_dims[0])
     783  	{
     784  	  int default_dims[GOMP_DIM_MAX];
     785  	  for (int i = 0; i < GOMP_DIM_MAX; ++i)
     786  	    default_dims[i] = gomp_openacc_dims[i];
     787  
     788  	  int gang, worker, vector;
     789  	  {
     790  	    int block_size = nvthd->ptx_dev->max_threads_per_block;
     791  	    int cpu_size = nvthd->ptx_dev->max_threads_per_multiprocessor;
     792  	    int dev_size = nvthd->ptx_dev->num_sms;
     793  	    GOMP_PLUGIN_debug (0, " warp_size=%d, block_size=%d,"
     794  			       " dev_size=%d, cpu_size=%d\n",
     795  			       warp_size, block_size, dev_size, cpu_size);
     796  
     797  	    gang = (cpu_size / block_size) * dev_size;
     798  	    worker = block_size / warp_size;
     799  	    vector = warp_size;
     800  	  }
     801  
     802  	  /* There is no upper bound on the gang size.  The best size
     803  	     matches the hardware configuration.  Logical gangs are
     804  	     scheduled onto physical hardware.  To maximize usage, we
     805  	     should guess a large number.  */
     806  	  if (default_dims[GOMP_DIM_GANG] < 1)
     807  	    default_dims[GOMP_DIM_GANG] = gang ? gang : 1024;
     808  	  /* The worker size must not exceed the hardware.  */
     809  	  if (default_dims[GOMP_DIM_WORKER] < 1
     810  	      || (default_dims[GOMP_DIM_WORKER] > worker && gang))
     811  	    default_dims[GOMP_DIM_WORKER] = worker;
     812  	  /* The vector size must exactly match the hardware.  */
     813  	  if (default_dims[GOMP_DIM_VECTOR] < 1
     814  	      || (default_dims[GOMP_DIM_VECTOR] != vector && gang))
     815  	    default_dims[GOMP_DIM_VECTOR] = vector;
     816  
     817  	  GOMP_PLUGIN_debug (0, " default dimensions [%d,%d,%d]\n",
     818  			     default_dims[GOMP_DIM_GANG],
     819  			     default_dims[GOMP_DIM_WORKER],
     820  			     default_dims[GOMP_DIM_VECTOR]);
     821  
     822  	  for (i = 0; i != GOMP_DIM_MAX; i++)
     823  	    nvthd->ptx_dev->default_dims[i] = default_dims[i];
     824  	}
     825        pthread_mutex_unlock (&ptx_dev_lock);
     826  
     827        {
     828  	bool default_dim_p[GOMP_DIM_MAX];
     829  	for (i = 0; i != GOMP_DIM_MAX; i++)
     830  	  default_dim_p[i] = !dims[i];
     831  
     832  	if (!CUDA_CALL_EXISTS (cuOccupancyMaxPotentialBlockSize))
     833  	  {
     834  	    for (i = 0; i != GOMP_DIM_MAX; i++)
     835  	      if (default_dim_p[i])
     836  		dims[i] = nvthd->ptx_dev->default_dims[i];
     837  
     838  	    if (default_dim_p[GOMP_DIM_VECTOR])
     839  	      dims[GOMP_DIM_VECTOR]
     840  		= MIN (dims[GOMP_DIM_VECTOR],
     841  		       (targ_fn->max_threads_per_block / warp_size
     842  			* warp_size));
     843  
     844  	    if (default_dim_p[GOMP_DIM_WORKER])
     845  	      dims[GOMP_DIM_WORKER]
     846  		= MIN (dims[GOMP_DIM_WORKER],
     847  		       targ_fn->max_threads_per_block / dims[GOMP_DIM_VECTOR]);
     848  	  }
     849  	else
     850  	  {
	    /* Handle the case where the compiler allows the runtime to choose
	       the vector length conservatively, by ignoring
	       gomp_openacc_dims[GOMP_DIM_VECTOR].  TODO: actually handle
	       it.  */
	    int vectors = 0;
	    /* TODO: limit gomp_openacc_dims[GOMP_DIM_WORKER] such that
	       gomp_openacc_dims[GOMP_DIM_WORKER] * actual_vectors does not
	       exceed targ_fn->max_threads_per_block.  */
     859  	    int workers = gomp_openacc_dims[GOMP_DIM_WORKER];
     860  	    int gangs = gomp_openacc_dims[GOMP_DIM_GANG];
     861  	    int grids, blocks;
     862  
     863  	    CUDA_CALL_ASSERT (cuOccupancyMaxPotentialBlockSize, &grids,
     864  			      &blocks, function, NULL, 0,
     865  			      dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR]);
     866  	    GOMP_PLUGIN_debug (0, "cuOccupancyMaxPotentialBlockSize: "
     867  			       "grid = %d, block = %d\n", grids, blocks);
     868  
	    /* Keep num_gangs proportional to the block size.  In the case
	       where the block size is limited by shared-memory or register
	       file capacity, the runtime will not excessively over-assign
	       gangs to the multiprocessor units if their state is going to
	       be swapped out even more than necessary.  The constant factor
	       2 is there to prevent threads from idling when there is
	       insufficient work for them.  */
     877  	    if (gangs == 0)
     878  	      gangs = 2 * grids * (blocks / warp_size);
     879  
     880  	    if (vectors == 0)
     881  	      vectors = warp_size;
     882  
     883  	    if (workers == 0)
     884  	      {
     885  		int actual_vectors = (default_dim_p[GOMP_DIM_VECTOR]
     886  				      ? vectors
     887  				      : dims[GOMP_DIM_VECTOR]);
     888  		workers = blocks / actual_vectors;
     889  		workers = MAX (workers, 1);
     890  		/* If we need a per-worker barrier ... .  */
     891  		if (actual_vectors > 32)
     892  		  /* Don't use more barriers than available.  */
     893  		  workers = MIN (workers, 15);
     894  	      }
     895  
     896  	    for (i = 0; i != GOMP_DIM_MAX; i++)
     897  	      if (default_dim_p[i])
     898  		switch (i)
     899  		  {
     900  		  case GOMP_DIM_GANG: dims[i] = gangs; break;
     901  		  case GOMP_DIM_WORKER: dims[i] = workers; break;
     902  		  case GOMP_DIM_VECTOR: dims[i] = vectors; break;
     903  		  default: GOMP_PLUGIN_fatal ("invalid dim");
     904  		  }
     905  	  }
     906        }
     907      }
     908  
     909    /* Check if the accelerator has sufficient hardware resources to
     910       launch the offloaded kernel.  */
     911    if (dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR]
     912        > targ_fn->max_threads_per_block)
     913      {
     914        const char *msg
     915  	= ("The Nvidia accelerator has insufficient resources to launch '%s'"
     916  	   " with num_workers = %d and vector_length = %d"
     917  	   "; "
     918  	   "recompile the program with 'num_workers = x and vector_length = y'"
     919  	   " on that offloaded region or '-fopenacc-dim=:x:y' where"
     920  	   " x * y <= %d"
     921  	   ".\n");
     922        GOMP_PLUGIN_fatal (msg, targ_fn->launch->fn, dims[GOMP_DIM_WORKER],
     923  			 dims[GOMP_DIM_VECTOR], targ_fn->max_threads_per_block);
     924      }
     925  
     926    /* Check if the accelerator has sufficient barrier resources to
     927       launch the offloaded kernel.  */
     928    if (dims[GOMP_DIM_WORKER] > 15 && dims[GOMP_DIM_VECTOR] > 32)
     929      {
     930        const char *msg
     931  	= ("The Nvidia accelerator has insufficient barrier resources to launch"
     932  	   " '%s' with num_workers = %d and vector_length = %d"
     933  	   "; "
     934  	   "recompile the program with 'num_workers = x' on that offloaded"
     935  	   " region or '-fopenacc-dim=:x:' where x <= 15"
     936  	   "; "
     937  	   "or, recompile the program with 'vector_length = 32' on that"
     938  	   " offloaded region or '-fopenacc-dim=::32'"
     939  	   ".\n");
     940  	GOMP_PLUGIN_fatal (msg, targ_fn->launch->fn, dims[GOMP_DIM_WORKER],
     941  			   dims[GOMP_DIM_VECTOR]);
     942      }
     943  
     944    GOMP_PLUGIN_debug (0, "  %s: kernel %s: launch"
     945  		     " gangs=%u, workers=%u, vectors=%u\n",
     946  		     __FUNCTION__, targ_fn->launch->fn, dims[GOMP_DIM_GANG],
     947  		     dims[GOMP_DIM_WORKER], dims[GOMP_DIM_VECTOR]);
     948  
     949    // OpenACC		CUDA
     950    //
     951    // num_gangs		nctaid.x
     952    // num_workers	ntid.y
     953    // vector length	ntid.x
     954  
     955    struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
     956    acc_prof_info *prof_info = thr->prof_info;
     957    acc_event_info enqueue_launch_event_info;
     958    acc_api_info *api_info = thr->api_info;
     959    bool profiling_p = __builtin_expect (prof_info != NULL, false);
     960    if (profiling_p)
     961      {
     962        prof_info->event_type = acc_ev_enqueue_launch_start;
     963  
     964        enqueue_launch_event_info.launch_event.event_type
     965  	= prof_info->event_type;
     966        enqueue_launch_event_info.launch_event.valid_bytes
     967  	= _ACC_LAUNCH_EVENT_INFO_VALID_BYTES;
     968        enqueue_launch_event_info.launch_event.parent_construct
     969  	= acc_construct_parallel;
     970        enqueue_launch_event_info.launch_event.implicit = 1;
     971        enqueue_launch_event_info.launch_event.tool_info = NULL;
     972        enqueue_launch_event_info.launch_event.kernel_name = targ_fn->launch->fn;
     973        enqueue_launch_event_info.launch_event.num_gangs
     974  	= dims[GOMP_DIM_GANG];
     975        enqueue_launch_event_info.launch_event.num_workers
     976  	= dims[GOMP_DIM_WORKER];
     977        enqueue_launch_event_info.launch_event.vector_length
     978  	= dims[GOMP_DIM_VECTOR];
     979  
     980        api_info->device_api = acc_device_api_cuda;
     981  
     982        GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &enqueue_launch_event_info,
     983  					    api_info);
     984      }
     985  
     986    kargs[0] = &dp;
     987    CUDA_CALL_ASSERT (cuLaunchKernel, function,
     988  		    dims[GOMP_DIM_GANG], 1, 1,
     989  		    dims[GOMP_DIM_VECTOR], dims[GOMP_DIM_WORKER], 1,
     990  		    0, stream, kargs, 0);
     991  
     992    if (profiling_p)
     993      {
     994        prof_info->event_type = acc_ev_enqueue_launch_end;
     995        enqueue_launch_event_info.launch_event.event_type
     996  	= prof_info->event_type;
     997        GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &enqueue_launch_event_info,
     998  					    api_info);
     999      }
    1000  
    1001    GOMP_PLUGIN_debug (0, "  %s: kernel %s: finished\n", __FUNCTION__,
    1002  		     targ_fn->launch->fn);
    1003  }
    1004  
    1005  void * openacc_get_current_cuda_context (void);
    1006  
    1007  static void
    1008  goacc_profiling_acc_ev_alloc (struct goacc_thread *thr, void *dp, size_t s)
    1009  {
    1010    acc_prof_info *prof_info = thr->prof_info;
    1011    acc_event_info data_event_info;
    1012    acc_api_info *api_info = thr->api_info;
    1013  
    1014    prof_info->event_type = acc_ev_alloc;
    1015  
    1016    data_event_info.data_event.event_type = prof_info->event_type;
    1017    data_event_info.data_event.valid_bytes = _ACC_DATA_EVENT_INFO_VALID_BYTES;
    1018    data_event_info.data_event.parent_construct = acc_construct_parallel;
    1019    data_event_info.data_event.implicit = 1;
    1020    data_event_info.data_event.tool_info = NULL;
    1021    data_event_info.data_event.var_name = NULL;
    1022    data_event_info.data_event.bytes = s;
    1023    data_event_info.data_event.host_ptr = NULL;
    1024    data_event_info.data_event.device_ptr = dp;
    1025  
    1026    api_info->device_api = acc_device_api_cuda;
    1027  
    1028    GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info, api_info);
    1029  }
    1030  
    1031  /* Free the cached soft-stacks block if it is above the SOFTSTACK_CACHE_LIMIT
    1032     size threshold, or if FORCE is true.  */
    1033  
    1034  static void
    1035  nvptx_stacks_free (struct ptx_device *ptx_dev, bool force)
    1036  {
    1037    pthread_mutex_lock (&ptx_dev->omp_stacks.lock);
    1038    if (ptx_dev->omp_stacks.ptr
    1039        && (force || ptx_dev->omp_stacks.size > SOFTSTACK_CACHE_LIMIT))
    1040      {
    1041        CUresult r = CUDA_CALL_NOCHECK (cuMemFree, ptx_dev->omp_stacks.ptr);
    1042        if (r != CUDA_SUCCESS)
    1043  	GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
    1044        ptx_dev->omp_stacks.ptr = 0;
    1045        ptx_dev->omp_stacks.size = 0;
    1046      }
    1047    pthread_mutex_unlock (&ptx_dev->omp_stacks.lock);
    1048  }
    1049  
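/* Allocate S bytes of device memory.  If SUPPRESS_ERRORS is set, silently
   return NULL on CUDA_ERROR_OUT_OF_MEMORY so that the caller can free cached
   allocations and retry.  */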
    1050  static void *
    1051  nvptx_alloc (size_t s, bool suppress_errors)
    1052  {
    1053    CUdeviceptr d;
    1054  
    1055    CUresult r = CUDA_CALL_NOCHECK (cuMemAlloc, &d, s);
    1056    if (suppress_errors && r == CUDA_ERROR_OUT_OF_MEMORY)
    1057      return NULL;
    1058    else if (r != CUDA_SUCCESS)
    1059      {
    1060        GOMP_PLUGIN_error ("nvptx_alloc error: %s", cuda_error (r));
    1061        return NULL;
    1062      }
    1063  
    1064    /* NOTE: We only do profiling stuff if the memory allocation succeeds.  */
    1065    struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
    1066    bool profiling_p
    1067      = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
    1068    if (profiling_p)
    1069      goacc_profiling_acc_ev_alloc (thr, (void *) d, s);
    1070  
    1071    return (void *) d;
    1072  }
    1073  
    1074  static void
    1075  goacc_profiling_acc_ev_free (struct goacc_thread *thr, void *p)
    1076  {
    1077    acc_prof_info *prof_info = thr->prof_info;
    1078    acc_event_info data_event_info;
    1079    acc_api_info *api_info = thr->api_info;
    1080  
    1081    prof_info->event_type = acc_ev_free;
    1082  
    1083    data_event_info.data_event.event_type = prof_info->event_type;
    1084    data_event_info.data_event.valid_bytes = _ACC_DATA_EVENT_INFO_VALID_BYTES;
    1085    data_event_info.data_event.parent_construct = acc_construct_parallel;
    1086    data_event_info.data_event.implicit = 1;
    1087    data_event_info.data_event.tool_info = NULL;
    1088    data_event_info.data_event.var_name = NULL;
    1089    data_event_info.data_event.bytes = -1;
    1090    data_event_info.data_event.host_ptr = NULL;
    1091    data_event_info.data_event.device_ptr = p;
    1092  
    1093    api_info->device_api = acc_device_api_cuda;
    1094  
    1095    GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info, api_info);
    1096  }
    1097  
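/* Free the device memory P allocated on PTX_DEV.  When called from a CUDA
   callback context, where CUDA calls are not permitted, queue P on
   PTX_DEV->free_blocks instead; the block is then freed by a later
   GOMP_OFFLOAD_alloc.  */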
    1098  static bool
    1099  nvptx_free (void *p, struct ptx_device *ptx_dev)
    1100  {
    1101    CUdeviceptr pb;
    1102    size_t ps;
    1103  
    1104    CUresult r = CUDA_CALL_NOCHECK (cuMemGetAddressRange, &pb, &ps,
    1105  				  (CUdeviceptr) p);
    1106    if (r == CUDA_ERROR_NOT_PERMITTED)
    1107      {
      /* We assume that this error indicates we are in a CUDA callback context,
	 where no CUDA calls are allowed (see the cuStreamAddCallback
	 documentation for details).  Arrange to free this piece of device
	 memory later.  */
    1112        struct ptx_free_block *n
    1113  	= GOMP_PLUGIN_malloc (sizeof (struct ptx_free_block));
    1114        n->ptr = p;
    1115        pthread_mutex_lock (&ptx_dev->free_blocks_lock);
    1116        n->next = ptx_dev->free_blocks;
    1117        ptx_dev->free_blocks = n;
    1118        pthread_mutex_unlock (&ptx_dev->free_blocks_lock);
    1119        return true;
    1120      }
    1121    else if (r != CUDA_SUCCESS)
    1122      {
    1123        GOMP_PLUGIN_error ("cuMemGetAddressRange error: %s", cuda_error (r));
    1124        return false;
    1125      }
    1126    if ((CUdeviceptr) p != pb)
    1127      {
    1128        GOMP_PLUGIN_error ("invalid device address");
    1129        return false;
    1130      }
    1131  
    1132    CUDA_CALL (cuMemFree, (CUdeviceptr) p);
    1133    struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
    1134    bool profiling_p
    1135      = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
    1136    if (profiling_p)
    1137      goacc_profiling_acc_ev_free (thr, p);
    1138  
    1139    return true;
    1140  }
    1141  
    1142  static void *
    1143  nvptx_get_current_cuda_device (void)
    1144  {
    1145    struct nvptx_thread *nvthd = nvptx_thread ();
    1146  
    1147    if (!nvthd || !nvthd->ptx_dev)
    1148      return NULL;
    1149  
    1150    return &nvthd->ptx_dev->dev;
    1151  }
    1152  
    1153  static void *
    1154  nvptx_get_current_cuda_context (void)
    1155  {
    1156    struct nvptx_thread *nvthd = nvptx_thread ();
    1157  
    1158    if (!nvthd || !nvthd->ptx_dev)
    1159      return NULL;
    1160  
    1161    return nvthd->ptx_dev->ctx;
    1162  }
    1163  
    1164  /* Plugin entry points.  */
    1165  
    1166  const char *
    1167  GOMP_OFFLOAD_get_name (void)
    1168  {
    1169    return "nvptx";
    1170  }
    1171  
    1172  unsigned int
    1173  GOMP_OFFLOAD_get_caps (void)
    1174  {
    1175    return GOMP_OFFLOAD_CAP_OPENACC_200 | GOMP_OFFLOAD_CAP_OPENMP_400;
    1176  }
    1177  
    1178  int
    1179  GOMP_OFFLOAD_get_type (void)
    1180  {
    1181    return OFFLOAD_TARGET_TYPE_NVIDIA_PTX;
    1182  }
    1183  
    1184  int
    1185  GOMP_OFFLOAD_get_num_devices (unsigned int omp_requires_mask)
    1186  {
    1187    int num_devices = nvptx_get_num_devices ();
  /* Return -1 if the omp_requires_mask cannot be fulfilled even though
     devices are present.  Unified-shared address: see comment in
     nvptx_open_device for CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING.  */
    1191    if (num_devices > 0
    1192        && ((omp_requires_mask
    1193  	   & ~(GOMP_REQUIRES_UNIFIED_ADDRESS
    1194  	       | GOMP_REQUIRES_REVERSE_OFFLOAD)) != 0))
    1195      return -1;
    1196    return num_devices;
    1197  }
    1198  
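/* Open device number N, recording it in the ptx_devices array.  Return TRUE
   on success.  */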
    1199  bool
    1200  GOMP_OFFLOAD_init_device (int n)
    1201  {
    1202    struct ptx_device *dev;
    1203  
    1204    pthread_mutex_lock (&ptx_dev_lock);
    1205  
    1206    if (!nvptx_init () || ptx_devices[n] != NULL)
    1207      {
    1208        pthread_mutex_unlock (&ptx_dev_lock);
    1209        return false;
    1210      }
    1211  
    1212    dev = nvptx_open_device (n);
    1213    if (dev)
    1214      {
    1215        ptx_devices[n] = dev;
    1216        instantiated_devices++;
    1217      }
    1218  
    1219    pthread_mutex_unlock (&ptx_dev_lock);
    1220  
    1221    return dev != NULL;
    1222  }
    1223  
    1224  bool
    1225  GOMP_OFFLOAD_fini_device (int n)
    1226  {
    1227    pthread_mutex_lock (&ptx_dev_lock);
    1228  
    1229    if (ptx_devices[n] != NULL)
    1230      {
    1231        if (!nvptx_attach_host_thread_to_device (n)
    1232  	  || !nvptx_close_device (ptx_devices[n]))
    1233  	{
    1234  	  pthread_mutex_unlock (&ptx_dev_lock);
    1235  	  return false;
    1236  	}
    1237        ptx_devices[n] = NULL;
    1238        instantiated_devices--;
    1239      }
    1240  
    1241    if (instantiated_devices == 0)
    1242      {
    1243        free (ptx_devices);
    1244        ptx_devices = NULL;
    1245      }
    1246  
    1247    pthread_mutex_unlock (&ptx_dev_lock);
    1248    return true;
    1249  }
    1250  
    1251  /* Return the libgomp version number we're compatible with.  There is
    1252     no requirement for cross-version compatibility.  */
    1253  
    1254  unsigned
    1255  GOMP_OFFLOAD_version (void)
    1256  {
    1257    return GOMP_VERSION;
    1258  }
    1259  
    1260  /* Initialize __nvptx_clocktick, if present in MODULE.  */
    1261  
    1262  static void
    1263  nvptx_set_clocktick (CUmodule module, struct ptx_device *dev)
    1264  {
    1265    CUdeviceptr dptr;
    1266    CUresult r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &dptr, NULL,
    1267  				  module, "__nvptx_clocktick");
    1268    if (r == CUDA_ERROR_NOT_FOUND)
    1269      return;
    1270    if (r != CUDA_SUCCESS)
    1271      GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r));
    1272    double __nvptx_clocktick = 1e-3 / dev->clock_khz;
    1273    r = CUDA_CALL_NOCHECK (cuMemcpyHtoD, dptr, &__nvptx_clocktick,
    1274  			 sizeof (__nvptx_clocktick));
    1275    if (r != CUDA_SUCCESS)
    1276      GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
    1277  }
    1278  
/* Load the (partial) program described by TARGET_DATA to device
   number ORD.  Allocate and return TARGET_TABLE.  If REV_FN_TABLE is not
   NULL, it will be set to an array, to be freed by the caller, containing the
   on-device addresses of the functions for reverse offload.  */
    1283  
    1284  int
    1285  GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
    1286  			 struct addr_pair **target_table,
    1287  			 uint64_t **rev_fn_table)
    1288  {
    1289    CUmodule module;
    1290    const char *const *var_names;
    1291    const struct targ_fn_launch *fn_descs;
    1292    unsigned int fn_entries, var_entries, other_entries, i, j;
    1293    struct targ_fn_descriptor *targ_fns;
    1294    struct addr_pair *targ_tbl;
    1295    const nvptx_tdata_t *img_header = (const nvptx_tdata_t *) target_data;
    1296    struct ptx_image_data *new_image;
    1297    struct ptx_device *dev;
    1298  
    1299    if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
    1300      {
    1301        GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
    1302  			 " (expected %u, received %u)",
    1303  			 GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));
    1304        return -1;
    1305      }
    1306  
    1307    if (!nvptx_attach_host_thread_to_device (ord)
    1308        || !link_ptx (&module, img_header->ptx_objs, img_header->ptx_num))
    1309      return -1;
    1310  
    1311    dev = ptx_devices[ord];
    1312  
  /* The mkoffload utility emits a struct of pointers/integers at the
     start of each offload image.  The array of kernel names and the
     array of function addresses form a one-to-one correspondence.  */
    1316  
    1317    var_entries = img_header->var_num;
    1318    var_names = img_header->var_names;
    1319    fn_entries = img_header->fn_num;
    1320    fn_descs = img_header->fn_descs;
    1321  
    1322    /* Currently, other_entries contains only the struct of ICVs.  */
    1323    other_entries = 1;
    1324  
    1325    targ_tbl = GOMP_PLUGIN_malloc (sizeof (struct addr_pair)
    1326  				 * (fn_entries + var_entries + other_entries));
    1327    targ_fns = GOMP_PLUGIN_malloc (sizeof (struct targ_fn_descriptor)
    1328  				 * fn_entries);
    1329  
    1330    *target_table = targ_tbl;
    1331  
    1332    new_image = GOMP_PLUGIN_malloc (sizeof (struct ptx_image_data));
    1333    new_image->target_data = target_data;
    1334    new_image->module = module;
    1335    new_image->fns = targ_fns;
    1336  
    1337    pthread_mutex_lock (&dev->image_lock);
    1338    new_image->next = dev->images;
    1339    dev->images = new_image;
    1340    pthread_mutex_unlock (&dev->image_lock);
    1341  
    1342    for (i = 0; i < fn_entries; i++, targ_fns++, targ_tbl++)
    1343      {
    1344        CUfunction function;
    1345        int nregs, mthrs;
    1346  
    1347        CUDA_CALL_ERET (-1, cuModuleGetFunction, &function, module,
    1348  		      fn_descs[i].fn);
    1349        CUDA_CALL_ERET (-1, cuFuncGetAttribute, &nregs,
    1350  		      CU_FUNC_ATTRIBUTE_NUM_REGS, function);
    1351        CUDA_CALL_ERET (-1, cuFuncGetAttribute, &mthrs,
    1352  		      CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, function);
    1353  
    1354        targ_fns->fn = function;
    1355        targ_fns->launch = &fn_descs[i];
    1356        targ_fns->regs_per_thread = nregs;
    1357        targ_fns->max_threads_per_block = mthrs;
    1358  
    1359        targ_tbl->start = (uintptr_t) targ_fns;
    1360        targ_tbl->end = targ_tbl->start + 1;
    1361      }
    1362  
    1363    for (j = 0; j < var_entries; j++, targ_tbl++)
    1364      {
    1365        CUdeviceptr var;
    1366        size_t bytes;
    1367  
    1368        CUDA_CALL_ERET (-1, cuModuleGetGlobal,
    1369  		      &var, &bytes, module, var_names[j]);
    1370  
    1371        targ_tbl->start = (uintptr_t) var;
    1372        targ_tbl->end = targ_tbl->start + bytes;
    1373      }
    1374  
    1375    CUdeviceptr varptr;
    1376    size_t varsize;
    1377    CUresult r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &varptr, &varsize,
    1378  				  module, XSTRING (GOMP_ADDITIONAL_ICVS));
    1379  
    1380    if (r == CUDA_SUCCESS)
    1381      {
    1382        targ_tbl->start = (uintptr_t) varptr;
    1383        targ_tbl->end = (uintptr_t) (varptr + varsize);
    1384      }
    1385    else
    1386      /* The variable was not in this image.  */
    1387      targ_tbl->start = targ_tbl->end = 0;
    1388  
    1389    if (rev_fn_table && fn_entries == 0)
    1390      *rev_fn_table = NULL;
    1391    else if (rev_fn_table)
    1392      {
    1393        CUdeviceptr var;
    1394        size_t bytes;
    1395        unsigned int i;
    1396        r = CUDA_CALL_NOCHECK (cuModuleGetGlobal, &var, &bytes, module,
    1397  			     "$offload_func_table");
    1398        if (r != CUDA_SUCCESS)
    1399  	GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r));
    1400        assert (bytes == sizeof (uint64_t) * fn_entries);
    1401        *rev_fn_table = GOMP_PLUGIN_malloc (sizeof (uint64_t) * fn_entries);
    1402        r = CUDA_CALL_NOCHECK (cuMemcpyDtoH, *rev_fn_table, var, bytes);
    1403        if (r != CUDA_SUCCESS)
    1404  	GOMP_PLUGIN_fatal ("cuMemcpyDtoH error: %s", cuda_error (r));
      /* Free the table if it contains only NULL entries.  */
    1406        for (i = 0; i < fn_entries; ++i)
    1407  	if ((*rev_fn_table)[i] != 0)
    1408  	  break;
    1409        if (i == fn_entries)
    1410  	{
    1411  	  free (*rev_fn_table);
    1412  	  *rev_fn_table = NULL;
    1413  	}
    1414      }
    1415  
    1416    if (rev_fn_table && *rev_fn_table && dev->rev_data == NULL)
    1417      {
      /* Get the on-device GOMP_REV_OFFLOAD_VAR variable.  It should be
	 available, but it might not be.  One reason could be: if the user code
	 has 'omp target device(ancestor:1)' in pure host code, GOMP_target_ext
	 is not called on the device and, hence, it and GOMP_REV_OFFLOAD_VAR
	 are not linked in.  */
    1423        CUdeviceptr device_rev_offload_var;
    1424        size_t device_rev_offload_size;
    1425        CUresult r = CUDA_CALL_NOCHECK (cuModuleGetGlobal,
    1426  				      &device_rev_offload_var,
    1427  				      &device_rev_offload_size, module,
    1428  				      XSTRING (GOMP_REV_OFFLOAD_VAR));
    1429        if (r != CUDA_SUCCESS)
    1430  	{
    1431  	  free (*rev_fn_table);
    1432  	  *rev_fn_table = NULL;
    1433  	}
    1434        else
    1435  	{
	  /* cuMemHostAlloc memory is accessible on the device, if
	     unified-shared addressing is supported; this is assumed (see the
	     comment in nvptx_open_device for
	     CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING).  */
    1439  	  CUDA_CALL_ASSERT (cuMemHostAlloc, (void **) &dev->rev_data,
    1440  			    sizeof (*dev->rev_data), CU_MEMHOSTALLOC_DEVICEMAP);
    1441  	  CUdeviceptr dp = (CUdeviceptr) dev->rev_data;
    1442  	  r = CUDA_CALL_NOCHECK (cuMemcpyHtoD, device_rev_offload_var, &dp,
    1443  				 sizeof (dp));
    1444  	  if (r != CUDA_SUCCESS)
    1445  	    GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
    1446  	}
    1447      }
    1448  
    1449    nvptx_set_clocktick (module, dev);
    1450  
    1451    return fn_entries + var_entries + other_entries;
    1452  }
    1453  
    1454  /* Unload the program described by TARGET_DATA.  DEV_DATA is the
    1455     function descriptors allocated by G_O_load_image.  */
    1456  
    1457  bool
    1458  GOMP_OFFLOAD_unload_image (int ord, unsigned version, const void *target_data)
    1459  {
    1460    struct ptx_image_data *image, **prev_p;
    1461    struct ptx_device *dev = ptx_devices[ord];
    1462  
    1463    if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
    1464      {
    1465        GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
    1466  			 " (expected %u, received %u)",
    1467  			 GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));
    1468        return false;
    1469      }
    1470  
    1471    bool ret = true;
    1472    pthread_mutex_lock (&dev->image_lock);
    1473    for (prev_p = &dev->images; (image = *prev_p) != 0; prev_p = &image->next)
    1474      if (image->target_data == target_data)
    1475        {
    1476  	*prev_p = image->next;
    1477  	if (CUDA_CALL_NOCHECK (cuModuleUnload, image->module) != CUDA_SUCCESS)
    1478  	  ret = false;
    1479  	free (image->fns);
    1480  	free (image);
    1481  	break;
    1482        }
    1483    pthread_mutex_unlock (&dev->image_lock);
    1484    return ret;
    1485  }
    1486  
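/* Allocate SIZE bytes of device memory on device ORD.  Deferred frees queued
   on the device's free-block list are carried out first, and cached
   soft-stacks storage may be released; if the allocation still fails, the
   cached soft-stacks block is freed unconditionally and the allocation
   retried.  */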
    1487  void *
    1488  GOMP_OFFLOAD_alloc (int ord, size_t size)
    1489  {
    1490    if (!nvptx_attach_host_thread_to_device (ord))
    1491      return NULL;
    1492  
    1493    struct ptx_device *ptx_dev = ptx_devices[ord];
    1494    struct ptx_free_block *blocks, *tmp;
    1495  
    1496    pthread_mutex_lock (&ptx_dev->free_blocks_lock);
    1497    blocks = ptx_dev->free_blocks;
    1498    ptx_dev->free_blocks = NULL;
    1499    pthread_mutex_unlock (&ptx_dev->free_blocks_lock);
    1500  
    1501    nvptx_stacks_free (ptx_dev, false);
    1502  
    1503    while (blocks)
    1504      {
    1505        tmp = blocks->next;
    1506        nvptx_free (blocks->ptr, ptx_dev);
    1507        free (blocks);
    1508        blocks = tmp;
    1509      }
    1510  
    1511    void *d = nvptx_alloc (size, true);
    1512    if (d)
    1513      return d;
    1514    else
    1515      {
      /* Memory allocation failed.  Free the cached stacks block and
	 retry.  */
    1518        nvptx_stacks_free (ptx_dev, true);
    1519        return nvptx_alloc (size, false);
    1520      }
    1521  }
    1522  
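/* Free the device memory at PTR on device ORD.  */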
    1523  bool
    1524  GOMP_OFFLOAD_free (int ord, void *ptr)
    1525  {
    1526    return (nvptx_attach_host_thread_to_device (ord)
    1527  	  && nvptx_free (ptr, ptx_devices[ord]));
    1528  }
    1529  
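/* Execute the OpenACC offload function FN with launch geometry DIMS, passing
   DEVADDRS as the argument block, and wait for the kernel to complete on the
   default stream.  */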
    1530  void
    1531  GOMP_OFFLOAD_openacc_exec (void (*fn) (void *),
    1532  			   size_t mapnum  __attribute__((unused)),
    1533  			   void **hostaddrs __attribute__((unused)),
    1534  			   void **devaddrs,
    1535  			   unsigned *dims, void *targ_mem_desc)
    1536  {
    1537    GOMP_PLUGIN_debug (0, "nvptx %s\n", __FUNCTION__);
    1538  
    1539    CUdeviceptr dp = (CUdeviceptr) devaddrs;
    1540    nvptx_exec (fn, dims, targ_mem_desc, dp, NULL);
    1541  
    1542    CUresult r = CUDA_CALL_NOCHECK (cuStreamSynchronize, NULL);
    1543    const char *maybe_abort_msg = "(perhaps abort was called)";
    1544    if (r == CUDA_ERROR_LAUNCH_FAILED)
    1545      GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r),
    1546  		       maybe_abort_msg);
    1547    else if (r != CUDA_SUCCESS)
    1548      GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
    1549  }
    1550  
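/* As GOMP_OFFLOAD_openacc_exec, but enqueue the kernel on the CUDA stream of
   async queue AQ instead of waiting for it to complete.  */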
    1551  void
    1552  GOMP_OFFLOAD_openacc_async_exec (void (*fn) (void *),
    1553  				 size_t mapnum __attribute__((unused)),
    1554  				 void **hostaddrs __attribute__((unused)),
    1555  				 void **devaddrs,
    1556  				 unsigned *dims, void *targ_mem_desc,
    1557  				 struct goacc_asyncqueue *aq)
    1558  {
    1559    GOMP_PLUGIN_debug (0, "nvptx %s\n", __FUNCTION__);
    1560  
    1561    CUdeviceptr dp = (CUdeviceptr) devaddrs;
    1562    nvptx_exec (fn, dims, targ_mem_desc, dp, aq->cuda_stream);
    1563  }
    1564  
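/* Set up OpenACC per-thread state for device ORD: ensure the device's CUDA
   context is current for this host thread and return a freshly allocated
   nvptx_thread referring to the device.  */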
    1565  void *
    1566  GOMP_OFFLOAD_openacc_create_thread_data (int ord)
    1567  {
    1568    struct ptx_device *ptx_dev;
    1569    struct nvptx_thread *nvthd
    1570      = GOMP_PLUGIN_malloc (sizeof (struct nvptx_thread));
    1571    CUcontext thd_ctx;
    1572  
    1573    ptx_dev = ptx_devices[ord];
    1574  
    1575    assert (ptx_dev);
    1576  
    1577    CUDA_CALL_ASSERT (cuCtxGetCurrent, &thd_ctx);
    1578  
    1579    assert (ptx_dev->ctx);
    1580  
    1581    if (!thd_ctx)
    1582      CUDA_CALL_ASSERT (cuCtxPushCurrent, ptx_dev->ctx);
    1583  
    1584    nvthd->ptx_dev = ptx_dev;
    1585  
    1586    return (void *) nvthd;
    1587  }
    1588  
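/* Free the per-thread state allocated by
   GOMP_OFFLOAD_openacc_create_thread_data.  */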
    1589  void
    1590  GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
    1591  {
    1592    free (data);
    1593  }
    1594  
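/* This returns the CUDA device associated with the current thread.  */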
    1595  void *
    1596  GOMP_OFFLOAD_openacc_cuda_get_current_device (void)
    1597  {
    1598    return nvptx_get_current_cuda_device ();
    1599  }
    1600  
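/* This returns the CUDA context associated with the current thread.  */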
    1601  void *
    1602  GOMP_OFFLOAD_openacc_cuda_get_current_context (void)
    1603  {
    1604    return nvptx_get_current_cuda_context ();
    1605  }
    1606  
    1607  /* This returns a CUstream.  */
    1608  void *
    1609  GOMP_OFFLOAD_openacc_cuda_get_stream (struct goacc_asyncqueue *aq)
    1610  {
    1611    return (void *) aq->cuda_stream;
    1612  }
    1613  
    1614  /* This takes a CUstream.  */
    1615  int
    1616  GOMP_OFFLOAD_openacc_cuda_set_stream (struct goacc_asyncqueue *aq, void *stream)
    1617  {
    1618    if (aq->cuda_stream)
    1619      {
    1620        CUDA_CALL_ASSERT (cuStreamSynchronize, aq->cuda_stream);
    1621        CUDA_CALL_ASSERT (cuStreamDestroy, aq->cuda_stream);
    1622      }
    1623  
    1624    aq->cuda_stream = (CUstream) stream;
    1625    return 1;
    1626  }
    1627  
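/* Construct a new async queue, backed by a newly created CUDA stream.
   Return NULL if stream creation fails.  */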
    1628  struct goacc_asyncqueue *
    1629  GOMP_OFFLOAD_openacc_async_construct (int device __attribute__((unused)))
    1630  {
    1631    CUstream stream = NULL;
    1632    CUDA_CALL_ERET (NULL, cuStreamCreate, &stream, CU_STREAM_DEFAULT);
    1633  
    1634    struct goacc_asyncqueue *aq
    1635      = GOMP_PLUGIN_malloc (sizeof (struct goacc_asyncqueue));
    1636    aq->cuda_stream = stream;
    1637    return aq;
    1638  }
    1639  
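/* Destroy async queue AQ: destroy its CUDA stream and free the queue itself.
   Return false if the stream cannot be destroyed.  */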
    1640  bool
    1641  GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *aq)
    1642  {
    1643    CUDA_CALL_ERET (false, cuStreamDestroy, aq->cuda_stream);
    1644    free (aq);
    1645    return true;
    1646  }
    1647  
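/* Test whether all work on async queue AQ has completed.  Return 1 if the
   queue is idle, 0 if work is still pending, and -1 on error.  */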
    1648  int
    1649  GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq)
    1650  {
    1651    CUresult r = CUDA_CALL_NOCHECK (cuStreamQuery, aq->cuda_stream);
    1652    if (r == CUDA_SUCCESS)
    1653      return 1;
    1654    if (r == CUDA_ERROR_NOT_READY)
    1655      return 0;
    1656  
    1657    GOMP_PLUGIN_error ("cuStreamQuery error: %s", cuda_error (r));
    1658    return -1;
    1659  }
    1660  
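/* Block until all work queued on async queue AQ has completed.  */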
    1661  bool
    1662  GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq)
    1663  {
    1664    CUDA_CALL_ERET (false, cuStreamSynchronize, aq->cuda_stream);
    1665    return true;
    1666  }
    1667  
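/* Make future work submitted to AQ2 wait for the work currently queued on
   AQ1, by recording an event on AQ1's stream and inserting a wait for that
   event on AQ2's stream.  */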
    1668  bool
    1669  GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue *aq1,
    1670  				      struct goacc_asyncqueue *aq2)
    1671  {
    1672    CUevent e;
    1673    CUDA_CALL_ERET (false, cuEventCreate, &e, CU_EVENT_DISABLE_TIMING);
    1674    CUDA_CALL_ERET (false, cuEventRecord, e, aq1->cuda_stream);
    1675    CUDA_CALL_ERET (false, cuStreamWaitEvent, aq2->cuda_stream, e, 0);
    1676    return true;
    1677  }
    1678  
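/* Wrapper invoked by the CUDA driver when a queued callback fires: check the
   stream status, run the user callback, and free the bookkeeping record.  */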
    1679  static void
    1680  cuda_callback_wrapper (CUstream stream, CUresult res, void *ptr)
    1681  {
    1682    if (res != CUDA_SUCCESS)
    1683      GOMP_PLUGIN_fatal ("%s error: %s", __FUNCTION__, cuda_error (res));
    1684    struct nvptx_callback *cb = (struct nvptx_callback *) ptr;
    1685    cb->fn (cb->ptr);
    1686    free (ptr);
    1687  }
    1688  
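/* Arrange for CALLBACK_FN (USERPTR) to be called once all work currently
   queued on async queue AQ has completed.  */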
    1689  void
    1690  GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *aq,
    1691  					   void (*callback_fn)(void *),
    1692  					   void *userptr)
    1693  {
    1694    struct nvptx_callback *b = GOMP_PLUGIN_malloc (sizeof (*b));
    1695    b->fn = callback_fn;
    1696    b->ptr = userptr;
    1697    b->aq = aq;
    1698    CUDA_CALL_ASSERT (cuStreamAddCallback, aq->cuda_stream,
    1699  		    cuda_callback_wrapper, (void *) b, 0);
    1700  }
    1701  
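/* Sanity-check a host/device copy of S bytes between host address H and
   device address D: both addresses must be non-NULL and distinct, D must lie
   in an allocation known to the CUDA driver, and the copy must not run past
   the end of that allocation.  */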
    1702  static bool
    1703  cuda_memcpy_sanity_check (const void *h, const void *d, size_t s)
    1704  {
    1705    CUdeviceptr pb;
    1706    size_t ps;
    1707    if (!s)
    1708      return true;
    1709    if (!d)
    1710      {
    1711        GOMP_PLUGIN_error ("invalid device address");
    1712        return false;
    1713      }
    1714    CUDA_CALL (cuMemGetAddressRange, &pb, &ps, (CUdeviceptr) d);
    1715    if (!pb)
    1716      {
    1717        GOMP_PLUGIN_error ("invalid device address");
    1718        return false;
    1719      }
    1720    if (!h)
    1721      {
    1722        GOMP_PLUGIN_error ("invalid host address");
    1723        return false;
    1724      }
    1725    if (d == h)
    1726      {
    1727        GOMP_PLUGIN_error ("invalid host or device address");
    1728        return false;
    1729      }
    1730    if ((void *)(d + s) > (void *)(pb + ps))
    1731      {
    1732        GOMP_PLUGIN_error ("invalid size");
    1733        return false;
    1734      }
    1735    return true;
    1736  }
    1737  
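/* Copy N bytes from host address SRC to device address DST on device ORD.  */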
    1738  bool
    1739  GOMP_OFFLOAD_host2dev (int ord, void *dst, const void *src, size_t n)
    1740  {
    1741    if (!nvptx_attach_host_thread_to_device (ord)
    1742        || !cuda_memcpy_sanity_check (src, dst, n))
    1743      return false;
    1744    CUDA_CALL (cuMemcpyHtoD, (CUdeviceptr) dst, src, n);
    1745    return true;
    1746  }
    1747  
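/* Copy N bytes from device address SRC on device ORD to host address DST.  */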
    1748  bool
    1749  GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n)
    1750  {
    1751    if (!nvptx_attach_host_thread_to_device (ord)
    1752        || !cuda_memcpy_sanity_check (dst, src, n))
    1753      return false;
    1754    CUDA_CALL (cuMemcpyDtoH, dst, (CUdeviceptr) src, n);
    1755    return true;
    1756  }
    1757  
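/* Copy N bytes between the device addresses SRC and DST; the copy is issued
   asynchronously on the default stream.  */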
    1758  bool
    1759  GOMP_OFFLOAD_dev2dev (int ord, void *dst, const void *src, size_t n)
    1760  {
    1761    CUDA_CALL (cuMemcpyDtoDAsync, (CUdeviceptr) dst, (CUdeviceptr) src, n, NULL);
    1762    return true;
    1763  }
    1764  
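/* As GOMP_OFFLOAD_host2dev, but enqueue the copy on the CUDA stream of async
   queue AQ.  */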
    1765  bool
    1766  GOMP_OFFLOAD_openacc_async_host2dev (int ord, void *dst, const void *src,
    1767  				     size_t n, struct goacc_asyncqueue *aq)
    1768  {
    1769    if (!nvptx_attach_host_thread_to_device (ord)
    1770        || !cuda_memcpy_sanity_check (src, dst, n))
    1771      return false;
    1772    CUDA_CALL (cuMemcpyHtoDAsync, (CUdeviceptr) dst, src, n, aq->cuda_stream);
    1773    return true;
    1774  }
    1775  
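/* As GOMP_OFFLOAD_dev2host, but enqueue the copy on the CUDA stream of async
   queue AQ.  */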
    1776  bool
    1777  GOMP_OFFLOAD_openacc_async_dev2host (int ord, void *dst, const void *src,
    1778  				     size_t n, struct goacc_asyncqueue *aq)
    1779  {
    1780    if (!nvptx_attach_host_thread_to_device (ord)
    1781        || !cuda_memcpy_sanity_check (dst, src, n))
    1782      return false;
    1783    CUDA_CALL (cuMemcpyDtoHAsync, dst, (CUdeviceptr) src, n, aq->cuda_stream);
    1784    return true;
    1785  }
    1786  
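/* Return the value of the OpenACC device property PROP for device N.  A
   zero-initialized value is returned if N is out of range or the property is
   not handled.  */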
    1787  union goacc_property_value
    1788  GOMP_OFFLOAD_openacc_get_property (int n, enum goacc_property prop)
    1789  {
    1790    union goacc_property_value propval = { .val = 0 };
    1791  
    1792    pthread_mutex_lock (&ptx_dev_lock);
    1793  
    1794    if (n >= nvptx_get_num_devices () || n < 0 || ptx_devices[n] == NULL)
    1795      {
    1796        pthread_mutex_unlock (&ptx_dev_lock);
    1797        return propval;
    1798      }
    1799  
    1800    struct ptx_device *ptx_dev = ptx_devices[n];
    1801    switch (prop)
    1802      {
    1803      case GOACC_PROPERTY_MEMORY:
    1804        {
    1805  	size_t total_mem;
    1806  
    1807  	CUDA_CALL_ERET (propval, cuDeviceTotalMem, &total_mem, ptx_dev->dev);
    1808  	propval.val = total_mem;
    1809        }
    1810        break;
    1811      case GOACC_PROPERTY_FREE_MEMORY:
    1812        {
    1813  	size_t total_mem;
    1814  	size_t free_mem;
    1815  	CUdevice ctxdev;
    1816  
    1817  	CUDA_CALL_ERET (propval, cuCtxGetDevice, &ctxdev);
    1818  	if (ptx_dev->dev == ctxdev)
    1819  	  CUDA_CALL_ERET (propval, cuMemGetInfo, &free_mem, &total_mem);
    1820  	else if (ptx_dev->ctx)
    1821  	  {
    1822  	    CUcontext old_ctx;
    1823  
    1824  	    CUDA_CALL_ERET (propval, cuCtxPushCurrent, ptx_dev->ctx);
    1825  	    CUDA_CALL_ERET (propval, cuMemGetInfo, &free_mem, &total_mem);
    1826  	    CUDA_CALL_ASSERT (cuCtxPopCurrent, &old_ctx);
    1827  	  }
    1828  	else
    1829  	  {
    1830  	    CUcontext new_ctx;
    1831  
    1832  	    CUDA_CALL_ERET (propval, cuCtxCreate, &new_ctx, CU_CTX_SCHED_AUTO,
    1833  			    ptx_dev->dev);
    1834  	    CUDA_CALL_ERET (propval, cuMemGetInfo, &free_mem, &total_mem);
    1835  	    CUDA_CALL_ASSERT (cuCtxDestroy, new_ctx);
    1836  	  }
    1837  	propval.val = free_mem;
    1838        }
    1839        break;
    1840      case GOACC_PROPERTY_NAME:
    1841        propval.ptr = ptx_dev->name;
    1842        break;
    1843      case GOACC_PROPERTY_VENDOR:
    1844        propval.ptr = "Nvidia";
    1845        break;
    1846      case GOACC_PROPERTY_DRIVER:
    1847        propval.ptr = cuda_driver_version_s;
    1848        break;
    1849      default:
    1850        break;
    1851      }
    1852  
    1853    pthread_mutex_unlock (&ptx_dev_lock);
    1854    return propval;
    1855  }
    1856  
/* Adjust launch dimensions: pick good values for the number of blocks and
   warps, and ensure that the number of warps does not exceed CUDA limits or
   GCC's own limits.  */
    1860  
    1861  static void
    1862  nvptx_adjust_launch_bounds (struct targ_fn_descriptor *fn,
    1863  			    struct ptx_device *ptx_dev,
    1864  			    int *teams_p, int *threads_p)
    1865  {
    1866    int max_warps_block = fn->max_threads_per_block / 32;
  /* A maximum of 32 warps per block is an implementation limit in the NVPTX
     back end and in libgcc, and matches the documented limit of all GPUs as
     of 2015.  */
    1869    if (max_warps_block > 32)
    1870      max_warps_block = 32;
    1871    if (*threads_p <= 0)
    1872      *threads_p = 8;
    1873    if (*threads_p > max_warps_block)
    1874      *threads_p = max_warps_block;
    1875  
    1876    int regs_per_block = fn->regs_per_thread * 32 * *threads_p;
  /* This is an estimate of how many blocks the device can host simultaneously.
     The actual limit, which may be lower, can be queried via the "occupancy
     control" driver interface (available since CUDA 6.0).  */
    1880    int max_blocks = ptx_dev->regs_per_sm / regs_per_block * ptx_dev->num_sms;
    1881    if (*teams_p <= 0 || *teams_p > max_blocks)
    1882      *teams_p = max_blocks;
    1883  }
    1884  
    1885  /* Return the size of per-warp stacks (see gcc -msoft-stack) to use for OpenMP
    1886     target regions.  */
    1887  
    1888  static size_t
    1889  nvptx_stacks_size ()
    1890  {
    1891    return 128 * 1024;
    1892  }
    1893  
    1894  /* Return contiguous storage for NUM stacks, each SIZE bytes.  The lock for
    1895     the storage should be held on entry, and remains held on exit.  */
    1896  
    1897  static void *
    1898  nvptx_stacks_acquire (struct ptx_device *ptx_dev, size_t size, int num)
    1899  {
    1900    if (ptx_dev->omp_stacks.ptr && ptx_dev->omp_stacks.size >= size * num)
    1901      return (void *) ptx_dev->omp_stacks.ptr;
    1902  
    1903    /* Free the old, too-small stacks.  */
    1904    if (ptx_dev->omp_stacks.ptr)
    1905      {
    1906        CUresult r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
    1907        if (r != CUDA_SUCCESS)
    1908  	GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s\n", cuda_error (r));
    1909        r = CUDA_CALL_NOCHECK (cuMemFree, ptx_dev->omp_stacks.ptr);
    1910        if (r != CUDA_SUCCESS)
    1911  	GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
    1912      }
    1913  
    1914    /* Make new and bigger stacks, and remember where we put them and how big
    1915       they are.  */
    1916    CUresult r = CUDA_CALL_NOCHECK (cuMemAlloc, &ptx_dev->omp_stacks.ptr,
    1917  				  size * num);
    1918    if (r != CUDA_SUCCESS)
    1919      GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r));
    1920  
    1921    ptx_dev->omp_stacks.size = size * num;
    1922  
    1923    return (void *) ptx_dev->omp_stacks.ptr;
    1924  }
    1925  
    1926  
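/* Device-to-host copy routine passed to GOMP_PLUGIN_target_rev for reverse
   offload: copy SIZE bytes from device address SRC to host address DEST on
   STREAM and wait for the copy to finish.  */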
static void
    1928  rev_off_dev_to_host_cpy (void *dest, const void *src, size_t size,
    1929  			 CUstream stream)
    1930  {
    1931    CUDA_CALL_ASSERT (cuMemcpyDtoHAsync, dest, (CUdeviceptr) src, size, stream);
    1932    CUDA_CALL_ASSERT (cuStreamSynchronize, stream);
    1933  }
    1934  
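/* Host-to-device counterpart of rev_off_dev_to_host_cpy, likewise used for
   reverse offload.  */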
static void
    1936  rev_off_host_to_dev_cpy (void *dest, const void *src, size_t size,
    1937  			 CUstream stream)
    1938  {
    1939    CUDA_CALL_ASSERT (cuMemcpyHtoDAsync, (CUdeviceptr) dest, src, size, stream);
    1940    CUDA_CALL_ASSERT (cuStreamSynchronize, stream);
    1941  }
    1942  
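/* Run the OpenMP offload function TGT_FN with argument block TGT_VARS on
   device ORD.  ARGS carries the requested number of teams and the thread
   limit; the launch bounds are adjusted, per-warp soft stacks are acquired,
   and the kernel is launched with 32 lanes per warp.  While a kernel of a
   program using reverse offload is running, poll for and service
   reverse-offload requests posted by the device.  */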
    1943  void
    1944  GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
    1945  {
    1946    struct targ_fn_descriptor *tgt_fn_desc
    1947      = (struct targ_fn_descriptor *) tgt_fn;
    1948    CUfunction function = tgt_fn_desc->fn;
    1949    const struct targ_fn_launch *launch = tgt_fn_desc->launch;
    1950    const char *fn_name = launch->fn;
    1951    CUresult r;
    1952    struct ptx_device *ptx_dev = ptx_devices[ord];
    1953    const char *maybe_abort_msg = "(perhaps abort was called)";
    1954    int teams = 0, threads = 0;
    1955  
    1956    if (!args)
    1957      GOMP_PLUGIN_fatal ("No target arguments provided");
    1958    while (*args)
    1959      {
    1960        intptr_t id = (intptr_t) *args++, val;
    1961        if (id & GOMP_TARGET_ARG_SUBSEQUENT_PARAM)
    1962  	val = (intptr_t) *args++;
    1963        else
    1964          val = id >> GOMP_TARGET_ARG_VALUE_SHIFT;
    1965        if ((id & GOMP_TARGET_ARG_DEVICE_MASK) != GOMP_TARGET_ARG_DEVICE_ALL)
    1966  	continue;
    1967        val = val > INT_MAX ? INT_MAX : val;
    1968        id &= GOMP_TARGET_ARG_ID_MASK;
    1969        if (id == GOMP_TARGET_ARG_NUM_TEAMS)
    1970  	teams = val;
    1971        else if (id == GOMP_TARGET_ARG_THREAD_LIMIT)
    1972  	threads = val;
    1973      }
    1974    nvptx_adjust_launch_bounds (tgt_fn, ptx_dev, &teams, &threads);
    1975  
    1976    size_t stack_size = nvptx_stacks_size ();
    1977    bool reverse_offload = ptx_dev->rev_data != NULL;
    1978    CUstream copy_stream = NULL;
    1979  
    1980    pthread_mutex_lock (&ptx_dev->omp_stacks.lock);
    1981    void *stacks = nvptx_stacks_acquire (ptx_dev, stack_size, teams * threads);
    1982    void *fn_args[] = {tgt_vars, stacks, (void *) stack_size};
    1983    size_t fn_args_size = sizeof fn_args;
    1984    void *config[] = {
    1985      CU_LAUNCH_PARAM_BUFFER_POINTER, fn_args,
    1986      CU_LAUNCH_PARAM_BUFFER_SIZE, &fn_args_size,
    1987      CU_LAUNCH_PARAM_END
    1988    };
    1989    GOMP_PLUGIN_debug (0, "  %s: kernel %s: launch"
    1990  		     " [(teams: %u), 1, 1] [(lanes: 32), (threads: %u), 1]\n",
    1991  		     __FUNCTION__, fn_name, teams, threads);
    1992    if (reverse_offload)
    CUDA_CALL_ASSERT (cuStreamCreate, &copy_stream, CU_STREAM_NON_BLOCKING);
    1994    r = CUDA_CALL_NOCHECK (cuLaunchKernel, function, teams, 1, 1,
    1995  			 32, threads, 1, 0, NULL, NULL, config);
    1996    if (r != CUDA_SUCCESS)
    1997      GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));
    1998    if (reverse_offload)
    1999      while (true)
    2000        {
    2001  	r = CUDA_CALL_NOCHECK (cuStreamQuery, NULL);
    2002  	if (r == CUDA_SUCCESS)
    2003  	  break;
    2004  	if (r == CUDA_ERROR_LAUNCH_FAILED)
    2005  	  GOMP_PLUGIN_fatal ("cuStreamQuery error: %s %s\n", cuda_error (r),
    2006  			     maybe_abort_msg);
    2007  	else if (r != CUDA_ERROR_NOT_READY)
    2008  	  GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));
    2009  
    2010  	if (__atomic_load_n (&ptx_dev->rev_data->fn, __ATOMIC_ACQUIRE) != 0)
    2011  	  {
    2012  	    struct rev_offload *rev_data = ptx_dev->rev_data;
    2013  	    GOMP_PLUGIN_target_rev (rev_data->fn, rev_data->mapnum,
    2014  				    rev_data->addrs, rev_data->sizes,
    2015  				    rev_data->kinds, rev_data->dev_num,
    2016  				    rev_off_dev_to_host_cpy,
    2017  				    rev_off_host_to_dev_cpy, copy_stream);
    2018  	    CUDA_CALL_ASSERT (cuStreamSynchronize, copy_stream);
    2019  	    __atomic_store_n (&rev_data->fn, 0, __ATOMIC_RELEASE);
    2020  	  }
    2021  	usleep (1);
    2022        }
    2023    else
    2024      r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
    2025    if (reverse_offload)
    2026      CUDA_CALL_ASSERT (cuStreamDestroy, copy_stream);
    2027    if (r == CUDA_ERROR_LAUNCH_FAILED)
    2028      GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
    2029  		       maybe_abort_msg);
    2030    else if (r != CUDA_SUCCESS)
    2031      GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
    2032  
    2033    pthread_mutex_unlock (&ptx_dev->omp_stacks.lock);
    2034  }
    2035  
    2036  /* TODO: Implement GOMP_OFFLOAD_async_run. */