1  /* Copyright (C) 2015-2023 Free Software Foundation, Inc.
       2     Contributed by Alexander Monakov <amonakov@ispras.ru>
       3  
       4     This file is part of the GNU Offloading and Multi Processing Library
       5     (libgomp).
       6  
       7     Libgomp is free software; you can redistribute it and/or modify it
       8     under the terms of the GNU General Public License as published by
       9     the Free Software Foundation; either version 3, or (at your option)
      10     any later version.
      11  
      12     Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
      13     WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
      14     FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
      15     more details.
      16  
      17     Under Section 7 of GPL version 3, you are granted additional
      18     permissions described in the GCC Runtime Library Exception, version
      19     3.1, as published by the Free Software Foundation.
      20  
      21     You should have received a copy of the GNU General Public License and
      22     a copy of the GCC Runtime Library Exception along with this program;
      23     see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
      24     <http://www.gnu.org/licenses/>.  */
      25  
      26  /* This file handles maintenance of threads on NVPTX.  */
      27  
      28  #if defined __nvptx_softstack__ && defined __nvptx_unisimt__
      29  
      30  #include "libgomp.h"
      31  #include <stdlib.h>
      32  #include <string.h>
      33  
/* Pointer to the per-thread gomp_thread array for this thread block.
   The pointer itself is placed in .shared memory (via the "shared"
   attribute) so that every thread in the block can read it; the array
   it points to is allocated by the master thread in gomp_nvptx_main.  */
struct gomp_thread *nvptx_thrs __attribute__((shared,nocommon));

/* Team number for this thread block, also kept in .shared memory;
   initialized to 0 by the master thread in gomp_nvptx_main.  */
int __gomp_team_num __attribute__((shared,nocommon));

/* Idle loop entered by non-master threads; defined below.  */
static void gomp_thread_start (struct gomp_thread_pool *);
      38  
      39  
/* This externally visible function handles target region entry.  It
   sets up a per-team thread pool and transfers control by calling FN (FN_DATA)
   in the master thread or gomp_thread_start in other threads.

   The name of this function is part of the interface with the compiler: for
   each target region, GCC emits a PTX .kernel function that sets up soft-stack
   and uniform-simt state and calls this function, passing in FN the original
   function outlined for the target region.  */

void
gomp_nvptx_main (void (*fn) (void *), void *fn_data)
{
  int tid, ntids;
  /* Query this thread's index and the block's extent along the Y axis;
     the emitted .kernel launches the OpenMP threads along %tid.y.  */
  asm ("mov.u32 %0, %%tid.y;" : "=r" (tid));
  asm ("mov.u32 %0, %%ntid.y;" : "=r" (ntids));
  if (tid == 0)
    {
      /* Master thread: initialize the global ICVs and build the pool.  */
      gomp_global_icv.nthreads_var = ntids;
      gomp_global_icv.thread_limit_var = ntids;
      /* Starting additional threads is not supported.  */
      gomp_global_icv.dyn_var = true;

      __gomp_team_num = 0;
      /* Allocate the per-thread array on the master's soft stack and
	 publish it through the shared NVPTX_THRS pointer so the other
	 threads can reach it (see the else branch below).  It stays
	 live for the whole kernel since this frame never exits until
	 the target region is done.  */
      nvptx_thrs = alloca (ntids * sizeof (*nvptx_thrs));
      memset (nvptx_thrs, 0, ntids * sizeof (*nvptx_thrs));

      /* The pool and its thread-pointer array likewise live on the
	 master's soft stack for the duration of the kernel.  */
      struct gomp_thread_pool *pool = alloca (sizeof (*pool));
      pool->threads = alloca (ntids * sizeof (*pool->threads));
      for (tid = 0; tid < ntids; tid++)
	pool->threads[tid] = nvptx_thrs + tid;
      pool->threads_size = ntids;
      pool->threads_used = ntids;
      pool->threads_busy = 1;
      pool->last_team = NULL;
      gomp_simple_barrier_init (&pool->threads_dock, ntids);

      nvptx_thrs[0].thread_pool = pool;
      /* Synchronize with the matching bar.sync in the else branch:
	 NVPTX_THRS and the pool are fully initialized at this point,
	 so the other threads may now read them.  */
      asm ("bar.sync 0;");
      fn (fn_data);

      gomp_free_thread (nvptx_thrs);
    }
  else
    {
      /* Wait until the master thread has published NVPTX_THRS and the
	 pool (matching bar.sync above), then enter the idle loop.  */
      asm ("bar.sync 0;");
      gomp_thread_start (nvptx_thrs[0].thread_pool);
    }
}
      88  
/* This function contains the idle loop in which a thread waits
   to be called up to become part of a team.  POOL is the per-block
   thread pool set up by gomp_nvptx_main; this function never returns
   under normal operation (the loop condition below is always true).  */

static void
gomp_thread_start (struct gomp_thread_pool *pool)
{
  struct gomp_thread *thr = gomp_thread ();

  gomp_sem_init (&thr->release, 0);
  thr->thread_pool = pool;

  do
    {
      /* Sleep on the dock until gomp_team_start releases the pool.  */
      gomp_simple_barrier_wait (&pool->threads_dock);
      /* No work function assigned means this thread is not part of the
	 newly launched team; go back to waiting.  */
      if (!thr->fn)
	continue;
      thr->fn (thr->data);
      thr->fn = NULL;

      /* Snapshot the task pointer before the team barrier, since team
	 state may be torn down once all threads have arrived.  */
      struct gomp_task *task = thr->task;
      gomp_team_barrier_wait_final (&thr->ts.team->barrier);
      gomp_finish_task (task);
    }
  /* Work around an NVIDIA driver bug: when generating sm_50 machine code,
     it can trash stack pointer R1 in loops lacking exit edges.  Add a cheap
     artificial exit that the driver would not be able to optimize out.  */
  while (nvptx_thrs);
}
     117  
/* Launch a team.  Configure the calling (master) thread as team member 0
   of TEAM, set up the implicit tasks for all NTHREADS members, and release
   the idle threads waiting in gomp_thread_start so they pick up FN (DATA).
   FLAGS is accepted for interface compatibility but unused here, since
   starting additional threads beyond the pool is not supported on NVPTX.  */

void
gomp_team_start (void (*fn) (void *), void *data, unsigned nthreads,
		 unsigned flags, struct gomp_team *team,
		 struct gomp_taskgroup *taskgroup)
{
  struct gomp_thread *thr, *nthr;
  struct gomp_task *task;
  struct gomp_task_icv *icv;
  struct gomp_thread_pool *pool;
  unsigned long nthreads_var;

  thr = gomp_thread ();
  pool = thr->thread_pool;
  task = thr->task;
  /* ICV may alias into TASK; it is only read before/while the implicit
     tasks are initialized below.  */
  icv = task ? &task->icv : &gomp_global_icv;

  /* Always save the previous state, even if this isn't a nested team.
     In particular, we should save any work share state from an outer
     orphaned work share construct.  */
  team->prev_ts = thr->ts;

  /* The caller becomes team member 0.  */
  thr->ts.team = team;
  thr->ts.team_id = 0;
  ++thr->ts.level;
  if (nthreads > 1)
    ++thr->ts.active_level;
  thr->ts.work_share = &team->work_shares[0];
  thr->ts.last_work_share = NULL;
  thr->ts.single_count = 0;
  thr->ts.static_trip = 0;
  thr->task = &team->implicit_task[0];
  /* Snapshot nthreads_var before gomp_init_task copies *ICV into the
     implicit task, so the value can be re-applied below for each member.  */
  nthreads_var = icv->nthreads_var;
  gomp_init_task (thr->task, task, icv);
  team->implicit_task[0].icv.nthreads_var = nthreads_var;
  team->implicit_task[0].taskgroup = taskgroup;

  /* A team of one needs no helpers released.  */
  if (nthreads == 1)
    return;

  /* Release existing idle threads.  */
  for (unsigned i = 1; i < nthreads; ++i)
    {
      nthr = pool->threads[i];
      nthr->ts.team = team;
      nthr->ts.work_share = &team->work_shares[0];
      nthr->ts.last_work_share = NULL;
      nthr->ts.team_id = i;
      nthr->ts.level = team->prev_ts.level + 1;
      nthr->ts.active_level = thr->ts.active_level;
      nthr->ts.single_count = 0;
      nthr->ts.static_trip = 0;
      nthr->task = &team->implicit_task[i];
      gomp_init_task (nthr->task, task, icv);
      team->implicit_task[i].icv.nthreads_var = nthreads_var;
      team->implicit_task[i].taskgroup = taskgroup;
      /* Setting FN is what makes the thread join this team; threads
	 not assigned a function loop back to waiting (see
	 gomp_thread_start).  */
      nthr->fn = fn;
      nthr->data = data;
      team->ordered_release[i] = &nthr->release;
    }

  /* Wake everyone parked on the dock; must come after all member state
     above is fully initialized.  */
  gomp_simple_barrier_wait (&pool->threads_dock);
}
     182  
     183  int
     184  gomp_pause_host (void)
     185  {
     186    return -1;
     187  }
     188  
     189  #include "../../team.c"
     190  #endif