(root)/
gcc-13.2.0/
libgomp/
config/
nvptx/
bar.c
/* Copyright (C) 2015-2023 Free Software Foundation, Inc.
   Contributed by Alexander Monakov <amonakov@ispras.ru>

   This file is part of the GNU Offloading and Multi Processing Library
   (libgomp).

   Libgomp is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
   more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */
/* This is an NVPTX-specific implementation of a barrier synchronization
   mechanism for libgomp.  This type is private to the library.  The
   implementation uses atomic instructions and the bar.sync instruction.  */

#include <limits.h>
#include "libgomp.h"

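/* PTX barriers count individual hardware threads, and in this port each
   OpenMP thread occupies a full 32-lane warp, so the expected arrival
   count passed to the barrier instructions below is 32 * bar->total;
   all of them operate on logical barrier number 1.  */
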
void
gomp_barrier_wait_end (gomp_barrier_t *bar, gomp_barrier_state_t state)
{
  if (__builtin_expect (state & BAR_WAS_LAST, 0))
    {
      /* Next time we'll be awaiting TOTAL threads again.  */
      bar->awaited = bar->total;
      __atomic_store_n (&bar->generation, bar->generation + BAR_INCR,
			MEMMODEL_RELEASE);
    }
  if (bar->total > 1)
    asm ("bar.sync 1, %0;" : : "r" (32 * bar->total));
}

void
gomp_barrier_wait (gomp_barrier_t *bar)
{
  gomp_barrier_wait_end (bar, gomp_barrier_wait_start (bar));
}

/* Like gomp_barrier_wait, except that if the encountering thread
   is not the last one to hit the barrier, it returns immediately.
   The intended usage is that a thread which intends to gomp_barrier_destroy
   this barrier calls gomp_barrier_wait, while all other threads
   call gomp_barrier_wait_last.  When gomp_barrier_wait returns,
   the barrier can be safely destroyed.  */

void
gomp_barrier_wait_last (gomp_barrier_t *bar)
{
  /* The behavior described above matches 'bar.arrive' perfectly.  */
  if (bar->total > 1)
    asm ("bar.arrive 1, %0;" : : "r" (32 * bar->total));
}
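
/* A minimal sketch of the protocol described above (illustrative only;
   the choice of team_id 0 as the destroying thread and the destroy call
   itself are assumptions, not code taken from libgomp's team handling):

     if (gomp_thread ()->ts.team_id == 0)
       {
	 gomp_barrier_wait (bar);	// returns once all have arrived
	 gomp_barrier_destroy (bar);	// now safe to tear down
       }
     else
       gomp_barrier_wait_last (bar);	// arrive without waiting
*/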

/* Team barriers are implemented mainly using 'bar.red.or', which combines a
   bar.sync operation with an OR-reduction of "team->task_count != 0" across
   all threads.  Task processing is done only after synchronization, and only
   if task_count was non-zero in at least one of the team's threads.

   This use of simple barriers, with task processing queued until the end,
   is deemed more efficient for GPUs in the common offloading case than
   implementing futex-wait/wake operations to process tasks concurrently in
   the CPU-thread manner (which is not easy to implement efficiently on
   GPUs).  */

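/* Conceptually, each such team barrier behaves roughly like the following
   PTX (a sketch, not the exact code GCC emits for
   __builtin_nvptx_bar_red_or):

     setp.ne.u32     %p, task_count, 0;
     bar.red.or.pred %q, 1, threads, %p;   // sync on barrier 1, OR the %p's

   Every thread then receives the combined predicate %q and enters task
   processing only if some thread in the team saw pending tasks.  */
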
void
gomp_team_barrier_wait_end (gomp_barrier_t *bar, gomp_barrier_state_t state)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;

  bool run_tasks = (team->task_count != 0);
  if (bar->total > 1)
    run_tasks = __builtin_nvptx_bar_red_or (1, 32 * bar->total, true,
					    (team->task_count != 0));

  if (__builtin_expect (state & BAR_WAS_LAST, 0))
    {
      /* Next time we'll be awaiting TOTAL threads again.  */
      bar->awaited = bar->total;
      team->work_share_cancelled = 0;
    }

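  /* If any thread in the team saw pending tasks, all threads help drain
     them, then re-synchronize so that nobody leaves the barrier while
     tasks are still being handled.  */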
  if (__builtin_expect (run_tasks == true, 0))
    {
      while (__atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE)
	     & BAR_TASK_PENDING)
	gomp_barrier_handle_tasks (state);

      if (bar->total > 1)
	asm volatile ("bar.sync 1, %0;" : : "r" (32 * bar->total));
    }
}

void
gomp_team_barrier_wait (gomp_barrier_t *bar)
{
  gomp_team_barrier_wait_end (bar, gomp_barrier_wait_start (bar));
}

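/* Variant used for the team's final barrier; the last thread to arrive
   additionally replenishes the separately tracked awaited_final counter
   consumed by gomp_barrier_wait_final_start.  */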
void
gomp_team_barrier_wait_final (gomp_barrier_t *bar)
{
  gomp_barrier_state_t state = gomp_barrier_wait_final_start (bar);
  if (__builtin_expect (state & BAR_WAS_LAST, 0))
    bar->awaited_final = bar->total;
  gomp_team_barrier_wait_end (bar, state);
}

/* See also comments for gomp_team_barrier_wait_end.  */
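/* Returns true if this barrier has been cancelled, false after a normal
   barrier completion.  */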

bool
gomp_team_barrier_wait_cancel_end (gomp_barrier_t *bar,
				   gomp_barrier_state_t state)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;

  bool run_tasks = (team->task_count != 0);
  if (bar->total > 1)
    run_tasks = __builtin_nvptx_bar_red_or (1, 32 * bar->total, true,
					    (team->task_count != 0));
  if (state & BAR_CANCELLED)
    return true;

  if (__builtin_expect (state & BAR_WAS_LAST, 0))
    {
      /* Note: BAR_CANCELLED should never be set in state here, because
	 cancellation means that at least one of the threads has been
	 cancelled, thus on a cancellable barrier we should never see
	 all threads arrive.  */

      /* Next time we'll be awaiting TOTAL threads again.  */
      bar->awaited = bar->total;
      team->work_share_cancelled = 0;
    }

  if (__builtin_expect (run_tasks == true, 0))
    {
      while (__atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE)
	     & BAR_TASK_PENDING)
	gomp_barrier_handle_tasks (state);

      if (bar->total > 1)
	asm volatile ("bar.sync 1, %0;" : : "r" (32 * bar->total));
    }

  return false;
}

bool
gomp_team_barrier_wait_cancel (gomp_barrier_t *bar)
{
  return gomp_team_barrier_wait_cancel_end (bar, gomp_barrier_wait_start (bar));
}

void
gomp_team_barrier_cancel (struct gomp_team *team)
{
  gomp_mutex_lock (&team->task_lock);
  if (team->barrier.generation & BAR_CANCELLED)
    {
      gomp_mutex_unlock (&team->task_lock);
      return;
    }
  team->barrier.generation |= BAR_CANCELLED;
  gomp_mutex_unlock (&team->task_lock);

  /* The 'exit' instruction cancels this thread and also fulfills the
     barrier requirements of any other CTA threads waiting on barriers.  */
  asm volatile ("exit;");
}