gcc-13.2.0/libgomp/iter_ull.c
/* Copyright (C) 2005-2023 Free Software Foundation, Inc.
   Contributed by Richard Henderson <rth@redhat.com>.

   This file is part of the GNU Offloading and Multi Processing Library
   (libgomp).

   Libgomp is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
   more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* This file contains routines for managing work-share iteration, both
   for loops and sections.  */

#include "libgomp.h"
#include <stdlib.h>

typedef unsigned long long gomp_ull;

/* This function implements the STATIC scheduling method.  The caller should
   iterate *pstart <= x < *pend.  Return zero if there are more iterations
   to perform; nonzero if not.  Return less than zero if this thread
   received the very last iteration.  */
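
/* An illustrative sketch (not itself part of libgomp) of how these
   iterators are normally driven: compiler-generated code calls the
   GOMP_loop_ull_* entry points in loop_ull.c, which in turn call the
   gomp_iter_ull_* routines below.  For an upward loop the pattern is
   roughly

	if (GOMP_loop_ull_static_start (true, start, end, incr, chunk,
					&istart, &iend))
	  do
	    for (i = istart; i < iend; i += incr)
	      body (i);
	  while (GOMP_loop_ull_static_next (&istart, &iend));
	GOMP_loop_end ();

   Only the [*pstart, *pend) ranges are computed here; the loop body
   stays in the caller.  */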

int
gomp_iter_ull_static_next (gomp_ull *pstart, gomp_ull *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_work_share *ws = thr->ts.work_share;
  unsigned long nthreads = team ? team->nthreads : 1;

  if (thr->ts.static_trip == -1)
    return -1;

  /* Quick test for degenerate teams and orphaned constructs.  */
  if (nthreads == 1)
    {
      *pstart = ws->next_ull;
      *pend = ws->end_ull;
      thr->ts.static_trip = -1;
      return ws->next_ull == ws->end_ull;
    }

  /* We interpret chunk_size zero as "unspecified", which means that we
     should break up the iterations such that each thread makes only one
     trip through the outer loop.  */
  if (ws->chunk_size_ull == 0)
    {
      gomp_ull n, q, i, t, s0, e0, s, e;

      if (thr->ts.static_trip > 0)
        return 1;

      /* Compute the total number of iterations.  */
      if (__builtin_expect (ws->mode, 0) == 0)
        n = (ws->end_ull - ws->next_ull + ws->incr_ull - 1) / ws->incr_ull;
      else
        n = (ws->next_ull - ws->end_ull - ws->incr_ull - 1) / -ws->incr_ull;
      i = thr->ts.team_id;

      /* Compute the "zero-based" start and end points.  That is, as
         if the loop began at zero and incremented by one.  */
      q = n / nthreads;
      t = n % nthreads;
      if (i < t)
        {
          t = 0;
          q++;
        }
      s0 = q * i + t;
      e0 = s0 + q;
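      /* Worked example: with n = 10 iterations and nthreads = 4, q = 2 and
         t = 2, so threads 0 and 1 receive the zero-based ranges [0,3) and
         [3,6), while threads 2 and 3 receive [6,8) and [8,10).  */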

      /* Notice when no iterations are allocated to this thread.  */
      if (s0 >= e0)
        {
          thr->ts.static_trip = 1;
          return 1;
        }

      /* Transform these to the actual start and end numbers.  */
      s = s0 * ws->incr_ull + ws->next_ull;
      e = e0 * ws->incr_ull + ws->next_ull;

      *pstart = s;
      *pend = e;
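      /* Record whether this thread's range ends at the very last iteration:
         -1 makes the next call return -1 (see the check at the top);
         otherwise mark that this thread's single trip has been handed out.  */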
      thr->ts.static_trip = (e0 == n ? -1 : 1);
      return 0;
    }
  else
    {
      gomp_ull n, s0, e0, i, c, s, e;

      /* Otherwise, each thread gets exactly chunk_size iterations
         (if available) each time through the loop.  */

      if (__builtin_expect (ws->mode, 0) == 0)
        n = (ws->end_ull - ws->next_ull + ws->incr_ull - 1) / ws->incr_ull;
      else
        n = (ws->next_ull - ws->end_ull - ws->incr_ull - 1) / -ws->incr_ull;
      i = thr->ts.team_id;
      c = ws->chunk_size_ull;

      /* Each trip hands this thread a C sized chunk that starts
         (static_trip * nthreads + team_id) chunks into the zero-based
         iteration space; the guess is trimmed to the true end below.  */
      s0 = (thr->ts.static_trip * (gomp_ull) nthreads + i) * c;
      e0 = s0 + c;
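      /* Worked example: with c = 4, nthreads = 3 and team_id i = 1, trip 0
         covers the zero-based range [4,8), trip 1 covers [16,20), and so
         on, each trip skipping over the other threads' chunks.  */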

      /* Detect when this chunk falls past the end of the iteration space.  */
      if (s0 >= n)
        return 1;
      if (e0 > n)
        e0 = n;

      /* Transform these to the actual start and end numbers.  */
      s = s0 * ws->incr_ull + ws->next_ull;
      e = e0 * ws->incr_ull + ws->next_ull;

      *pstart = s;
      *pend = e;

      if (e0 == n)
        thr->ts.static_trip = -1;
      else
        thr->ts.static_trip++;
      return 0;
    }
}


/* This function implements the DYNAMIC scheduling method.  Arguments are
   as for gomp_iter_ull_static_next.  This function must be called with
   ws->lock held.  */

bool
gomp_iter_ull_dynamic_next_locked (gomp_ull *pstart, gomp_ull *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  gomp_ull start, end, chunk, left;

  start = ws->next_ull;
  if (start == ws->end_ull)
    return false;

  chunk = ws->chunk_size_ull;
  left = ws->end_ull - start;
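  /* ws->mode & 2 is set (by the loop initialization code) for loops that
     iterate downward; chunk and left are then unsigned encodings of
     negative values, so the "smaller" value is the one with the larger
     magnitude and the clamping test below is reversed.  */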
  if (__builtin_expect (ws->mode & 2, 0))
    {
      if (chunk < left)
        chunk = left;
    }
  else
    {
      if (chunk > left)
        chunk = left;
    }
  end = start + chunk;

  ws->next_ull = end;
  *pstart = start;
  *pend = end;
  return true;
}


#if defined HAVE_SYNC_BUILTINS && defined __LP64__
/* Similar, but doesn't require the lock held, and uses compare-and-swap
   instead.  Note that the only memory value that changes is ws->next_ull.  */
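
/* Two paths: when ws->mode & 1 is set, the loop initialization code has
   established that adding a few extra chunks cannot wrap past the usable
   range, so a single __sync_fetch_and_add hands out the next chunk;
   otherwise fall back to a compare-and-swap loop that recomputes the
   chunk from whatever is left each time the CAS loses a race.  */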

bool
gomp_iter_ull_dynamic_next (gomp_ull *pstart, gomp_ull *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  gomp_ull start, end, nend, chunk;

  end = ws->end_ull;
  chunk = ws->chunk_size_ull;

  if (__builtin_expect (ws->mode & 1, 1))
    {
      gomp_ull tmp = __sync_fetch_and_add (&ws->next_ull, chunk);
      if (__builtin_expect (ws->mode & 2, 0) == 0)
        {
          if (tmp >= end)
            return false;
          nend = tmp + chunk;
          if (nend > end)
            nend = end;
          *pstart = tmp;
          *pend = nend;
          return true;
        }
      else
        {
          if (tmp <= end)
            return false;
          nend = tmp + chunk;
          if (nend < end)
            nend = end;
          *pstart = tmp;
          *pend = nend;
          return true;
        }
    }

  start = __atomic_load_n (&ws->next_ull, MEMMODEL_RELAXED);
  while (1)
    {
      gomp_ull left = end - start;
      gomp_ull tmp;

      if (start == end)
        return false;

      if (__builtin_expect (ws->mode & 2, 0))
        {
          if (chunk < left)
            chunk = left;
        }
      else
        {
          if (chunk > left)
            chunk = left;
        }
      nend = start + chunk;

      tmp = __sync_val_compare_and_swap (&ws->next_ull, start, nend);
      if (__builtin_expect (tmp == start, 1))
        break;

      start = tmp;
    }

  *pstart = start;
  *pend = nend;
  return true;
}
#endif /* HAVE_SYNC_BUILTINS */


/* This function implements the GUIDED scheduling method.  Arguments are
   as for gomp_iter_ull_static_next.  This function must be called with the
   work share lock held.  */
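
/* Illustrative numbers: with 100 iterations remaining, nthreads = 4,
   chunk_size 1 and incr 1, successive calls hand out chunks of 25, 19,
   14, 11, ... iterations, shrinking toward chunk_size as the remaining
   work decreases.  */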

bool
gomp_iter_ull_guided_next_locked (gomp_ull *pstart, gomp_ull *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_team *team = thr->ts.team;
  gomp_ull nthreads = team ? team->nthreads : 1;
  gomp_ull n, q;
  gomp_ull start, end;

  if (ws->next_ull == ws->end_ull)
    return false;

  start = ws->next_ull;
  if (__builtin_expect (ws->mode, 0) == 0)
    n = (ws->end_ull - start) / ws->incr_ull;
  else
    n = (start - ws->end_ull) / -ws->incr_ull;
  q = (n + nthreads - 1) / nthreads;

  if (q < ws->chunk_size_ull)
    q = ws->chunk_size_ull;
  if (q <= n)
    end = start + q * ws->incr_ull;
  else
    end = ws->end_ull;

  ws->next_ull = end;
  *pstart = start;
  *pend = end;
  return true;
}

#if defined HAVE_SYNC_BUILTINS && defined __LP64__
/* Similar, but doesn't require the lock held, and uses compare-and-swap
   instead.  Note that the only memory value that changes is ws->next_ull.  */
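
/* As in gomp_iter_ull_dynamic_next, when the compare-and-swap below loses
   a race it retries from the value another thread published, recomputing
   the guided chunk from the iterations that are still left.  */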

bool
gomp_iter_ull_guided_next (gomp_ull *pstart, gomp_ull *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_team *team = thr->ts.team;
  gomp_ull nthreads = team ? team->nthreads : 1;
  gomp_ull start, end, nend, incr;
  gomp_ull chunk_size;

  start = __atomic_load_n (&ws->next_ull, MEMMODEL_RELAXED);
  end = ws->end_ull;
  incr = ws->incr_ull;
  chunk_size = ws->chunk_size_ull;

  while (1)
    {
      gomp_ull n, q;
      gomp_ull tmp;

      if (start == end)
        return false;

      if (__builtin_expect (ws->mode, 0) == 0)
        n = (end - start) / incr;
      else
        n = (start - end) / -incr;
      q = (n + nthreads - 1) / nthreads;

      if (q < chunk_size)
        q = chunk_size;
      if (__builtin_expect (q <= n, 1))
        nend = start + q * incr;
      else
        nend = end;

      tmp = __sync_val_compare_and_swap (&ws->next_ull, start, nend);
      if (__builtin_expect (tmp == start, 1))
        break;

      start = tmp;
    }

  *pstart = start;
  *pend = nend;
  return true;
}
#endif /* HAVE_SYNC_BUILTINS */