gcc-13.2.0/libgomp/iter.c
/* Copyright (C) 2005-2023 Free Software Foundation, Inc.
   Contributed by Richard Henderson <rth@redhat.com>.

   This file is part of the GNU Offloading and Multi Processing Library
   (libgomp).

   Libgomp is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
   more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* This file contains routines for managing work-share iteration, both
   for loops and sections.  */

#include "libgomp.h"
#include <stdlib.h>


/* This function implements the STATIC scheduling method.  The caller should
   iterate *pstart <= x < *pend.  Return zero if there are more iterations
   to perform; nonzero if not.  Return less than 0 if this thread had
   received the absolutely last iteration.  */

int
gomp_iter_static_next (long *pstart, long *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_team *team = thr->ts.team;
  struct gomp_work_share *ws = thr->ts.work_share;
  unsigned long nthreads = team ? team->nthreads : 1;

  if (thr->ts.static_trip == -1)
    return -1;

  /* Quick test for degenerate teams and orphaned constructs.  */
  if (nthreads == 1)
    {
      *pstart = ws->next;
      *pend = ws->end;
      thr->ts.static_trip = -1;
      return ws->next == ws->end;
    }

  /* We interpret chunk_size zero as "unspecified", which means that we
     should break up the iterations such that each thread makes only one
     trip through the outer loop.  */
  if (ws->chunk_size == 0)
    {
      unsigned long n, q, i, t;
      unsigned long s0, e0;
      long s, e;

      if (thr->ts.static_trip > 0)
        return 1;

      /* Compute the total number of iterations.  */
      s = ws->incr + (ws->incr > 0 ? -1 : 1);
      n = (ws->end - ws->next + s) / ws->incr;
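      /* The incr-1 adjustment (in magnitude) above makes the signed
         division round up, i.e. n = ceil((end - next) / incr).  For
         example, next = 0, end = 7, incr = 2 gives n = (7 + 1) / 2 = 4,
         covering the iterations 0, 2, 4 and 6.  */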
      i = thr->ts.team_id;

      /* Compute the "zero-based" start and end points.  That is, as
         if the loop began at zero and incremented by one.  */
      q = n / nthreads;
      t = n % nthreads;
      if (i < t)
        {
          t = 0;
          q++;
        }
      s0 = q * i + t;
      e0 = s0 + q;
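      /* For example, n = 10 and nthreads = 3 gives q = 3, t = 1, so
         team_id 0 takes [0,4) while team_ids 1 and 2 take [4,7) and
         [7,10) respectively.  */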

      /* Notice when no iterations are allocated to this thread.  */
      if (s0 >= e0)
        {
          thr->ts.static_trip = 1;
          return 1;
        }

      /* Transform these to the actual start and end numbers.  */
      s = (long)s0 * ws->incr + ws->next;
      e = (long)e0 * ws->incr + ws->next;

      *pstart = s;
      *pend = e;
      thr->ts.static_trip = (e0 == n ? -1 : 1);
      return 0;
    }
  else
    {
      unsigned long n, s0, e0, i, c;
      long s, e;

      /* Otherwise, each thread gets exactly chunk_size iterations
         (if available) each time through the loop.  */

      s = ws->incr + (ws->incr > 0 ? -1 : 1);
      n = (ws->end - ws->next + s) / ws->incr;
      i = thr->ts.team_id;
      c = ws->chunk_size;

      /* As an initial guess, this trip's chunk is the C-sized block that
         sits static_trip * nthreads + i chunks into the iteration space.  */
      s0 = (thr->ts.static_trip * nthreads + i) * c;
      e0 = s0 + c;
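      /* For example, chunk_size = 4, nthreads = 2, team_id = 1: trip 0
         covers zero-based iterations [4,8), trip 1 covers [12,16), and
         so on.  */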

      /* Detect overflow.  */
      if (s0 >= n)
        return 1;
      if (e0 > n)
        e0 = n;

      /* Transform these to the actual start and end numbers.  */
      s = (long)s0 * ws->incr + ws->next;
      e = (long)e0 * ws->incr + ws->next;

      *pstart = s;
      *pend = e;

      if (e0 == n)
        thr->ts.static_trip = -1;
      else
        thr->ts.static_trip++;
      return 0;
    }
}
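
/* A rough sketch of how a caller might drive the static iterator (the
   schedule(static) paths in loop.c work along these lines); here "incr"
   stands for the caller's loop increment and a positive increment is
   assumed:

     long start, end;
     while (gomp_iter_static_next (&start, &end) == 0)
       for (long i = start; i < end; i += incr)
         ... loop body ...

   With chunk_size == 0 the while loop hands out at most one range per
   thread; with a nonzero chunk_size it hands out successive chunk_size
   sized pieces spaced nthreads chunks apart.  */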


/* This function implements the DYNAMIC scheduling method.  Arguments are
   as for gomp_iter_static_next.  This function must be called with ws->lock
   held.  */

bool
gomp_iter_dynamic_next_locked (long *pstart, long *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  long start, end, chunk, left;

  start = ws->next;
  if (start == ws->end)
    return false;

  chunk = ws->chunk_size;
  left = ws->end - start;
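  /* Clamp the chunk so it does not run past ws->end.  For dynamic
     scheduling the loop initialization code stores chunk_size already
     multiplied by the increment, so with a negative increment both chunk
     and left are negative and the sense of the comparison flips.  */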
  if (ws->incr < 0)
    {
      if (chunk < left)
        chunk = left;
    }
  else
    {
      if (chunk > left)
        chunk = left;
    }
  end = start + chunk;

  ws->next = end;
  *pstart = start;
  *pend = end;
  return true;
}


#ifdef HAVE_SYNC_BUILTINS
/* Similar, but doesn't require the lock held, and uses compare-and-swap
   instead.  Note that the only memory value that changes is ws->next.  */

bool
gomp_iter_dynamic_next (long *pstart, long *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  long start, end, nend, chunk, incr;

  end = ws->end;
  incr = ws->incr;
  chunk = ws->chunk_size;

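  /* When ws->mode is nonzero, the loop initialization code has verified
     that ws->next cannot overflow even if every thread unconditionally
     adds its chunk, so a single fetch-and-add suffices.  */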
  if (__builtin_expect (ws->mode, 1))
    {
      long tmp = __sync_fetch_and_add (&ws->next, chunk);
      if (incr > 0)
        {
          if (tmp >= end)
            return false;
          nend = tmp + chunk;
          if (nend > end)
            nend = end;
          *pstart = tmp;
          *pend = nend;
          return true;
        }
      else
        {
          if (tmp <= end)
            return false;
          nend = tmp + chunk;
          if (nend < end)
            nend = end;
          *pstart = tmp;
          *pend = nend;
          return true;
        }
    }

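  /* Otherwise fall back to a compare-and-swap loop: each chunk is clamped
     to the iterations that remain, so ws->next never moves past ws->end,
     and a failed CAS retries with the value another thread installed.  */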
  start = __atomic_load_n (&ws->next, MEMMODEL_RELAXED);
  while (1)
    {
      long left = end - start;
      long tmp;

      if (start == end)
        return false;

      if (incr < 0)
        {
          if (chunk < left)
            chunk = left;
        }
      else
        {
          if (chunk > left)
            chunk = left;
        }
      nend = start + chunk;

      tmp = __sync_val_compare_and_swap (&ws->next, start, nend);
      if (__builtin_expect (tmp == start, 1))
        break;

      start = tmp;
    }

  *pstart = start;
  *pend = nend;
  return true;
}
#endif /* HAVE_SYNC_BUILTINS */


/* This function implements the GUIDED scheduling method.  Arguments are
   as for gomp_iter_static_next.  This function must be called with the
   work share lock held.  */

bool
gomp_iter_guided_next_locked (long *pstart, long *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_team *team = thr->ts.team;
  unsigned long nthreads = team ? team->nthreads : 1;
  unsigned long n, q;
  long start, end;

  if (ws->next == ws->end)
    return false;

  start = ws->next;
  n = (ws->end - start) / ws->incr;
  q = (n + nthreads - 1) / nthreads;
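  /* For example, with 100 iterations remaining and 4 threads, successive
     chunks are 25, 19, 14, 11, ... (roughly a quarter of whatever is
     left), until the clamp to chunk_size below takes over.  */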

  if (q < ws->chunk_size)
    q = ws->chunk_size;
  if (q <= n)
    end = start + q * ws->incr;
  else
    end = ws->end;

  ws->next = end;
  *pstart = start;
  *pend = end;
  return true;
}

#ifdef HAVE_SYNC_BUILTINS
/* Similar, but doesn't require the lock held, and uses compare-and-swap
   instead.  Note that the only memory value that changes is ws->next.  */

bool
gomp_iter_guided_next (long *pstart, long *pend)
{
  struct gomp_thread *thr = gomp_thread ();
  struct gomp_work_share *ws = thr->ts.work_share;
  struct gomp_team *team = thr->ts.team;
  unsigned long nthreads = team ? team->nthreads : 1;
  long start, end, nend, incr;
  unsigned long chunk_size;

  start = __atomic_load_n (&ws->next, MEMMODEL_RELAXED);
  end = ws->end;
  incr = ws->incr;
  chunk_size = ws->chunk_size;

  while (1)
    {
      unsigned long n, q;
      long tmp;

      if (start == end)
        return false;

      n = (end - start) / incr;
      q = (n + nthreads - 1) / nthreads;

      if (q < chunk_size)
        q = chunk_size;
      if (__builtin_expect (q <= n, 1))
        nend = start + q * incr;
      else
        nend = end;

      tmp = __sync_val_compare_and_swap (&ws->next, start, nend);
      if (__builtin_expect (tmp == start, 1))
        break;

      start = tmp;
    }

  *pstart = start;
  *pend = nend;
  return true;
}
#endif /* HAVE_SYNC_BUILTINS */