gcc-13.2.0/libgomp/loop.c
       1  /* Copyright (C) 2005-2023 Free Software Foundation, Inc.
       2     Contributed by Richard Henderson <rth@redhat.com>.
       3  
       4     This file is part of the GNU Offloading and Multi Processing Library
       5     (libgomp).
       6  
       7     Libgomp is free software; you can redistribute it and/or modify it
       8     under the terms of the GNU General Public License as published by
       9     the Free Software Foundation; either version 3, or (at your option)
      10     any later version.
      11  
      12     Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
      13     WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
      14     FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
      15     more details.
      16  
      17     Under Section 7 of GPL version 3, you are granted additional
      18     permissions described in the GCC Runtime Library Exception, version
      19     3.1, as published by the Free Software Foundation.
      20  
      21     You should have received a copy of the GNU General Public License and
      22     a copy of the GCC Runtime Library Exception along with this program;
      23     see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
      24     <http://www.gnu.org/licenses/>.  */
      25  
      26  /* This file handles the LOOP (FOR/DO) construct.  */
      27  
      28  #include <limits.h>
      29  #include <stdlib.h>
      30  #include <string.h>
      31  #include "libgomp.h"
      32  
      33  
      34  ialias (GOMP_loop_runtime_next)
      35  ialias_redirect (GOMP_taskgroup_reduction_register)
      36  
      37  /* Initialize the given work share construct from the given arguments.  */
      38  
      39  static inline void
      40  gomp_loop_init (struct gomp_work_share *ws, long start, long end, long incr,
      41  		enum gomp_schedule_type sched, long chunk_size)
      42  {
      43    ws->sched = sched;
      44    ws->chunk_size = chunk_size;
      45    /* Canonicalize loops that have zero iterations to ->next == ->end.  */
      46    ws->end = ((incr > 0 && start > end) || (incr < 0 && start < end))
      47  	    ? start : end;
      48    ws->incr = incr;
      49    ws->next = start;
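  /* For example, START 10, END 5, INCR 1 describes a loop with no
     iterations: the canonicalization above stores ws->end = 10, so that
     ws->next == ws->end and the iterator routines hand out no work.  */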
      50    if (sched == GFS_DYNAMIC)
      51      {
      52        ws->chunk_size *= incr;
      53  
      54  #ifdef HAVE_SYNC_BUILTINS
      55        {
      56  	/* For dynamic scheduling prepare things to make each iteration
      57  	   faster.  */
      58  	struct gomp_thread *thr = gomp_thread ();
      59  	struct gomp_team *team = thr->ts.team;
      60  	long nthreads = team ? team->nthreads : 1;
      61  
      62  	if (__builtin_expect (incr > 0, 1))
      63  	  {
      64  	    /* Cheap overflow protection.  */
      65  	    if (__builtin_expect ((nthreads | ws->chunk_size)
      66  				  >= 1UL << (sizeof (long)
      67  					     * __CHAR_BIT__ / 2 - 1), 0))
      68  	      ws->mode = 0;
      69  	    else
      70  	      ws->mode = ws->end < (LONG_MAX
      71  				    - (nthreads + 1) * ws->chunk_size);
      72  	  }
      73  	/* Cheap overflow protection.  */
      74  	else if (__builtin_expect ((nthreads | -ws->chunk_size)
      75  				   >= 1UL << (sizeof (long)
      76  					      * __CHAR_BIT__ / 2 - 1), 0))
      77  	  ws->mode = 0;
      78  	else
      79  	  ws->mode = ws->end > (nthreads + 1) * -ws->chunk_size - LONG_MAX;
      80        }
      81  #endif
      82      }
      83  }
      84  
      85  /* The *_start routines are called when first encountering a loop construct
      86     that is not bound directly to a parallel construct.  The first thread
      87     that arrives will create the work-share construct; subsequent threads
      88     will see the construct exists and allocate work from it.
      89  
      90     START, END, INCR are the bounds of the loop; due to the restrictions of
      91     OpenMP, these values must be the same in every thread.  This is not
      92     verified (nor is it entirely verifiable, since START is not necessarily
      93     retained intact in the work-share data structure).  CHUNK_SIZE is the
      94     scheduling parameter; again this must be identical in all threads.
      95  
      96     Returns true if there's any work for this thread to perform.  If so,
      97     *ISTART and *IEND are filled with the bounds of the iteration block
      98     allocated to this thread.  Returns false if all work was assigned to
      99     other threads prior to this thread's arrival.  */
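
/* As an illustration of the contract above, compiler-generated code for
   something like
     #pragma omp for schedule(dynamic, 4)
     for (i = 0; i < n; i++) body (i);
   (where `body' and `n' are placeholders) would drive these entry points
   roughly as sketched below; the exact code GCC emits may differ.  */
#if 0
static void
example_dynamic_for (long n)
{
  long istart, iend;
  if (GOMP_loop_dynamic_start (0, n, 1, 4, &istart, &iend))
    do
      {
        /* Each returned block is the half-open range [istart, iend).  */
        for (long i = istart; i < iend; i++)
          body (i);
      }
    while (GOMP_loop_dynamic_next (&istart, &iend));
  /* Every thread calls GOMP_loop_end, which also provides the barrier.  */
  GOMP_loop_end ();
}
#endif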
     100  
     101  static bool
     102  gomp_loop_static_start (long start, long end, long incr, long chunk_size,
     103  			long *istart, long *iend)
     104  {
     105    struct gomp_thread *thr = gomp_thread ();
     106  
     107    thr->ts.static_trip = 0;
     108    if (gomp_work_share_start (0))
     109      {
     110        gomp_loop_init (thr->ts.work_share, start, end, incr,
     111  		      GFS_STATIC, chunk_size);
     112        gomp_work_share_init_done ();
     113      }
     114  
     115    return !gomp_iter_static_next (istart, iend);
     116  }
     117  
      118  /* The current dynamic implementation is always monotonic.  The
      119     entry points without nonmonotonic in their names must always be
      120     monotonic, but the nonmonotonic ones could be changed to use
      121     work-stealing for improved scalability.  */
     122  
     123  static bool
     124  gomp_loop_dynamic_start (long start, long end, long incr, long chunk_size,
     125  			 long *istart, long *iend)
     126  {
     127    struct gomp_thread *thr = gomp_thread ();
     128    bool ret;
     129  
     130    if (gomp_work_share_start (0))
     131      {
     132        gomp_loop_init (thr->ts.work_share, start, end, incr,
     133  		      GFS_DYNAMIC, chunk_size);
     134        gomp_work_share_init_done ();
     135      }
     136  
     137  #ifdef HAVE_SYNC_BUILTINS
     138    ret = gomp_iter_dynamic_next (istart, iend);
     139  #else
     140    gomp_mutex_lock (&thr->ts.work_share->lock);
     141    ret = gomp_iter_dynamic_next_locked (istart, iend);
     142    gomp_mutex_unlock (&thr->ts.work_share->lock);
     143  #endif
     144  
     145    return ret;
     146  }
     147  
      148  /* Similar to the dynamic case, though the open question is how the chunk
      149     sizes could be decreased without central locking or atomics.  */
     150  
     151  static bool
     152  gomp_loop_guided_start (long start, long end, long incr, long chunk_size,
     153  			long *istart, long *iend)
     154  {
     155    struct gomp_thread *thr = gomp_thread ();
     156    bool ret;
     157  
     158    if (gomp_work_share_start (0))
     159      {
     160        gomp_loop_init (thr->ts.work_share, start, end, incr,
     161  		      GFS_GUIDED, chunk_size);
     162        gomp_work_share_init_done ();
     163      }
     164  
     165  #ifdef HAVE_SYNC_BUILTINS
     166    ret = gomp_iter_guided_next (istart, iend);
     167  #else
     168    gomp_mutex_lock (&thr->ts.work_share->lock);
     169    ret = gomp_iter_guided_next_locked (istart, iend);
     170    gomp_mutex_unlock (&thr->ts.work_share->lock);
     171  #endif
     172  
     173    return ret;
     174  }
     175  
     176  bool
     177  GOMP_loop_runtime_start (long start, long end, long incr,
     178  			 long *istart, long *iend)
     179  {
     180    struct gomp_task_icv *icv = gomp_icv (false);
     181    switch (icv->run_sched_var & ~GFS_MONOTONIC)
     182      {
     183      case GFS_STATIC:
     184        return gomp_loop_static_start (start, end, incr,
     185  				     icv->run_sched_chunk_size,
     186  				     istart, iend);
     187      case GFS_DYNAMIC:
     188        return gomp_loop_dynamic_start (start, end, incr,
     189  				      icv->run_sched_chunk_size,
     190  				      istart, iend);
     191      case GFS_GUIDED:
     192        return gomp_loop_guided_start (start, end, incr,
     193  				     icv->run_sched_chunk_size,
     194  				     istart, iend);
     195      case GFS_AUTO:
      196      /* For now map this to schedule(static); later on we could play with a
      197  	 feedback-driven choice.  */
     198        return gomp_loop_static_start (start, end, incr, 0, istart, iend);
     199      default:
     200        abort ();
     201      }
     202  }
     203  
     204  static long
     205  gomp_adjust_sched (long sched, long *chunk_size)
     206  {
     207    sched &= ~GFS_MONOTONIC;
     208    switch (sched)
     209      {
     210      case GFS_STATIC:
     211      case GFS_DYNAMIC:
     212      case GFS_GUIDED:
     213        return sched;
     214      /* GFS_RUNTIME is used for runtime schedule without monotonic
     215         or nonmonotonic modifiers on the clause.
     216         GFS_RUNTIME|GFS_MONOTONIC for runtime schedule with monotonic
     217         modifier.  */
     218      case GFS_RUNTIME:
     219      /* GFS_AUTO is used for runtime schedule with nonmonotonic
     220         modifier.  */
     221      case GFS_AUTO:
     222        {
     223  	struct gomp_task_icv *icv = gomp_icv (false);
     224  	sched = icv->run_sched_var & ~GFS_MONOTONIC;
     225  	switch (sched)
     226  	  {
     227  	  case GFS_STATIC:
     228  	  case GFS_DYNAMIC:
     229  	  case GFS_GUIDED:
     230  	    *chunk_size = icv->run_sched_chunk_size;
     231  	    break;
     232  	  case GFS_AUTO:
     233  	    sched = GFS_STATIC;
     234  	    *chunk_size = 0;
     235  	    break;
     236  	  default:
     237  	    abort ();
     238  	  }
     239  	return sched;
     240        }
     241      default:
     242        abort ();
     243      }
     244  }
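
/* For example, schedule(runtime) without a monotonic/nonmonotonic modifier
   reaches gomp_adjust_sched above as GFS_RUNTIME; if the run-sched ICV is
   guided with chunk size 7 (e.g. set via OMP_SCHEDULE="guided,7"), the
   function returns GFS_GUIDED and stores 7 into *CHUNK_SIZE, while a
   run-sched ICV of auto falls back to GFS_STATIC with chunk size 0.  */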
     245  
     246  bool
     247  GOMP_loop_start (long start, long end, long incr, long sched,
     248  		 long chunk_size, long *istart, long *iend,
     249  		 uintptr_t *reductions, void **mem)
     250  {
     251    struct gomp_thread *thr = gomp_thread ();
     252  
     253    thr->ts.static_trip = 0;
     254    if (reductions)
     255      gomp_workshare_taskgroup_start ();
     256    if (gomp_work_share_start (0))
     257      {
     258        sched = gomp_adjust_sched (sched, &chunk_size);
     259        gomp_loop_init (thr->ts.work_share, start, end, incr,
     260  		      sched, chunk_size);
     261        if (reductions)
     262  	{
     263  	  GOMP_taskgroup_reduction_register (reductions);
     264  	  thr->task->taskgroup->workshare = true;
     265  	  thr->ts.work_share->task_reductions = reductions;
     266  	}
     267        if (mem)
     268  	{
     269  	  uintptr_t size = (uintptr_t) *mem;
     270  #define INLINE_ORDERED_TEAM_IDS_OFF \
     271    ((offsetof (struct gomp_work_share, inline_ordered_team_ids)		\
     272      + __alignof__ (long long) - 1) & ~(__alignof__ (long long) - 1))
     273  	  if (sizeof (struct gomp_work_share)
     274  	      <= INLINE_ORDERED_TEAM_IDS_OFF
     275  	      || __alignof__ (struct gomp_work_share) < __alignof__ (long long)
     276  	      || size > (sizeof (struct gomp_work_share)
     277  			- INLINE_ORDERED_TEAM_IDS_OFF))
     278  	    *mem
     279  	      = (void *) (thr->ts.work_share->ordered_team_ids
     280  			  = gomp_malloc_cleared (size));
     281  	  else
     282  	    *mem = memset (((char *) thr->ts.work_share)
     283  			   + INLINE_ORDERED_TEAM_IDS_OFF, '\0', size);
     284  	}
     285        gomp_work_share_init_done ();
     286      }
     287    else
     288      {
     289        if (reductions)
     290  	{
     291  	  uintptr_t *first_reductions = thr->ts.work_share->task_reductions;
     292  	  gomp_workshare_task_reduction_register (reductions,
     293  						  first_reductions);
     294  	}
     295        if (mem)
     296  	{
     297  	  if ((offsetof (struct gomp_work_share, inline_ordered_team_ids)
     298  	       & (__alignof__ (long long) - 1)) == 0)
     299  	    *mem = (void *) thr->ts.work_share->ordered_team_ids;
     300  	  else
     301  	    {
     302  	      uintptr_t p = (uintptr_t) thr->ts.work_share->ordered_team_ids;
     303  	      p += __alignof__ (long long) - 1;
     304  	      p &= ~(__alignof__ (long long) - 1);
     305  	      *mem = (void *) p;
     306  	    }
     307  	}
     308      }
     309  
     310    if (!istart)
     311      return true;
     312    return ialias_call (GOMP_loop_runtime_next) (istart, iend);
     313  }
     314  
     315  /* The *_ordered_*_start routines are similar.  The only difference is that
     316     this work-share construct is initialized to expect an ORDERED section.  */
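
/* A sketch of how the ordered variants are typically driven (assuming the
   GOMP_ordered_start/GOMP_ordered_end entry points from ordered.c, with
   `pre' and `ordered_body' as placeholder functions): compiler-generated
   code for
     #pragma omp for ordered schedule(dynamic, 4)
   would look roughly like the following; the exact emitted code may
   differ.  */
#if 0
static void
example_ordered_dynamic_for (long n)
{
  long istart, iend;
  if (GOMP_loop_ordered_dynamic_start (0, n, 1, 4, &istart, &iend))
    do
      {
        for (long i = istart; i < iend; i++)
          {
            pre (i);
            /* The ordered region executes in iteration order.  */
            GOMP_ordered_start ();
            ordered_body (i);
            GOMP_ordered_end ();
          }
      }
    while (GOMP_loop_ordered_dynamic_next (&istart, &iend));
  GOMP_loop_end ();
}
#endif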
     317  
     318  static bool
     319  gomp_loop_ordered_static_start (long start, long end, long incr,
     320  				long chunk_size, long *istart, long *iend)
     321  {
     322    struct gomp_thread *thr = gomp_thread ();
     323  
     324    thr->ts.static_trip = 0;
     325    if (gomp_work_share_start (1))
     326      {
     327        gomp_loop_init (thr->ts.work_share, start, end, incr,
     328  		      GFS_STATIC, chunk_size);
     329        gomp_ordered_static_init ();
     330        gomp_work_share_init_done ();
     331      }
     332  
     333    return !gomp_iter_static_next (istart, iend);
     334  }
     335  
     336  static bool
     337  gomp_loop_ordered_dynamic_start (long start, long end, long incr,
     338  				 long chunk_size, long *istart, long *iend)
     339  {
     340    struct gomp_thread *thr = gomp_thread ();
     341    bool ret;
     342  
     343    if (gomp_work_share_start (1))
     344      {
     345        gomp_loop_init (thr->ts.work_share, start, end, incr,
     346  		      GFS_DYNAMIC, chunk_size);
     347        gomp_mutex_lock (&thr->ts.work_share->lock);
     348        gomp_work_share_init_done ();
     349      }
     350    else
     351      gomp_mutex_lock (&thr->ts.work_share->lock);
     352  
     353    ret = gomp_iter_dynamic_next_locked (istart, iend);
     354    if (ret)
     355      gomp_ordered_first ();
     356    gomp_mutex_unlock (&thr->ts.work_share->lock);
     357  
     358    return ret;
     359  }
     360  
     361  static bool
     362  gomp_loop_ordered_guided_start (long start, long end, long incr,
     363  				long chunk_size, long *istart, long *iend)
     364  {
     365    struct gomp_thread *thr = gomp_thread ();
     366    bool ret;
     367  
     368    if (gomp_work_share_start (1))
     369      {
     370        gomp_loop_init (thr->ts.work_share, start, end, incr,
     371  		      GFS_GUIDED, chunk_size);
     372        gomp_mutex_lock (&thr->ts.work_share->lock);
     373        gomp_work_share_init_done ();
     374      }
     375    else
     376      gomp_mutex_lock (&thr->ts.work_share->lock);
     377  
     378    ret = gomp_iter_guided_next_locked (istart, iend);
     379    if (ret)
     380      gomp_ordered_first ();
     381    gomp_mutex_unlock (&thr->ts.work_share->lock);
     382  
     383    return ret;
     384  }
     385  
     386  bool
     387  GOMP_loop_ordered_runtime_start (long start, long end, long incr,
     388  				 long *istart, long *iend)
     389  {
     390    struct gomp_task_icv *icv = gomp_icv (false);
     391    switch (icv->run_sched_var & ~GFS_MONOTONIC)
     392      {
     393      case GFS_STATIC:
     394        return gomp_loop_ordered_static_start (start, end, incr,
     395  					     icv->run_sched_chunk_size,
     396  					     istart, iend);
     397      case GFS_DYNAMIC:
     398        return gomp_loop_ordered_dynamic_start (start, end, incr,
     399  					      icv->run_sched_chunk_size,
     400  					      istart, iend);
     401      case GFS_GUIDED:
     402        return gomp_loop_ordered_guided_start (start, end, incr,
     403  					     icv->run_sched_chunk_size,
     404  					     istart, iend);
     405      case GFS_AUTO:
      406        /* For now map this to schedule(static); later on we could play with a
      407  	 feedback-driven choice.  */
     408        return gomp_loop_ordered_static_start (start, end, incr,
     409  					     0, istart, iend);
     410      default:
     411        abort ();
     412      }
     413  }
     414  
     415  bool
     416  GOMP_loop_ordered_start (long start, long end, long incr, long sched,
     417  			 long chunk_size, long *istart, long *iend,
     418  			 uintptr_t *reductions, void **mem)
     419  {
     420    struct gomp_thread *thr = gomp_thread ();
     421    size_t ordered = 1;
     422    bool ret;
     423  
     424    thr->ts.static_trip = 0;
     425    if (reductions)
     426      gomp_workshare_taskgroup_start ();
     427    if (mem)
     428      ordered += (uintptr_t) *mem;
     429    if (gomp_work_share_start (ordered))
     430      {
     431        sched = gomp_adjust_sched (sched, &chunk_size);
     432        gomp_loop_init (thr->ts.work_share, start, end, incr,
     433  		      sched, chunk_size);
     434        if (reductions)
     435  	{
     436  	  GOMP_taskgroup_reduction_register (reductions);
     437  	  thr->task->taskgroup->workshare = true;
     438  	  thr->ts.work_share->task_reductions = reductions;
     439  	}
     440        if (sched == GFS_STATIC)
     441  	gomp_ordered_static_init ();
     442        else
     443  	gomp_mutex_lock (&thr->ts.work_share->lock);
     444        gomp_work_share_init_done ();
     445      }
     446    else
     447      {
     448        if (reductions)
     449  	{
     450  	  uintptr_t *first_reductions = thr->ts.work_share->task_reductions;
     451  	  gomp_workshare_task_reduction_register (reductions,
     452  						  first_reductions);
     453  	}
     454        sched = thr->ts.work_share->sched;
     455        if (sched != GFS_STATIC)
     456  	gomp_mutex_lock (&thr->ts.work_share->lock);
     457      }
     458  
     459    if (mem)
     460      {
     461        uintptr_t p
     462  	= (uintptr_t) (thr->ts.work_share->ordered_team_ids
     463  		       + (thr->ts.team ? thr->ts.team->nthreads : 1));
     464        p += __alignof__ (long long) - 1;
     465        p &= ~(__alignof__ (long long) - 1);
     466        *mem = (void *) p;
     467      }
     468  
     469    switch (sched)
     470      {
     471      case GFS_STATIC:
     472      case GFS_AUTO:
     473        return !gomp_iter_static_next (istart, iend);
     474      case GFS_DYNAMIC:
     475        ret = gomp_iter_dynamic_next_locked (istart, iend);
     476        break;
     477      case GFS_GUIDED:
     478        ret = gomp_iter_guided_next_locked (istart, iend);
     479        break;
     480      default:
     481        abort ();
     482      }
     483  
     484    if (ret)
     485      gomp_ordered_first ();
     486    gomp_mutex_unlock (&thr->ts.work_share->lock);
     487    return ret;
     488  }
     489  
      490  /* The *_doacross_*_start routines are similar.  The only difference is that
      491     this work-share construct is initialized to expect an ORDERED(N) - DOACROSS
      492     section, the worksharing loop always iterates from 0 to COUNTS[0] - 1, and
      493     the remaining COUNTS array elements tell the library the number of
      494     iterations in the ordered inner loops.  */
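
/* A sketch of the doacross driving pattern for something like
     #pragma omp for ordered(2) schedule(dynamic, 4)
     for (i = 0; i < n; i++)
       for (j = 0; j < m; j++)
	 body (i, j);
   (with `body', `n' and `m' as placeholders): COUNTS[0] is the iteration
   count of the workshared loop and COUNTS[1] that of the inner loop.  The
   cross-iteration dependences themselves are expressed through the separate
   doacross post/wait entry points in ordered.c, which are omitted here; the
   exact code GCC emits may differ.  */
#if 0
static void
example_doacross_dynamic_for (long n, long m)
{
  long counts[2] = { n, m };
  long istart, iend;
  if (GOMP_loop_doacross_dynamic_start (2, counts, 4, &istart, &iend))
    do
      {
        for (long i = istart; i < iend; i++)
          for (long j = 0; j < m; j++)
            body (i, j);	/* Plus the doacross post/wait calls.  */
      }
    while (GOMP_loop_dynamic_next (&istart, &iend));
  GOMP_loop_end ();
}
#endif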
     495  
     496  static bool
     497  gomp_loop_doacross_static_start (unsigned ncounts, long *counts,
     498  				 long chunk_size, long *istart, long *iend)
     499  {
     500    struct gomp_thread *thr = gomp_thread ();
     501  
     502    thr->ts.static_trip = 0;
     503    if (gomp_work_share_start (0))
     504      {
     505        gomp_loop_init (thr->ts.work_share, 0, counts[0], 1,
     506  		      GFS_STATIC, chunk_size);
     507        gomp_doacross_init (ncounts, counts, chunk_size, 0);
     508        gomp_work_share_init_done ();
     509      }
     510  
     511    return !gomp_iter_static_next (istart, iend);
     512  }
     513  
     514  static bool
     515  gomp_loop_doacross_dynamic_start (unsigned ncounts, long *counts,
     516  				  long chunk_size, long *istart, long *iend)
     517  {
     518    struct gomp_thread *thr = gomp_thread ();
     519    bool ret;
     520  
     521    if (gomp_work_share_start (0))
     522      {
     523        gomp_loop_init (thr->ts.work_share, 0, counts[0], 1,
     524  		      GFS_DYNAMIC, chunk_size);
     525        gomp_doacross_init (ncounts, counts, chunk_size, 0);
     526        gomp_work_share_init_done ();
     527      }
     528  
     529  #ifdef HAVE_SYNC_BUILTINS
     530    ret = gomp_iter_dynamic_next (istart, iend);
     531  #else
     532    gomp_mutex_lock (&thr->ts.work_share->lock);
     533    ret = gomp_iter_dynamic_next_locked (istart, iend);
     534    gomp_mutex_unlock (&thr->ts.work_share->lock);
     535  #endif
     536  
     537    return ret;
     538  }
     539  
     540  static bool
     541  gomp_loop_doacross_guided_start (unsigned ncounts, long *counts,
     542  				 long chunk_size, long *istart, long *iend)
     543  {
     544    struct gomp_thread *thr = gomp_thread ();
     545    bool ret;
     546  
     547    if (gomp_work_share_start (0))
     548      {
     549        gomp_loop_init (thr->ts.work_share, 0, counts[0], 1,
     550  		      GFS_GUIDED, chunk_size);
     551        gomp_doacross_init (ncounts, counts, chunk_size, 0);
     552        gomp_work_share_init_done ();
     553      }
     554  
     555  #ifdef HAVE_SYNC_BUILTINS
     556    ret = gomp_iter_guided_next (istart, iend);
     557  #else
     558    gomp_mutex_lock (&thr->ts.work_share->lock);
     559    ret = gomp_iter_guided_next_locked (istart, iend);
     560    gomp_mutex_unlock (&thr->ts.work_share->lock);
     561  #endif
     562  
     563    return ret;
     564  }
     565  
     566  bool
     567  GOMP_loop_doacross_runtime_start (unsigned ncounts, long *counts,
     568  				  long *istart, long *iend)
     569  {
     570    struct gomp_task_icv *icv = gomp_icv (false);
     571    switch (icv->run_sched_var & ~GFS_MONOTONIC)
     572      {
     573      case GFS_STATIC:
     574        return gomp_loop_doacross_static_start (ncounts, counts,
     575  					      icv->run_sched_chunk_size,
     576  					      istart, iend);
     577      case GFS_DYNAMIC:
     578        return gomp_loop_doacross_dynamic_start (ncounts, counts,
     579  					       icv->run_sched_chunk_size,
     580  					       istart, iend);
     581      case GFS_GUIDED:
     582        return gomp_loop_doacross_guided_start (ncounts, counts,
     583  					      icv->run_sched_chunk_size,
     584  					      istart, iend);
     585      case GFS_AUTO:
      586        /* For now map this to schedule(static); later on we could play with a
      587  	 feedback-driven choice.  */
     588        return gomp_loop_doacross_static_start (ncounts, counts,
     589  					      0, istart, iend);
     590      default:
     591        abort ();
     592      }
     593  }
     594  
     595  bool
     596  GOMP_loop_doacross_start (unsigned ncounts, long *counts, long sched,
     597  			  long chunk_size, long *istart, long *iend,
     598  			  uintptr_t *reductions, void **mem)
     599  {
     600    struct gomp_thread *thr = gomp_thread ();
     601  
     602    thr->ts.static_trip = 0;
     603    if (reductions)
     604      gomp_workshare_taskgroup_start ();
     605    if (gomp_work_share_start (0))
     606      {
     607        size_t extra = 0;
     608        if (mem)
     609  	extra = (uintptr_t) *mem;
     610        sched = gomp_adjust_sched (sched, &chunk_size);
     611        gomp_loop_init (thr->ts.work_share, 0, counts[0], 1,
     612  		      sched, chunk_size);
     613        gomp_doacross_init (ncounts, counts, chunk_size, extra);
     614        if (reductions)
     615  	{
     616  	  GOMP_taskgroup_reduction_register (reductions);
     617  	  thr->task->taskgroup->workshare = true;
     618  	  thr->ts.work_share->task_reductions = reductions;
     619  	}
     620        gomp_work_share_init_done ();
     621      }
     622    else
     623      {
     624        if (reductions)
     625  	{
     626  	  uintptr_t *first_reductions = thr->ts.work_share->task_reductions;
     627  	  gomp_workshare_task_reduction_register (reductions,
     628  						  first_reductions);
     629  	}
     630        sched = thr->ts.work_share->sched;
     631      }
     632  
     633    if (mem)
     634      *mem = thr->ts.work_share->doacross->extra;
     635  
     636    return ialias_call (GOMP_loop_runtime_next) (istart, iend);
     637  }
     638  
     639  /* The *_next routines are called when the thread completes processing of
     640     the iteration block currently assigned to it.  If the work-share
     641     construct is bound directly to a parallel construct, then the iteration
      642     bounds may have been set up before the parallel, in which case this
     643     may be the first iteration for the thread.
     644  
     645     Returns true if there is work remaining to be performed; *ISTART and
     646     *IEND are filled with a new iteration block.  Returns false if all work
     647     has been assigned.  */
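
/* As a sketch of the "bound directly to a parallel construct" case described
   above: for a combined construct such as
     #pragma omp parallel for schedule(dynamic, 4)
   the compiler outlines the loop into a function roughly like the one below
   (`body' is a placeholder) and passes it to GOMP_parallel_loop_dynamic,
   which pre-initializes the work share, so each thread starts directly with
   the *_next routine.  The exact emitted code may differ.  */
#if 0
static void
example_outlined_parallel_for (void *data)
{
  long istart, iend;
  while (GOMP_loop_dynamic_next (&istart, &iend))
    for (long i = istart; i < iend; i++)
      body (i, data);
  /* The barrier comes from GOMP_parallel_end in the caller, hence the
     nowait form of the loop end.  */
  GOMP_loop_end_nowait ();
}
#endif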
     648  
     649  static bool
     650  gomp_loop_static_next (long *istart, long *iend)
     651  {
     652    return !gomp_iter_static_next (istart, iend);
     653  }
     654  
     655  static bool
     656  gomp_loop_dynamic_next (long *istart, long *iend)
     657  {
     658    bool ret;
     659  
     660  #ifdef HAVE_SYNC_BUILTINS
     661    ret = gomp_iter_dynamic_next (istart, iend);
     662  #else
     663    struct gomp_thread *thr = gomp_thread ();
     664    gomp_mutex_lock (&thr->ts.work_share->lock);
     665    ret = gomp_iter_dynamic_next_locked (istart, iend);
     666    gomp_mutex_unlock (&thr->ts.work_share->lock);
     667  #endif
     668  
     669    return ret;
     670  }
     671  
     672  static bool
     673  gomp_loop_guided_next (long *istart, long *iend)
     674  {
     675    bool ret;
     676  
     677  #ifdef HAVE_SYNC_BUILTINS
     678    ret = gomp_iter_guided_next (istart, iend);
     679  #else
     680    struct gomp_thread *thr = gomp_thread ();
     681    gomp_mutex_lock (&thr->ts.work_share->lock);
     682    ret = gomp_iter_guided_next_locked (istart, iend);
     683    gomp_mutex_unlock (&thr->ts.work_share->lock);
     684  #endif
     685  
     686    return ret;
     687  }
     688  
     689  bool
     690  GOMP_loop_runtime_next (long *istart, long *iend)
     691  {
     692    struct gomp_thread *thr = gomp_thread ();
     693  
     694    switch (thr->ts.work_share->sched)
     695      {
     696      case GFS_STATIC:
     697      case GFS_AUTO:
     698        return gomp_loop_static_next (istart, iend);
     699      case GFS_DYNAMIC:
     700        return gomp_loop_dynamic_next (istart, iend);
     701      case GFS_GUIDED:
     702        return gomp_loop_guided_next (istart, iend);
     703      default:
     704        abort ();
     705      }
     706  }
     707  
     708  /* The *_ordered_*_next routines are called when the thread completes
     709     processing of the iteration block currently assigned to it.
     710  
     711     Returns true if there is work remaining to be performed; *ISTART and
     712     *IEND are filled with a new iteration block.  Returns false if all work
     713     has been assigned.  */
     714  
     715  static bool
     716  gomp_loop_ordered_static_next (long *istart, long *iend)
     717  {
     718    struct gomp_thread *thr = gomp_thread ();
     719    int test;
     720  
     721    gomp_ordered_sync ();
     722    gomp_mutex_lock (&thr->ts.work_share->lock);
     723    test = gomp_iter_static_next (istart, iend);
     724    if (test >= 0)
     725      gomp_ordered_static_next ();
     726    gomp_mutex_unlock (&thr->ts.work_share->lock);
     727  
     728    return test == 0;
     729  }
     730  
     731  static bool
     732  gomp_loop_ordered_dynamic_next (long *istart, long *iend)
     733  {
     734    struct gomp_thread *thr = gomp_thread ();
     735    bool ret;
     736  
     737    gomp_ordered_sync ();
     738    gomp_mutex_lock (&thr->ts.work_share->lock);
     739    ret = gomp_iter_dynamic_next_locked (istart, iend);
     740    if (ret)
     741      gomp_ordered_next ();
     742    else
     743      gomp_ordered_last ();
     744    gomp_mutex_unlock (&thr->ts.work_share->lock);
     745  
     746    return ret;
     747  }
     748  
     749  static bool
     750  gomp_loop_ordered_guided_next (long *istart, long *iend)
     751  {
     752    struct gomp_thread *thr = gomp_thread ();
     753    bool ret;
     754  
     755    gomp_ordered_sync ();
     756    gomp_mutex_lock (&thr->ts.work_share->lock);
     757    ret = gomp_iter_guided_next_locked (istart, iend);
     758    if (ret)
     759      gomp_ordered_next ();
     760    else
     761      gomp_ordered_last ();
     762    gomp_mutex_unlock (&thr->ts.work_share->lock);
     763  
     764    return ret;
     765  }
     766  
     767  bool
     768  GOMP_loop_ordered_runtime_next (long *istart, long *iend)
     769  {
     770    struct gomp_thread *thr = gomp_thread ();
     771  
     772    switch (thr->ts.work_share->sched)
     773      {
     774      case GFS_STATIC:
     775      case GFS_AUTO:
     776        return gomp_loop_ordered_static_next (istart, iend);
     777      case GFS_DYNAMIC:
     778        return gomp_loop_ordered_dynamic_next (istart, iend);
     779      case GFS_GUIDED:
     780        return gomp_loop_ordered_guided_next (istart, iend);
     781      default:
     782        abort ();
     783      }
     784  }
     785  
     786  /* The GOMP_parallel_loop_* routines pre-initialize a work-share construct
     787     to avoid one synchronization once we get into the loop.  */
     788  
     789  static void
     790  gomp_parallel_loop_start (void (*fn) (void *), void *data,
     791  			  unsigned num_threads, long start, long end,
     792  			  long incr, enum gomp_schedule_type sched,
     793  			  long chunk_size, unsigned int flags)
     794  {
     795    struct gomp_team *team;
     796  
     797    num_threads = gomp_resolve_num_threads (num_threads, 0);
     798    team = gomp_new_team (num_threads);
     799    gomp_loop_init (&team->work_shares[0], start, end, incr, sched, chunk_size);
     800    gomp_team_start (fn, data, num_threads, flags, team, NULL);
     801  }
     802  
     803  void
     804  GOMP_parallel_loop_static_start (void (*fn) (void *), void *data,
     805  				 unsigned num_threads, long start, long end,
     806  				 long incr, long chunk_size)
     807  {
     808    gomp_parallel_loop_start (fn, data, num_threads, start, end, incr,
     809  			    GFS_STATIC, chunk_size, 0);
     810  }
     811  
     812  void
     813  GOMP_parallel_loop_dynamic_start (void (*fn) (void *), void *data,
     814  				  unsigned num_threads, long start, long end,
     815  				  long incr, long chunk_size)
     816  {
     817    gomp_parallel_loop_start (fn, data, num_threads, start, end, incr,
     818  			    GFS_DYNAMIC, chunk_size, 0);
     819  }
     820  
     821  void
     822  GOMP_parallel_loop_guided_start (void (*fn) (void *), void *data,
     823  				 unsigned num_threads, long start, long end,
     824  				 long incr, long chunk_size)
     825  {
     826    gomp_parallel_loop_start (fn, data, num_threads, start, end, incr,
     827  			    GFS_GUIDED, chunk_size, 0);
     828  }
     829  
     830  void
     831  GOMP_parallel_loop_runtime_start (void (*fn) (void *), void *data,
     832  				  unsigned num_threads, long start, long end,
     833  				  long incr)
     834  {
     835    struct gomp_task_icv *icv = gomp_icv (false);
     836    gomp_parallel_loop_start (fn, data, num_threads, start, end, incr,
     837  			    icv->run_sched_var & ~GFS_MONOTONIC,
     838  			    icv->run_sched_chunk_size, 0);
     839  }
     840  
     841  ialias_redirect (GOMP_parallel_end)
     842  
     843  void
     844  GOMP_parallel_loop_static (void (*fn) (void *), void *data,
     845  			   unsigned num_threads, long start, long end,
     846  			   long incr, long chunk_size, unsigned flags)
     847  {
     848    gomp_parallel_loop_start (fn, data, num_threads, start, end, incr,
     849  			    GFS_STATIC, chunk_size, flags);
     850    fn (data);
     851    GOMP_parallel_end ();
     852  }
     853  
     854  void
     855  GOMP_parallel_loop_dynamic (void (*fn) (void *), void *data,
     856  			    unsigned num_threads, long start, long end,
     857  			    long incr, long chunk_size, unsigned flags)
     858  {
     859    gomp_parallel_loop_start (fn, data, num_threads, start, end, incr,
     860  			    GFS_DYNAMIC, chunk_size, flags);
     861    fn (data);
     862    GOMP_parallel_end ();
     863  }
     864  
     865  void
     866  GOMP_parallel_loop_guided (void (*fn) (void *), void *data,
     867  			  unsigned num_threads, long start, long end,
     868  			  long incr, long chunk_size, unsigned flags)
     869  {
     870    gomp_parallel_loop_start (fn, data, num_threads, start, end, incr,
     871  			    GFS_GUIDED, chunk_size, flags);
     872    fn (data);
     873    GOMP_parallel_end ();
     874  }
     875  
     876  void
     877  GOMP_parallel_loop_runtime (void (*fn) (void *), void *data,
     878  			    unsigned num_threads, long start, long end,
     879  			    long incr, unsigned flags)
     880  {
     881    struct gomp_task_icv *icv = gomp_icv (false);
     882    gomp_parallel_loop_start (fn, data, num_threads, start, end, incr,
     883  			    icv->run_sched_var & ~GFS_MONOTONIC,
     884  			    icv->run_sched_chunk_size, flags);
     885    fn (data);
     886    GOMP_parallel_end ();
     887  }
     888  
     889  #ifdef HAVE_ATTRIBUTE_ALIAS
     890  extern __typeof(GOMP_parallel_loop_dynamic) GOMP_parallel_loop_nonmonotonic_dynamic
     891  	__attribute__((alias ("GOMP_parallel_loop_dynamic")));
     892  extern __typeof(GOMP_parallel_loop_guided) GOMP_parallel_loop_nonmonotonic_guided
     893  	__attribute__((alias ("GOMP_parallel_loop_guided")));
     894  extern __typeof(GOMP_parallel_loop_runtime) GOMP_parallel_loop_nonmonotonic_runtime
     895  	__attribute__((alias ("GOMP_parallel_loop_runtime")));
     896  extern __typeof(GOMP_parallel_loop_runtime) GOMP_parallel_loop_maybe_nonmonotonic_runtime
     897  	__attribute__((alias ("GOMP_parallel_loop_runtime")));
     898  #else
     899  void
     900  GOMP_parallel_loop_nonmonotonic_dynamic (void (*fn) (void *), void *data,
     901  					 unsigned num_threads, long start,
     902  					 long end, long incr, long chunk_size,
     903  					 unsigned flags)
     904  {
     905    gomp_parallel_loop_start (fn, data, num_threads, start, end, incr,
     906  			    GFS_DYNAMIC, chunk_size, flags);
     907    fn (data);
     908    GOMP_parallel_end ();
     909  }
     910  
     911  void
     912  GOMP_parallel_loop_nonmonotonic_guided (void (*fn) (void *), void *data,
     913  					unsigned num_threads, long start,
     914  					long end, long incr, long chunk_size,
     915  					unsigned flags)
     916  {
     917    gomp_parallel_loop_start (fn, data, num_threads, start, end, incr,
     918  			    GFS_GUIDED, chunk_size, flags);
     919    fn (data);
     920    GOMP_parallel_end ();
     921  }
     922  
     923  void
     924  GOMP_parallel_loop_nonmonotonic_runtime (void (*fn) (void *), void *data,
     925  					 unsigned num_threads, long start,
     926  					 long end, long incr, unsigned flags)
     927  {
     928    struct gomp_task_icv *icv = gomp_icv (false);
     929    gomp_parallel_loop_start (fn, data, num_threads, start, end, incr,
     930  			    icv->run_sched_var & ~GFS_MONOTONIC,
     931  			    icv->run_sched_chunk_size, flags);
     932    fn (data);
     933    GOMP_parallel_end ();
     934  }
     935  
     936  void
     937  GOMP_parallel_loop_maybe_nonmonotonic_runtime (void (*fn) (void *), void *data,
     938  					       unsigned num_threads, long start,
     939  					       long end, long incr,
     940  					       unsigned flags)
     941  {
     942    struct gomp_task_icv *icv = gomp_icv (false);
     943    gomp_parallel_loop_start (fn, data, num_threads, start, end, incr,
     944  			    icv->run_sched_var & ~GFS_MONOTONIC,
     945  			    icv->run_sched_chunk_size, flags);
     946    fn (data);
     947    GOMP_parallel_end ();
     948  }
     949  #endif
     950  
     951  /* The GOMP_loop_end* routines are called after the thread is told that
     952     all loop iterations are complete.  The first two versions synchronize
     953     all threads; the nowait version does not.  */
     954  
     955  void
     956  GOMP_loop_end (void)
     957  {
     958    gomp_work_share_end ();
     959  }
     960  
     961  bool
     962  GOMP_loop_end_cancel (void)
     963  {
     964    return gomp_work_share_end_cancel ();
     965  }
     966  
     967  void
     968  GOMP_loop_end_nowait (void)
     969  {
     970    gomp_work_share_end_nowait ();
     971  }
     972  
     973  
     974  /* We use static functions above so that we're sure that the "runtime"
      975     functions can defer to the proper routines without interposition.  We
      976     export the static functions with strong aliases when possible, or with
      977     wrapper functions otherwise.  */
     978  
     979  #ifdef HAVE_ATTRIBUTE_ALIAS
     980  extern __typeof(gomp_loop_static_start) GOMP_loop_static_start
     981  	__attribute__((alias ("gomp_loop_static_start")));
     982  extern __typeof(gomp_loop_dynamic_start) GOMP_loop_dynamic_start
     983  	__attribute__((alias ("gomp_loop_dynamic_start")));
     984  extern __typeof(gomp_loop_guided_start) GOMP_loop_guided_start
     985  	__attribute__((alias ("gomp_loop_guided_start")));
     986  extern __typeof(gomp_loop_dynamic_start) GOMP_loop_nonmonotonic_dynamic_start
     987  	__attribute__((alias ("gomp_loop_dynamic_start")));
     988  extern __typeof(gomp_loop_guided_start) GOMP_loop_nonmonotonic_guided_start
     989  	__attribute__((alias ("gomp_loop_guided_start")));
     990  extern __typeof(GOMP_loop_runtime_start) GOMP_loop_nonmonotonic_runtime_start
     991  	__attribute__((alias ("GOMP_loop_runtime_start")));
     992  extern __typeof(GOMP_loop_runtime_start) GOMP_loop_maybe_nonmonotonic_runtime_start
     993  	__attribute__((alias ("GOMP_loop_runtime_start")));
     994  
     995  extern __typeof(gomp_loop_ordered_static_start) GOMP_loop_ordered_static_start
     996  	__attribute__((alias ("gomp_loop_ordered_static_start")));
     997  extern __typeof(gomp_loop_ordered_dynamic_start) GOMP_loop_ordered_dynamic_start
     998  	__attribute__((alias ("gomp_loop_ordered_dynamic_start")));
     999  extern __typeof(gomp_loop_ordered_guided_start) GOMP_loop_ordered_guided_start
    1000  	__attribute__((alias ("gomp_loop_ordered_guided_start")));
    1001  
    1002  extern __typeof(gomp_loop_doacross_static_start) GOMP_loop_doacross_static_start
    1003  	__attribute__((alias ("gomp_loop_doacross_static_start")));
    1004  extern __typeof(gomp_loop_doacross_dynamic_start) GOMP_loop_doacross_dynamic_start
    1005  	__attribute__((alias ("gomp_loop_doacross_dynamic_start")));
    1006  extern __typeof(gomp_loop_doacross_guided_start) GOMP_loop_doacross_guided_start
    1007  	__attribute__((alias ("gomp_loop_doacross_guided_start")));
    1008  
    1009  extern __typeof(gomp_loop_static_next) GOMP_loop_static_next
    1010  	__attribute__((alias ("gomp_loop_static_next")));
    1011  extern __typeof(gomp_loop_dynamic_next) GOMP_loop_dynamic_next
    1012  	__attribute__((alias ("gomp_loop_dynamic_next")));
    1013  extern __typeof(gomp_loop_guided_next) GOMP_loop_guided_next
    1014  	__attribute__((alias ("gomp_loop_guided_next")));
    1015  extern __typeof(gomp_loop_dynamic_next) GOMP_loop_nonmonotonic_dynamic_next
    1016  	__attribute__((alias ("gomp_loop_dynamic_next")));
    1017  extern __typeof(gomp_loop_guided_next) GOMP_loop_nonmonotonic_guided_next
    1018  	__attribute__((alias ("gomp_loop_guided_next")));
    1019  extern __typeof(GOMP_loop_runtime_next) GOMP_loop_nonmonotonic_runtime_next
    1020  	__attribute__((alias ("GOMP_loop_runtime_next")));
    1021  extern __typeof(GOMP_loop_runtime_next) GOMP_loop_maybe_nonmonotonic_runtime_next
    1022  	__attribute__((alias ("GOMP_loop_runtime_next")));
    1023  
    1024  extern __typeof(gomp_loop_ordered_static_next) GOMP_loop_ordered_static_next
    1025  	__attribute__((alias ("gomp_loop_ordered_static_next")));
    1026  extern __typeof(gomp_loop_ordered_dynamic_next) GOMP_loop_ordered_dynamic_next
    1027  	__attribute__((alias ("gomp_loop_ordered_dynamic_next")));
    1028  extern __typeof(gomp_loop_ordered_guided_next) GOMP_loop_ordered_guided_next
    1029  	__attribute__((alias ("gomp_loop_ordered_guided_next")));
    1030  #else
    1031  bool
    1032  GOMP_loop_static_start (long start, long end, long incr, long chunk_size,
    1033  			long *istart, long *iend)
    1034  {
    1035    return gomp_loop_static_start (start, end, incr, chunk_size, istart, iend);
    1036  }
    1037  
    1038  bool
    1039  GOMP_loop_dynamic_start (long start, long end, long incr, long chunk_size,
    1040  			 long *istart, long *iend)
    1041  {
    1042    return gomp_loop_dynamic_start (start, end, incr, chunk_size, istart, iend);
    1043  }
    1044  
    1045  bool
    1046  GOMP_loop_guided_start (long start, long end, long incr, long chunk_size,
    1047  			long *istart, long *iend)
    1048  {
    1049    return gomp_loop_guided_start (start, end, incr, chunk_size, istart, iend);
    1050  }
    1051  
    1052  bool
    1053  GOMP_loop_nonmonotonic_dynamic_start (long start, long end, long incr,
    1054  				      long chunk_size, long *istart,
    1055  				      long *iend)
    1056  {
    1057    return gomp_loop_dynamic_start (start, end, incr, chunk_size, istart, iend);
    1058  }
    1059  
    1060  bool
    1061  GOMP_loop_nonmonotonic_guided_start (long start, long end, long incr,
    1062  				     long chunk_size, long *istart, long *iend)
    1063  {
    1064    return gomp_loop_guided_start (start, end, incr, chunk_size, istart, iend);
    1065  }
    1066  
    1067  bool
    1068  GOMP_loop_nonmonotonic_runtime_start (long start, long end, long incr,
    1069  				      long *istart, long *iend)
    1070  {
    1071    return GOMP_loop_runtime_start (start, end, incr, istart, iend);
    1072  }
    1073  
    1074  bool
    1075  GOMP_loop_maybe_nonmonotonic_runtime_start (long start, long end, long incr,
    1076  					    long *istart, long *iend)
    1077  {
    1078    return GOMP_loop_runtime_start (start, end, incr, istart, iend);
    1079  }
    1080  
    1081  bool
    1082  GOMP_loop_ordered_static_start (long start, long end, long incr,
    1083  				long chunk_size, long *istart, long *iend)
    1084  {
    1085    return gomp_loop_ordered_static_start (start, end, incr, chunk_size,
    1086  					 istart, iend);
    1087  }
    1088  
    1089  bool
    1090  GOMP_loop_ordered_dynamic_start (long start, long end, long incr,
    1091  				 long chunk_size, long *istart, long *iend)
    1092  {
    1093    return gomp_loop_ordered_dynamic_start (start, end, incr, chunk_size,
    1094  					  istart, iend);
    1095  }
    1096  
    1097  bool
    1098  GOMP_loop_ordered_guided_start (long start, long end, long incr,
    1099  				long chunk_size, long *istart, long *iend)
    1100  {
    1101    return gomp_loop_ordered_guided_start (start, end, incr, chunk_size,
    1102  					 istart, iend);
    1103  }
    1104  
    1105  bool
    1106  GOMP_loop_doacross_static_start (unsigned ncounts, long *counts,
    1107  				 long chunk_size, long *istart, long *iend)
    1108  {
    1109    return gomp_loop_doacross_static_start (ncounts, counts, chunk_size,
    1110  					  istart, iend);
    1111  }
    1112  
    1113  bool
    1114  GOMP_loop_doacross_dynamic_start (unsigned ncounts, long *counts,
    1115  				  long chunk_size, long *istart, long *iend)
    1116  {
    1117    return gomp_loop_doacross_dynamic_start (ncounts, counts, chunk_size,
    1118  					   istart, iend);
    1119  }
    1120  
    1121  bool
    1122  GOMP_loop_doacross_guided_start (unsigned ncounts, long *counts,
    1123  				 long chunk_size, long *istart, long *iend)
    1124  {
    1125    return gomp_loop_doacross_guided_start (ncounts, counts, chunk_size,
    1126  					  istart, iend);
    1127  }
    1128  
    1129  bool
    1130  GOMP_loop_static_next (long *istart, long *iend)
    1131  {
    1132    return gomp_loop_static_next (istart, iend);
    1133  }
    1134  
    1135  bool
    1136  GOMP_loop_dynamic_next (long *istart, long *iend)
    1137  {
    1138    return gomp_loop_dynamic_next (istart, iend);
    1139  }
    1140  
    1141  bool
    1142  GOMP_loop_guided_next (long *istart, long *iend)
    1143  {
    1144    return gomp_loop_guided_next (istart, iend);
    1145  }
    1146  
    1147  bool
    1148  GOMP_loop_nonmonotonic_dynamic_next (long *istart, long *iend)
    1149  {
    1150    return gomp_loop_dynamic_next (istart, iend);
    1151  }
    1152  
    1153  bool
    1154  GOMP_loop_nonmonotonic_guided_next (long *istart, long *iend)
    1155  {
    1156    return gomp_loop_guided_next (istart, iend);
    1157  }
    1158  
    1159  bool
    1160  GOMP_loop_nonmonotonic_runtime_next (long *istart, long *iend)
    1161  {
    1162    return GOMP_loop_runtime_next (istart, iend);
    1163  }
    1164  
    1165  bool
    1166  GOMP_loop_maybe_nonmonotonic_runtime_next (long *istart, long *iend)
    1167  {
    1168    return GOMP_loop_runtime_next (istart, iend);
    1169  }
    1170  
    1171  bool
    1172  GOMP_loop_ordered_static_next (long *istart, long *iend)
    1173  {
    1174    return gomp_loop_ordered_static_next (istart, iend);
    1175  }
    1176  
    1177  bool
    1178  GOMP_loop_ordered_dynamic_next (long *istart, long *iend)
    1179  {
    1180    return gomp_loop_ordered_dynamic_next (istart, iend);
    1181  }
    1182  
    1183  bool
    1184  GOMP_loop_ordered_guided_next (long *istart, long *iend)
    1185  {
    1186    return gomp_loop_ordered_guided_next (istart, iend);
    1187  }
    1188  #endif