gcc-13.2.0/libgomp/ordered.c
       1  /* Copyright (C) 2005-2023 Free Software Foundation, Inc.
       2     Contributed by Richard Henderson <rth@redhat.com>.
       3  
       4     This file is part of the GNU Offloading and Multi Processing Library
       5     (libgomp).
       6  
       7     Libgomp is free software; you can redistribute it and/or modify it
       8     under the terms of the GNU General Public License as published by
       9     the Free Software Foundation; either version 3, or (at your option)
      10     any later version.
      11  
      12     Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
      13     WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
      14     FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
      15     more details.
      16  
      17     Under Section 7 of GPL version 3, you are granted additional
      18     permissions described in the GCC Runtime Library Exception, version
      19     3.1, as published by the Free Software Foundation.
      20  
      21     You should have received a copy of the GNU General Public License and
      22     a copy of the GCC Runtime Library Exception along with this program;
      23     see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
      24     <http://www.gnu.org/licenses/>.  */
      25  
      26  /* This file handles the ORDERED construct.  */
      27  
      28  #include "libgomp.h"
      29  #include <stdarg.h>
      30  #include <string.h>
      31  #include "doacross.h"
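
           /* A quick sketch of the bookkeeping used below (illustrative only):
              ws->ordered_team_ids is a circular buffer of team ids,
              ws->ordered_cur indexes its head, ws->ordered_num_used counts the
              queued threads, and ws->ordered_owner is the team id currently
              inside the ordered region (or -1).  Each thread blocks on its own
              semaphore in team->ordered_release.  For example, with four
              threads queued as { 2, 0, 3, 1 }, ordered_cur == 0 and thread 2
              owning the region, gomp_ordered_next simply advances ordered_cur
              to 1 and posts team->ordered_release[0], releasing thread 0.  */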
      32  
      33  
      34  /* This function is called when first allocating an iteration block.  That
      35     is, the thread is not currently on the queue.  The work-share lock must
      36     be held on entry.  */
      37  
      38  void
      39  gomp_ordered_first (void)
      40  {
      41    struct gomp_thread *thr = gomp_thread ();
      42    struct gomp_team *team = thr->ts.team;
      43    struct gomp_work_share *ws = thr->ts.work_share;
      44    unsigned index;
      45  
      46    /* Work share constructs can be orphaned.  */
      47    if (team == NULL || team->nthreads == 1)
      48      return;
      49  
      50    index = ws->ordered_cur + ws->ordered_num_used;
      51    if (index >= team->nthreads)
      52      index -= team->nthreads;
      53    ws->ordered_team_ids[index] = thr->ts.team_id;
      54  
      55    /* If this is the first and only thread in the queue, then there is
      56       no one to release us when we get to our ordered section.  Post to
      57       our own release queue now so that we won't block later.  */
      58    if (ws->ordered_num_used++ == 0)
      59      gomp_sem_post (team->ordered_release[thr->ts.team_id]);
      60  }
      61  
      62  /* This function is called when completing the last iteration block.  That
      63     is, there are no more iterations to perform and so the thread should be
      64     removed from the queue entirely.  Because of the way ORDERED blocks are
      65     managed, it follows that we currently own access to the ORDERED block,
      66     and should now pass it on to the next thread.  The work-share lock must
      67     be held on entry.  */
      68  
      69  void
      70  gomp_ordered_last (void)
      71  {
      72    struct gomp_thread *thr = gomp_thread ();
      73    struct gomp_team *team = thr->ts.team;
      74    struct gomp_work_share *ws = thr->ts.work_share;
      75    unsigned next_id;
      76  
      77    /* Work share constructs can be orphaned.  */
      78    if (team == NULL || team->nthreads == 1)
      79      return;
      80  
      81    /* We're no longer the owner.  */
      82    ws->ordered_owner = -1;
      83  
      84    /* If we're not the last thread in the queue, then wake the next.  */
      85    if (--ws->ordered_num_used > 0)
      86      {
      87        unsigned next = ws->ordered_cur + 1;
      88        if (next == team->nthreads)
      89  	next = 0;
      90        ws->ordered_cur = next;
      91  
      92        next_id = ws->ordered_team_ids[next];
      93        gomp_sem_post (team->ordered_release[next_id]);
      94      }
      95  }
      96  
      97  
       98  /* This function is called when allocating a subsequent iteration block.
       99     That is, we're done with the current iteration block and we're allocating
      100     another.  This is the logical combination of a call to gomp_ordered_last
      101     followed by a call to gomp_ordered_first.  The work-share lock must be
      102     held on entry.  */
     103  
     104  void
     105  gomp_ordered_next (void)
     106  {
     107    struct gomp_thread *thr = gomp_thread ();
     108    struct gomp_team *team = thr->ts.team;
     109    struct gomp_work_share *ws = thr->ts.work_share;
     110    unsigned index, next_id;
     111  
     112    /* Work share constructs can be orphaned.  */
     113    if (team == NULL || team->nthreads == 1)
     114      return;
     115  
     116    /* We're no longer the owner.  */
     117    ws->ordered_owner = -1;
     118  
     119    /* If there's only one thread in the queue, that must be us.  */
     120    if (ws->ordered_num_used == 1)
     121      {
      122        /* We have a situation similar to that in gomp_ordered_first,
     123  	 where we need to post to our own release semaphore.  */
     124        gomp_sem_post (team->ordered_release[thr->ts.team_id]);
     125        return;
     126      }
     127  
      128    /* If the queue is entirely full, then we move ourselves to the end of
      129       the queue merely by incrementing ordered_cur.  Only if it's not
      130       full do we have to write our id.  */
     131    if (ws->ordered_num_used < team->nthreads)
     132      {
     133        index = ws->ordered_cur + ws->ordered_num_used;
     134        if (index >= team->nthreads)
     135  	index -= team->nthreads;
     136        ws->ordered_team_ids[index] = thr->ts.team_id;
     137      }
     138  
     139    index = ws->ordered_cur + 1;
     140    if (index == team->nthreads)
     141      index = 0;
     142    ws->ordered_cur = index;
     143  
     144    next_id = ws->ordered_team_ids[index];
     145    gomp_sem_post (team->ordered_release[next_id]);
     146  }
     147  
     148  
     149  /* This function is called when a statically scheduled loop is first
     150     being created.  */
     151  
     152  void
     153  gomp_ordered_static_init (void)
     154  {
     155    struct gomp_thread *thr = gomp_thread ();
     156    struct gomp_team *team = thr->ts.team;
     157  
     158    if (team == NULL || team->nthreads == 1)
     159      return;
     160  
     161    gomp_sem_post (team->ordered_release[0]);
     162  }
     163  
      164  /* This function is called when a statically scheduled loop is moving to
      165     the next allocation block.  Static schedules are not first-come,
      166     first-served like the others, so we move to the numerically next
      167     thread, not to the next thread on a list.  The work-share lock should
      168     *not* be held on entry.  */
     169  
     170  void
     171  gomp_ordered_static_next (void)
     172  {
     173    struct gomp_thread *thr = gomp_thread ();
     174    struct gomp_team *team = thr->ts.team;
     175    struct gomp_work_share *ws = thr->ts.work_share;
     176    unsigned id = thr->ts.team_id;
     177  
     178    if (team == NULL || team->nthreads == 1)
     179      return;
     180  
     181    ws->ordered_owner = -1;
     182  
     183    /* This thread currently owns the lock.  Increment the owner.  */
     184    if (++id == team->nthreads)
     185      id = 0;
     186    ws->ordered_team_ids[0] = id;
     187    gomp_sem_post (team->ordered_release[id]);
     188  }
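
           /* Taken together, gomp_ordered_static_init and
              gomp_ordered_static_next hand the ordered section around the team
              in team-id order: thread 0 is released first, and each call here
              passes ownership to team id (id + 1) % nthreads by recording it
              in ordered_team_ids[0] and posting that thread's release
              semaphore.  With three threads and a chunked static schedule, for
              example, the ordered regions run on threads 0, 1, 2, 0, 1, 2, ...
              matching the round-robin chunk assignment.  */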
     189  
     190  /* This function is called when we need to assert that the thread owns the
     191     ordered section.  Due to the problem of posted-but-not-waited semaphores,
     192     this needs to happen before completing a loop iteration.  */
     193  
     194  void
     195  gomp_ordered_sync (void)
     196  {
     197    struct gomp_thread *thr = gomp_thread ();
     198    struct gomp_team *team = thr->ts.team;
     199    struct gomp_work_share *ws = thr->ts.work_share;
     200  
     201    /* Work share constructs can be orphaned.  But this clearly means that
     202       we are the only thread, and so we automatically own the section.  */
     203    if (team == NULL || team->nthreads == 1)
     204      return;
     205  
     206    /* ??? I believe it to be safe to access this data without taking the
     207       ws->lock.  The only presumed race condition is with the previous
     208       thread on the queue incrementing ordered_cur such that it points
     209       to us, concurrently with our check below.  But our team_id is
     210       already present in the queue, and the other thread will always
     211       post to our release semaphore.  So the two cases are that we will
      212     either win the race and momentarily block on the semaphore, or lose
     213       the race and find the semaphore already unlocked and so not block.
     214       Either way we get correct results.
     215       However, there is an implicit flush on entry to an ordered region,
     216       so we do need to have a barrier here.  If we were taking a lock
     217       this could be MEMMODEL_RELEASE since the acquire would be covered
     218       by the lock.  */
     219  
     220    __atomic_thread_fence (MEMMODEL_ACQ_REL);
     221    if (ws->ordered_owner != thr->ts.team_id)
     222      {
     223        gomp_sem_wait (team->ordered_release[thr->ts.team_id]);
     224        ws->ordered_owner = thr->ts.team_id;
     225      }
     226  }
     227  
     228  /* This function is called by user code when encountering the start of an
     229     ORDERED block.  We must check to see if the current thread is at the
     230     head of the queue, and if not, block.  */
     231  
     232  #ifdef HAVE_ATTRIBUTE_ALIAS
     233  extern void GOMP_ordered_start (void)
     234  	__attribute__((alias ("gomp_ordered_sync")));
     235  #else
     236  void
     237  GOMP_ordered_start (void)
     238  {
     239    gomp_ordered_sync ();
     240  }
     241  #endif
     242  
     243  /* This function is called by user code when encountering the end of an
     244     ORDERED block.  With the current ORDERED implementation there's nothing
     245     for us to do.
     246  
     247     However, the current implementation has a flaw in that it does not allow
     248     the next thread into the ORDERED section immediately after the current
     249     thread exits the ORDERED section in its last iteration.  The existence
     250     of this function allows the implementation to change.  */
     251  
     252  void
     253  GOMP_ordered_end (void)
     254  {
     255  }
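
           /* A rough sketch (illustrative only) of how user code is expected
              to reach these entry points.  Given

                  #pragma omp for ordered schedule(dynamic)
                  for (i = 0; i < n; i++)
                    {
                      ... unordered part of the iteration ...
                  #pragma omp ordered
                      use (i);
                    }

              the compiler-generated code obtains iteration blocks through the
              GOMP_loop_ordered_*_start/next entry points and brackets the body
              of the ordered region with GOMP_ordered_start () and
              GOMP_ordered_end ().  The exact lowering is the compiler's
              business; only the bracketing calls matter to this file.  */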
     256  
     257  /* DOACROSS initialization.  */
     258  
     259  #define MAX_COLLAPSED_BITS (__SIZEOF_LONG__ * __CHAR_BIT__)
     260  
     261  void
     262  gomp_doacross_init (unsigned ncounts, long *counts, long chunk_size,
     263  		    size_t extra)
     264  {
     265    struct gomp_thread *thr = gomp_thread ();
     266    struct gomp_team *team = thr->ts.team;
     267    struct gomp_work_share *ws = thr->ts.work_share;
     268    unsigned int i, bits[MAX_COLLAPSED_BITS], num_bits = 0;
     269    unsigned long ent, num_ents, elt_sz, shift_sz;
     270    struct gomp_doacross_work_share *doacross;
     271  
     272    if (team == NULL || team->nthreads == 1)
     273      {
     274      empty:
     275        if (!extra)
     276  	ws->doacross = NULL;
     277        else
     278  	{
     279  	  doacross = gomp_malloc_cleared (sizeof (*doacross) + extra);
     280  	  doacross->extra = (void *) (doacross + 1);
     281  	  ws->doacross = doacross;
     282  	}
     283        return;
     284      }
     285  
     286    for (i = 0; i < ncounts; i++)
     287      {
     288        /* If any count is 0, GOMP_doacross_{post,wait} can't be called.  */
     289        if (counts[i] == 0)
     290  	goto empty;
     291  
     292        if (num_bits <= MAX_COLLAPSED_BITS)
     293  	{
     294  	  unsigned int this_bits;
     295  	  if (counts[i] == 1)
     296  	    this_bits = 1;
     297  	  else
     298  	    this_bits = __SIZEOF_LONG__ * __CHAR_BIT__
     299  			- __builtin_clzl (counts[i] - 1);
     300  	  if (num_bits + this_bits <= MAX_COLLAPSED_BITS)
     301  	    {
     302  	      bits[i] = this_bits;
     303  	      num_bits += this_bits;
     304  	    }
     305  	  else
     306  	    num_bits = MAX_COLLAPSED_BITS + 1;
     307  	}
     308      }
     309  
     310    if (ws->sched == GFS_STATIC)
     311      num_ents = team->nthreads;
     312    else if (ws->sched == GFS_GUIDED)
     313      num_ents = counts[0];
     314    else
     315      num_ents = (counts[0] - 1) / chunk_size + 1;
     316    if (num_bits <= MAX_COLLAPSED_BITS)
     317      {
     318        elt_sz = sizeof (unsigned long);
     319        shift_sz = ncounts * sizeof (unsigned int);
     320      }
     321    else
     322      {
     323        elt_sz = sizeof (unsigned long) * ncounts;
     324        shift_sz = 0;
     325      }
     326    elt_sz = (elt_sz + 63) & ~63UL;
     327  
     328    doacross = gomp_malloc (sizeof (*doacross) + 63 + num_ents * elt_sz
     329  			  + shift_sz + extra);
     330    doacross->chunk_size = chunk_size;
     331    doacross->elt_sz = elt_sz;
     332    doacross->ncounts = ncounts;
     333    doacross->flattened = false;
     334    doacross->array = (unsigned char *)
     335  		    ((((uintptr_t) (doacross + 1)) + 63 + shift_sz)
     336  		     & ~(uintptr_t) 63);
     337    if (extra)
     338      {
     339        doacross->extra = doacross->array + num_ents * elt_sz;
     340        memset (doacross->extra, '\0', extra);
     341      }
     342    else
     343      doacross->extra = NULL;
     344    if (num_bits <= MAX_COLLAPSED_BITS)
     345      {
     346        unsigned int shift_count = 0;
     347        doacross->flattened = true;
     348        for (i = ncounts; i > 0; i--)
     349  	{
     350  	  doacross->shift_counts[i - 1] = shift_count;
     351  	  shift_count += bits[i - 1];
     352  	}
     353        for (ent = 0; ent < num_ents; ent++)
     354  	*(unsigned long *) (doacross->array + ent * elt_sz) = 0;
     355      }
     356    else
     357      for (ent = 0; ent < num_ents; ent++)
     358        memset (doacross->array + ent * elt_sz, '\0',
     359  	      sizeof (unsigned long) * ncounts);
     360    if (ws->sched == GFS_STATIC && chunk_size == 0)
     361      {
     362        unsigned long q = counts[0] / num_ents;
     363        unsigned long t = counts[0] % num_ents;
     364        doacross->boundary = t * (q + 1);
     365        doacross->q = q;
     366        doacross->t = t;
     367      }
     368    ws->doacross = doacross;
     369  }
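
           /* Worked example of the flattened encoding above (illustrative
              only).  With ncounts == 2, counts == { 1000, 20 } and a 64-bit
              unsigned long, the outer dimension needs 10 bits (999 fits in 10
              bits) and the inner one 5 bits (19 fits in 5), so num_bits == 15
              and the whole iteration vector is packed into one unsigned long
              with shift_counts == { 5, 0 }.  GOMP_doacross_post below then
              publishes the single word ((counts[0] << 5) | counts[1]) + 1, and
              GOMP_doacross_wait spins on that same word.  */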
     370  
     371  /* DOACROSS POST operation.  */
     372  
     373  void
     374  GOMP_doacross_post (long *counts)
     375  {
     376    struct gomp_thread *thr = gomp_thread ();
     377    struct gomp_work_share *ws = thr->ts.work_share;
     378    struct gomp_doacross_work_share *doacross = ws->doacross;
     379    unsigned long ent;
     380    unsigned int i;
     381  
     382    if (__builtin_expect (doacross == NULL, 0)
     383        || __builtin_expect (doacross->array == NULL, 0))
     384      {
     385        __sync_synchronize ();
     386        return;
     387      }
     388  
     389    if (__builtin_expect (ws->sched == GFS_STATIC, 1))
     390      ent = thr->ts.team_id;
     391    else if (ws->sched == GFS_GUIDED)
     392      ent = counts[0];
     393    else
     394      ent = counts[0] / doacross->chunk_size;
     395    unsigned long *array = (unsigned long *) (doacross->array
     396  					    + ent * doacross->elt_sz);
     397  
     398    if (__builtin_expect (doacross->flattened, 1))
     399      {
     400        unsigned long flattened
     401  	= (unsigned long) counts[0] << doacross->shift_counts[0];
     402  
     403        for (i = 1; i < doacross->ncounts; i++)
     404  	flattened |= (unsigned long) counts[i]
     405  		     << doacross->shift_counts[i];
     406        flattened++;
     407        if (flattened == __atomic_load_n (array, MEMMODEL_ACQUIRE))
     408  	__atomic_thread_fence (MEMMODEL_RELEASE);
     409        else
     410  	__atomic_store_n (array, flattened, MEMMODEL_RELEASE);
     411        return;
     412      }
     413  
     414    __atomic_thread_fence (MEMMODEL_ACQUIRE);
     415    for (i = doacross->ncounts; i-- > 0; )
     416      {
     417        if (counts[i] + 1UL != __atomic_load_n (&array[i], MEMMODEL_RELAXED))
     418  	__atomic_store_n (&array[i], counts[i] + 1UL, MEMMODEL_RELEASE);
     419      }
     420  }
     421  
     422  /* DOACROSS WAIT operation.  */
     423  
     424  void
     425  GOMP_doacross_wait (long first, ...)
     426  {
     427    struct gomp_thread *thr = gomp_thread ();
     428    struct gomp_work_share *ws = thr->ts.work_share;
     429    struct gomp_doacross_work_share *doacross = ws->doacross;
     430    va_list ap;
     431    unsigned long ent;
     432    unsigned int i;
     433  
     434    if (__builtin_expect (doacross == NULL, 0)
     435        || __builtin_expect (doacross->array == NULL, 0))
     436      {
     437        __sync_synchronize ();
     438        return;
     439      }
     440  
     441    if (__builtin_expect (ws->sched == GFS_STATIC, 1))
     442      {
     443        if (ws->chunk_size == 0)
     444  	{
     445  	  if (first < doacross->boundary)
     446  	    ent = first / (doacross->q + 1);
     447  	  else
     448  	    ent = (first - doacross->boundary) / doacross->q
     449  		  + doacross->t;
     450  	}
     451        else
     452  	ent = first / ws->chunk_size % thr->ts.team->nthreads;
     453      }
     454    else if (ws->sched == GFS_GUIDED)
     455      ent = first;
     456    else
     457      ent = first / doacross->chunk_size;
     458    unsigned long *array = (unsigned long *) (doacross->array
     459  					    + ent * doacross->elt_sz);
     460  
     461    if (__builtin_expect (doacross->flattened, 1))
     462      {
     463        unsigned long flattened
     464  	= (unsigned long) first << doacross->shift_counts[0];
     465        unsigned long cur;
     466  
     467        va_start (ap, first);
     468        for (i = 1; i < doacross->ncounts; i++)
     469  	flattened |= (unsigned long) va_arg (ap, long)
     470  		     << doacross->shift_counts[i];
     471        cur = __atomic_load_n (array, MEMMODEL_ACQUIRE);
     472        if (flattened < cur)
     473  	{
     474  	  __atomic_thread_fence (MEMMODEL_RELEASE);
     475  	  va_end (ap);
     476  	  return;
     477  	}
     478        doacross_spin (array, flattened, cur);
     479        __atomic_thread_fence (MEMMODEL_RELEASE);
     480        va_end (ap);
     481        return;
     482      }
     483  
     484    do
     485      {
     486        va_start (ap, first);
     487        for (i = 0; i < doacross->ncounts; i++)
     488  	{
     489  	  unsigned long thisv
     490  	    = (unsigned long) (i ? va_arg (ap, long) : first) + 1;
     491  	  unsigned long cur = __atomic_load_n (&array[i], MEMMODEL_RELAXED);
     492  	  if (thisv < cur)
     493  	    {
     494  	      i = doacross->ncounts;
     495  	      break;
     496  	    }
     497  	  if (thisv > cur)
     498  	    break;
     499  	}
     500        va_end (ap);
     501        if (i == doacross->ncounts)
     502  	break;
     503        cpu_relax ();
     504      }
     505    while (1);
     506    __sync_synchronize ();
     507  }
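
           /* A rough sketch (illustrative only) of the doacross calling
              pattern.  Given a loop nest such as

                  #pragma omp for ordered(2)
                  for (i = 0; i < n; i++)
                    for (j = 0; j < m; j++)
                      {
                  #pragma omp ordered depend(sink: i - 1, j)
                        ... consume data produced by iteration (i - 1, j) ...
                  #pragma omp ordered depend(source)
                      }

              the compiler is expected to lower depend(sink: ...) to a
              GOMP_doacross_wait call carrying the sink iteration vector and
              depend(source) to a GOMP_doacross_post call carrying the current
              iteration vector, while the loop bounds reach gomp_doacross_init
              when the work share is set up.  */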
     508  
     509  typedef unsigned long long gomp_ull;
     510  
     511  void
     512  gomp_doacross_ull_init (unsigned ncounts, gomp_ull *counts,
     513  			gomp_ull chunk_size, size_t extra)
     514  {
     515    struct gomp_thread *thr = gomp_thread ();
     516    struct gomp_team *team = thr->ts.team;
     517    struct gomp_work_share *ws = thr->ts.work_share;
     518    unsigned int i, bits[MAX_COLLAPSED_BITS], num_bits = 0;
     519    unsigned long ent, num_ents, elt_sz, shift_sz;
     520    struct gomp_doacross_work_share *doacross;
     521  
     522    if (team == NULL || team->nthreads == 1)
     523      {
     524      empty:
     525        if (!extra)
     526  	ws->doacross = NULL;
     527        else
     528  	{
     529  	  doacross = gomp_malloc_cleared (sizeof (*doacross) + extra);
     530  	  doacross->extra = (void *) (doacross + 1);
     531  	  ws->doacross = doacross;
     532  	}
     533        return;
     534      }
     535  
     536    for (i = 0; i < ncounts; i++)
     537      {
     538        /* If any count is 0, GOMP_doacross_{post,wait} can't be called.  */
     539        if (counts[i] == 0)
     540  	goto empty;
     541  
     542        if (num_bits <= MAX_COLLAPSED_BITS)
     543  	{
     544  	  unsigned int this_bits;
     545  	  if (counts[i] == 1)
     546  	    this_bits = 1;
     547  	  else
     548  	    this_bits = __SIZEOF_LONG_LONG__ * __CHAR_BIT__
     549  			- __builtin_clzll (counts[i] - 1);
     550  	  if (num_bits + this_bits <= MAX_COLLAPSED_BITS)
     551  	    {
     552  	      bits[i] = this_bits;
     553  	      num_bits += this_bits;
     554  	    }
     555  	  else
     556  	    num_bits = MAX_COLLAPSED_BITS + 1;
     557  	}
     558      }
     559  
     560    if (ws->sched == GFS_STATIC)
     561      num_ents = team->nthreads;
     562    else if (ws->sched == GFS_GUIDED)
     563      num_ents = counts[0];
     564    else
     565      num_ents = (counts[0] - 1) / chunk_size + 1;
     566    if (num_bits <= MAX_COLLAPSED_BITS)
     567      {
     568        elt_sz = sizeof (unsigned long);
     569        shift_sz = ncounts * sizeof (unsigned int);
     570      }
     571    else
     572      {
     573        if (sizeof (gomp_ull) == sizeof (unsigned long))
     574  	elt_sz = sizeof (gomp_ull) * ncounts;
     575        else if (sizeof (gomp_ull) == 2 * sizeof (unsigned long))
     576  	elt_sz = sizeof (unsigned long) * 2 * ncounts;
     577        else
     578  	abort ();
     579        shift_sz = 0;
     580      }
     581    elt_sz = (elt_sz + 63) & ~63UL;
     582  
     583    doacross = gomp_malloc (sizeof (*doacross) + 63 + num_ents * elt_sz
      584  			  + shift_sz + extra);
     585    doacross->chunk_size_ull = chunk_size;
     586    doacross->elt_sz = elt_sz;
     587    doacross->ncounts = ncounts;
     588    doacross->flattened = false;
     589    doacross->boundary = 0;
     590    doacross->array = (unsigned char *)
     591  		    ((((uintptr_t) (doacross + 1)) + 63 + shift_sz)
     592  		     & ~(uintptr_t) 63);
     593    if (extra)
     594      {
     595        doacross->extra = doacross->array + num_ents * elt_sz;
     596        memset (doacross->extra, '\0', extra);
     597      }
     598    else
     599      doacross->extra = NULL;
     600    if (num_bits <= MAX_COLLAPSED_BITS)
     601      {
     602        unsigned int shift_count = 0;
     603        doacross->flattened = true;
     604        for (i = ncounts; i > 0; i--)
     605  	{
     606  	  doacross->shift_counts[i - 1] = shift_count;
     607  	  shift_count += bits[i - 1];
     608  	}
     609        for (ent = 0; ent < num_ents; ent++)
     610  	*(unsigned long *) (doacross->array + ent * elt_sz) = 0;
     611      }
     612    else
     613      for (ent = 0; ent < num_ents; ent++)
     614        memset (doacross->array + ent * elt_sz, '\0',
     615  	      sizeof (unsigned long) * ncounts);
     616    if (ws->sched == GFS_STATIC && chunk_size == 0)
     617      {
     618        gomp_ull q = counts[0] / num_ents;
     619        gomp_ull t = counts[0] % num_ents;
     620        doacross->boundary_ull = t * (q + 1);
     621        doacross->q_ull = q;
     622        doacross->t = t;
     623      }
     624    ws->doacross = doacross;
     625  }
     626  
     627  /* DOACROSS POST operation.  */
     628  
     629  void
     630  GOMP_doacross_ull_post (gomp_ull *counts)
     631  {
     632    struct gomp_thread *thr = gomp_thread ();
     633    struct gomp_work_share *ws = thr->ts.work_share;
     634    struct gomp_doacross_work_share *doacross = ws->doacross;
     635    unsigned long ent;
     636    unsigned int i;
     637  
     638    if (__builtin_expect (doacross == NULL, 0)
     639        || __builtin_expect (doacross->array == NULL, 0))
     640      {
     641        __sync_synchronize ();
     642        return;
     643      }
     644  
     645    if (__builtin_expect (ws->sched == GFS_STATIC, 1))
     646      ent = thr->ts.team_id;
     647    else if (ws->sched == GFS_GUIDED)
     648      ent = counts[0];
     649    else
     650      ent = counts[0] / doacross->chunk_size_ull;
     651  
     652    if (__builtin_expect (doacross->flattened, 1))
     653      {
     654        unsigned long *array = (unsigned long *) (doacross->array
     655  			      + ent * doacross->elt_sz);
     656        gomp_ull flattened
     657  	= counts[0] << doacross->shift_counts[0];
     658  
     659        for (i = 1; i < doacross->ncounts; i++)
     660  	flattened |= counts[i] << doacross->shift_counts[i];
     661        flattened++;
     662        if (flattened == __atomic_load_n (array, MEMMODEL_ACQUIRE))
     663  	__atomic_thread_fence (MEMMODEL_RELEASE);
     664        else
     665  	__atomic_store_n (array, flattened, MEMMODEL_RELEASE);
     666        return;
     667      }
     668  
     669    __atomic_thread_fence (MEMMODEL_ACQUIRE);
     670    if (sizeof (gomp_ull) == sizeof (unsigned long))
     671      {
     672        gomp_ull *array = (gomp_ull *) (doacross->array
     673  				      + ent * doacross->elt_sz);
     674  
     675        for (i = doacross->ncounts; i-- > 0; )
     676  	{
     677  	  if (counts[i] + 1UL != __atomic_load_n (&array[i], MEMMODEL_RELAXED))
     678  	    __atomic_store_n (&array[i], counts[i] + 1UL, MEMMODEL_RELEASE);
     679  	}
     680      }
     681    else
     682      {
     683        unsigned long *array = (unsigned long *) (doacross->array
     684  						+ ent * doacross->elt_sz);
     685  
     686        for (i = doacross->ncounts; i-- > 0; )
     687  	{
     688  	  gomp_ull cull = counts[i] + 1UL;
     689  	  unsigned long c = (unsigned long) cull;
     690  	  if (c != __atomic_load_n (&array[2 * i + 1], MEMMODEL_RELAXED))
     691  	    __atomic_store_n (&array[2 * i + 1], c, MEMMODEL_RELEASE);
     692  	  c = cull >> (__SIZEOF_LONG_LONG__ * __CHAR_BIT__ / 2);
     693  	  if (c != __atomic_load_n (&array[2 * i], MEMMODEL_RELAXED))
     694  	    __atomic_store_n (&array[2 * i], c, MEMMODEL_RELEASE);
     695  	}
     696      }
     697  }
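
           /* Illustrative note: when gomp_ull is twice the width of unsigned
              long (e.g. a 32-bit target), each count occupies two array slots,
              with the high half at array[2 * i] and the low half at
              array[2 * i + 1].  A posted value of 0x100000002, say, is stored
              as { 1, 2 }; GOMP_doacross_ull_wait below compares the high
              halves first and only then the low halves.  */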
     698  
     699  /* DOACROSS WAIT operation.  */
     700  
     701  void
     702  GOMP_doacross_ull_wait (gomp_ull first, ...)
     703  {
     704    struct gomp_thread *thr = gomp_thread ();
     705    struct gomp_work_share *ws = thr->ts.work_share;
     706    struct gomp_doacross_work_share *doacross = ws->doacross;
     707    va_list ap;
     708    unsigned long ent;
     709    unsigned int i;
     710  
     711    if (__builtin_expect (doacross == NULL, 0)
     712        || __builtin_expect (doacross->array == NULL, 0))
     713      {
     714        __sync_synchronize ();
     715        return;
     716      }
     717  
     718    if (__builtin_expect (ws->sched == GFS_STATIC, 1))
     719      {
     720        if (ws->chunk_size_ull == 0)
     721  	{
     722  	  if (first < doacross->boundary_ull)
     723  	    ent = first / (doacross->q_ull + 1);
     724  	  else
     725  	    ent = (first - doacross->boundary_ull) / doacross->q_ull
     726  		  + doacross->t;
     727  	}
     728        else
     729  	ent = first / ws->chunk_size_ull % thr->ts.team->nthreads;
     730      }
     731    else if (ws->sched == GFS_GUIDED)
     732      ent = first;
     733    else
     734      ent = first / doacross->chunk_size_ull;
     735  
     736    if (__builtin_expect (doacross->flattened, 1))
     737      {
     738        unsigned long *array = (unsigned long *) (doacross->array
     739  						+ ent * doacross->elt_sz);
     740        gomp_ull flattened = first << doacross->shift_counts[0];
     741        unsigned long cur;
     742  
     743        va_start (ap, first);
     744        for (i = 1; i < doacross->ncounts; i++)
     745  	flattened |= va_arg (ap, gomp_ull)
     746  		     << doacross->shift_counts[i];
     747        cur = __atomic_load_n (array, MEMMODEL_ACQUIRE);
     748        if (flattened < cur)
     749  	{
     750  	  __atomic_thread_fence (MEMMODEL_RELEASE);
     751  	  va_end (ap);
     752  	  return;
     753  	}
     754        doacross_spin (array, flattened, cur);
     755        __atomic_thread_fence (MEMMODEL_RELEASE);
     756        va_end (ap);
     757        return;
     758      }
     759  
     760    if (sizeof (gomp_ull) == sizeof (unsigned long))
     761      {
     762        gomp_ull *array = (gomp_ull *) (doacross->array
     763  				      + ent * doacross->elt_sz);
     764        do
     765  	{
     766  	  va_start (ap, first);
     767  	  for (i = 0; i < doacross->ncounts; i++)
     768  	    {
     769  	      gomp_ull thisv
     770  		= (i ? va_arg (ap, gomp_ull) : first) + 1;
     771  	      gomp_ull cur = __atomic_load_n (&array[i], MEMMODEL_RELAXED);
     772  	      if (thisv < cur)
     773  		{
     774  		  i = doacross->ncounts;
     775  		  break;
     776  		}
     777  	      if (thisv > cur)
     778  		break;
     779  	    }
     780  	  va_end (ap);
     781  	  if (i == doacross->ncounts)
     782  	    break;
     783  	  cpu_relax ();
     784  	}
     785        while (1);
     786      }
     787    else
     788      {
     789        unsigned long *array = (unsigned long *) (doacross->array
     790  						+ ent * doacross->elt_sz);
     791        do
     792  	{
     793  	  va_start (ap, first);
     794  	  for (i = 0; i < doacross->ncounts; i++)
     795  	    {
     796  	      gomp_ull thisv
     797  		= (i ? va_arg (ap, gomp_ull) : first) + 1;
     798  	      unsigned long t
     799  		= thisv >> (__SIZEOF_LONG_LONG__ * __CHAR_BIT__ / 2);
     800  	      unsigned long cur
     801  		= __atomic_load_n (&array[2 * i], MEMMODEL_RELAXED);
     802  	      if (t < cur)
     803  		{
     804  		  i = doacross->ncounts;
     805  		  break;
     806  		}
     807  	      if (t > cur)
     808  		break;
     809  	      t = thisv;
     810  	      cur = __atomic_load_n (&array[2 * i + 1], MEMMODEL_RELAXED);
     811  	      if (t < cur)
     812  		{
     813  		  i = doacross->ncounts;
     814  		  break;
     815  		}
     816  	      if (t > cur)
     817  		break;
     818  	    }
     819  	  va_end (ap);
     820  	  if (i == doacross->ncounts)
     821  	    break;
     822  	  cpu_relax ();
     823  	}
     824        while (1);
     825      }
     826    __sync_synchronize ();
     827  }