(root)/
gcc-13.2.0/
libstdc++-v3/
include/
parallel/
balanced_quicksort.h
       1  // -*- C++ -*-
       2  
       3  // Copyright (C) 2007-2023 Free Software Foundation, Inc.
       4  //
       5  // This file is part of the GNU ISO C++ Library.  This library is free
       6  // software; you can redistribute it and/or modify it under the terms
       7  // of the GNU General Public License as published by the Free Software
       8  // Foundation; either version 3, or (at your option) any later
       9  // version.
      10  
      11  // This library is distributed in the hope that it will be useful, but
      12  // WITHOUT ANY WARRANTY; without even the implied warranty of
      13  // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
      14  // General Public License for more details.
      15  
      16  // Under Section 7 of GPL version 3, you are granted additional
      17  // permissions described in the GCC Runtime Library Exception, version
      18  // 3.1, as published by the Free Software Foundation.
      19  
      20  // You should have received a copy of the GNU General Public License and
      21  // a copy of the GCC Runtime Library Exception along with this program;
      22  // see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
      23  // <http://www.gnu.org/licenses/>.
      24  
      25  /** @file parallel/balanced_quicksort.h
      26   *  @brief Implementation of a dynamically load-balanced parallel quicksort.
      27   *
      28   *  It works in-place and needs only logarithmic extra memory.
      29   *  The algorithm is similar to the one proposed in
      30   *
      31   *  P. Tsigas and Y. Zhang.
      32   *  A simple, fast parallel implementation of quicksort and
      33   *  its performance evaluation on SUN enterprise 10000.
      34   *  In 11th Euromicro Conference on Parallel, Distributed and
      35   *  Network-Based Processing, page 372, 2003.
      36   *
      37   *  This file is a GNU parallel extension to the Standard C++ Library.
      38   */
      39  
      40  // Written by Johannes Singler.
      41  
      42  #ifndef _GLIBCXX_PARALLEL_BALANCED_QUICKSORT_H
      43  #define _GLIBCXX_PARALLEL_BALANCED_QUICKSORT_H 1
      44  
      45  #include <parallel/basic_iterator.h>
      46  #include <bits/stl_algo.h>
      47  #include <bits/stl_function.h>
      48  
      49  #include <parallel/settings.h>
      50  #include <parallel/partition.h>
      51  #include <parallel/random_number.h>
      52  #include <parallel/queue.h>
      53  
      54  #if _GLIBCXX_PARALLEL_ASSERTIONS
      55  #include <parallel/checkers.h>
      56  #ifdef _GLIBCXX_HAVE_UNISTD_H
      57  #include <unistd.h>
      58  #endif
      59  #endif
      60  
      61  namespace __gnu_parallel
      62  {
      63    /** @brief Information local to one thread in the parallel quicksort run. */
      64    template<typename _RAIter>
      65      struct _QSBThreadLocal
      66      {
      67        typedef std::iterator_traits<_RAIter> _TraitsType;
      68        typedef typename _TraitsType::difference_type _DifferenceType;
      69  
      70        /** @brief Continuous part of the sequence, described by an
      71        iterator pair. */
      72        typedef std::pair<_RAIter, _RAIter> _Piece;
      73  
      74        /** @brief Initial piece to work on. */
      75        _Piece _M_initial;
      76  
      77        /** @brief Work-stealing queue. */
      78        _RestrictedBoundedConcurrentQueue<_Piece> _M_leftover_parts;
      79  
      80        /** @brief Number of threads involved in this algorithm. */
      81        _ThreadIndex _M_num_threads;
      82  
      83        /** @brief Pointer to a counter of elements left over to sort. */
      84        volatile _DifferenceType* _M_elements_leftover;
      85  
      86        /** @brief The complete sequence to sort. */
      87        _Piece _M_global;
      88  
      89        /** @brief Constructor.
      90         *  @param __queue_size size of the work-stealing queue. */
      91        _QSBThreadLocal(int __queue_size) : _M_leftover_parts(__queue_size) { }
      92      };
      93  
      94    /** @brief Balanced quicksort divide step.
      95      *  @param __begin Begin iterator of subsequence.
      96      *  @param __end End iterator of subsequence.
      97      *  @param __comp Comparator.
      98      *  @param __num_threads Number of threads that are allowed to work on
      99      *  this part.
     100      *  @pre @c (__end-__begin)>=1 */
     101    template<typename _RAIter, typename _Compare>
     102      typename std::iterator_traits<_RAIter>::difference_type
     103      __qsb_divide(_RAIter __begin, _RAIter __end,
     104  		 _Compare __comp, _ThreadIndex __num_threads)
     105      {
     106        _GLIBCXX_PARALLEL_ASSERT(__num_threads > 0);
     107  
     108        typedef std::iterator_traits<_RAIter> _TraitsType;
     109        typedef typename _TraitsType::value_type _ValueType;
     110        typedef typename _TraitsType::difference_type _DifferenceType;
     111  
     112        _RAIter __pivot_pos =
     113  	__median_of_three_iterators(__begin, __begin + (__end - __begin) / 2,
     114  				    __end  - 1, __comp);
     115  
     116  #if defined(_GLIBCXX_PARALLEL_ASSERTIONS)
     117        // Must be in between somewhere.
     118        _DifferenceType __n = __end - __begin;
     119  
     120        _GLIBCXX_PARALLEL_ASSERT((!__comp(*__pivot_pos, *__begin)
     121  				&& !__comp(*(__begin + __n / 2),
     122  					   *__pivot_pos))
     123  			       || (!__comp(*__pivot_pos, *__begin)
     124  				   && !__comp(*(__end - 1), *__pivot_pos))
     125  			       || (!__comp(*__pivot_pos, *(__begin + __n / 2))
     126  				   && !__comp(*__begin, *__pivot_pos))
     127  			       || (!__comp(*__pivot_pos, *(__begin + __n / 2))
     128  				   && !__comp(*(__end - 1), *__pivot_pos))
     129  			       || (!__comp(*__pivot_pos, *(__end - 1))
     130  				   && !__comp(*__begin, *__pivot_pos))
     131  			       || (!__comp(*__pivot_pos, *(__end - 1))
     132  				   && !__comp(*(__begin + __n / 2),
     133  					      *__pivot_pos)));
     134  #endif
     135  
     136        // Swap pivot value to end.
     137        if (__pivot_pos != (__end - 1))
     138  	std::iter_swap(__pivot_pos, __end - 1);
     139        __pivot_pos = __end - 1;
     140  
     141        __gnu_parallel::__binder2nd<_Compare, _ValueType, _ValueType, bool>
     142  	__pred(__comp, *__pivot_pos);
     143  
     144        // Divide, returning __end - __begin - 1 in the worst case.
     145        _DifferenceType __split_pos = __parallel_partition(__begin, __end - 1,
     146  							 __pred,
     147  							 __num_threads);
     148  
     149        // Swap back pivot to middle.
     150        std::iter_swap(__begin + __split_pos, __pivot_pos);
     151        __pivot_pos = __begin + __split_pos;
     152  
     153  #if _GLIBCXX_PARALLEL_ASSERTIONS
     154        _RAIter __r;
     155        for (__r = __begin; __r != __pivot_pos; ++__r)
     156  	_GLIBCXX_PARALLEL_ASSERT(__comp(*__r, *__pivot_pos));
     157        for (; __r != __end; ++__r)
     158  	_GLIBCXX_PARALLEL_ASSERT(!__comp(*__r, *__pivot_pos));
     159  #endif
     160  
     161        return __split_pos;
     162      }
     163  
     164    /** @brief Quicksort conquer step.
     165      *  @param __tls Array of thread-local storages.
     166      *  @param __begin Begin iterator of subsequence.
     167      *  @param __end End iterator of subsequence.
     168      *  @param __comp Comparator.
     169      *  @param __iam Number of the thread processing this function.
     170      *  @param __num_threads
     171      *          Number of threads that are allowed to work on this part. */
     172    template<typename _RAIter, typename _Compare>
     173      void
     174      __qsb_conquer(_QSBThreadLocal<_RAIter>** __tls,
     175  		  _RAIter __begin, _RAIter __end,
     176  		  _Compare __comp,
     177  		  _ThreadIndex __iam, _ThreadIndex __num_threads,
     178  		  bool __parent_wait)
     179      {
     180        typedef std::iterator_traits<_RAIter> _TraitsType;
     181        typedef typename _TraitsType::value_type _ValueType;
     182        typedef typename _TraitsType::difference_type _DifferenceType;
     183  
     184        _DifferenceType __n = __end - __begin;
     185  
     186        if (__num_threads <= 1 || __n <= 1)
     187  	{
     188            __tls[__iam]->_M_initial.first  = __begin;
     189            __tls[__iam]->_M_initial.second = __end;
     190  
     191            __qsb_local_sort_with_helping(__tls, __comp, __iam, __parent_wait);
     192  
     193            return;
     194  	}
     195  
     196        // Divide step.
     197        _DifferenceType __split_pos =
     198  	__qsb_divide(__begin, __end, __comp, __num_threads);
     199  
     200  #if _GLIBCXX_PARALLEL_ASSERTIONS
     201        _GLIBCXX_PARALLEL_ASSERT(0 <= __split_pos &&
     202                                 __split_pos < (__end - __begin));
     203  #endif
     204  
     205        _ThreadIndex
     206  	__num_threads_leftside = std::max<_ThreadIndex>
     207  	(1, std::min<_ThreadIndex>(__num_threads - 1, __split_pos
     208  				   * __num_threads / __n));
     209  
     210  #     pragma omp atomic
     211        *__tls[__iam]->_M_elements_leftover -= (_DifferenceType)1;
     212  
     213        // Conquer step.
     214  #     pragma omp parallel num_threads(2)
     215        {
     216  	bool __wait;
     217  	if(omp_get_num_threads() < 2)
     218            __wait = false;
     219  	else
     220            __wait = __parent_wait;
     221  
     222  #       pragma omp sections
     223  	{
     224  #         pragma omp section
     225  	  {
     226  	    __qsb_conquer(__tls, __begin, __begin + __split_pos, __comp,
     227  			  __iam, __num_threads_leftside, __wait);
     228  	    __wait = __parent_wait;
     229  	  }
     230  	  // The pivot_pos is left in place, to ensure termination.
     231  #         pragma omp section
     232  	  {
     233  	    __qsb_conquer(__tls, __begin + __split_pos + 1, __end, __comp,
     234  			  __iam + __num_threads_leftside,
     235  			  __num_threads - __num_threads_leftside, __wait);
     236  	    __wait = __parent_wait;
     237  	  }
     238  	}
     239        }
     240      }
     241  
     242    /**
     243      *  @brief Quicksort step doing load-balanced local sort.
     244      *  @param __tls Array of thread-local storages.
     245      *  @param __comp Comparator.
     246      *  @param __iam Number of the thread processing this function.
     247      */
     248    template<typename _RAIter, typename _Compare>
     249      void
     250      __qsb_local_sort_with_helping(_QSBThreadLocal<_RAIter>** __tls,
     251  				  _Compare& __comp, _ThreadIndex __iam,
     252  				  bool __wait)
     253      {
     254        typedef std::iterator_traits<_RAIter> _TraitsType;
     255        typedef typename _TraitsType::value_type _ValueType;
     256        typedef typename _TraitsType::difference_type _DifferenceType;
     257        typedef std::pair<_RAIter, _RAIter> _Piece;
     258  
     259        _QSBThreadLocal<_RAIter>& __tl = *__tls[__iam];
     260  
     261        _DifferenceType
     262  	__base_case_n = _Settings::get().sort_qsb_base_case_maximal_n;
     263        if (__base_case_n < 2)
     264  	__base_case_n = 2;
     265        _ThreadIndex __num_threads = __tl._M_num_threads;
     266  
     267        // Every thread has its own random number generator.
     268        _RandomNumber __rng(__iam + 1);
     269  
     270        _Piece __current = __tl._M_initial;
     271  
     272        _DifferenceType __elements_done = 0;
     273  #if _GLIBCXX_PARALLEL_ASSERTIONS
     274        _DifferenceType __total_elements_done = 0;
     275  #endif
     276  
     277        for (;;)
     278  	{
     279            // Invariant: __current must be a valid (maybe empty) range.
     280            _RAIter __begin = __current.first, __end = __current.second;
     281            _DifferenceType __n = __end - __begin;
     282  
     283            if (__n > __base_case_n)
     284              {
     285                // Divide.
     286                _RAIter __pivot_pos = __begin +  __rng(__n);
     287  
     288                // Swap __pivot_pos value to end.
     289                if (__pivot_pos != (__end - 1))
     290          	std::iter_swap(__pivot_pos, __end - 1);
     291                __pivot_pos = __end - 1;
     292  
     293                __gnu_parallel::__binder2nd
     294  		<_Compare, _ValueType, _ValueType, bool>
     295  		__pred(__comp, *__pivot_pos);
     296  
     297                // Divide, leave pivot unchanged in last place.
     298                _RAIter __split_pos1, __split_pos2;
     299                __split_pos1 = __gnu_sequential::partition(__begin, __end - 1,
     300  							 __pred);
     301  
     302                // Left side: < __pivot_pos; __right side: >= __pivot_pos.
     303  #if _GLIBCXX_PARALLEL_ASSERTIONS
     304                _GLIBCXX_PARALLEL_ASSERT(__begin <= __split_pos1
     305                                         && __split_pos1 < __end);
     306  #endif
     307                // Swap pivot back to middle.
     308                if (__split_pos1 != __pivot_pos)
     309          	std::iter_swap(__split_pos1, __pivot_pos);
     310                __pivot_pos = __split_pos1;
     311  
     312                // In case all elements are equal, __split_pos1 == 0.
     313                if ((__split_pos1 + 1 - __begin) < (__n >> 7)
     314  		  || (__end - __split_pos1) < (__n >> 7))
     315          	{
     316                    // Very unequal split, one part smaller than one 128th
     317                    // elements not strictly larger than the pivot.
     318                    __gnu_parallel::__unary_negate<__gnu_parallel::__binder1st
     319                      <_Compare, _ValueType, _ValueType, bool>, _ValueType>
     320                      __pred(__gnu_parallel::__binder1st
     321                  	 <_Compare, _ValueType, _ValueType, bool>
     322  			   (__comp, *__pivot_pos));
     323  
     324                    // Find other end of pivot-equal range.
     325                    __split_pos2 = __gnu_sequential::partition(__split_pos1 + 1,
     326  							     __end, __pred);
     327          	}
     328                else
     329          	// Only skip the pivot.
     330          	__split_pos2 = __split_pos1 + 1;
     331  
     332                // Elements equal to pivot are done.
     333                __elements_done += (__split_pos2 - __split_pos1);
     334  #if _GLIBCXX_PARALLEL_ASSERTIONS
     335                __total_elements_done += (__split_pos2 - __split_pos1);
     336  #endif
     337                // Always push larger part onto stack.
     338                if (((__split_pos1 + 1) - __begin) < (__end - (__split_pos2)))
     339          	{
     340                    // Right side larger.
     341                    if ((__split_pos2) != __end)
     342                      __tl._M_leftover_parts.push_front
     343  		      (std::make_pair(__split_pos2, __end));
     344  
     345                    //__current.first = __begin;    //already set anyway
     346                    __current.second = __split_pos1;
     347                    continue;
     348          	}
     349                else
     350          	{
     351                    // Left side larger.
     352                    if (__begin != __split_pos1)
     353                      __tl._M_leftover_parts.push_front(std::make_pair
     354  						      (__begin, __split_pos1));
     355  
     356                    __current.first = __split_pos2;
     357                    //__current.second = __end;     //already set anyway
     358                    continue;
     359          	}
     360              }
     361            else
     362              {
     363                __gnu_sequential::sort(__begin, __end, __comp);
     364                __elements_done += __n;
     365  #if _GLIBCXX_PARALLEL_ASSERTIONS
     366                __total_elements_done += __n;
     367  #endif
     368  
     369                // Prefer own stack, small pieces.
     370                if (__tl._M_leftover_parts.pop_front(__current))
     371          	continue;
     372  
     373  #             pragma omp atomic
     374                *__tl._M_elements_leftover -= __elements_done;
     375  
     376                __elements_done = 0;
     377  
     378  #if _GLIBCXX_PARALLEL_ASSERTIONS
     379                double __search_start = omp_get_wtime();
     380  #endif
     381  
     382                // Look for new work.
     383                bool __successfully_stolen = false;
     384                while (__wait && *__tl._M_elements_leftover > 0
     385                       && !__successfully_stolen
     386  #if _GLIBCXX_PARALLEL_ASSERTIONS
     387                        // Possible dead-lock.
     388                       && (omp_get_wtime() < (__search_start + 1.0))
     389  #endif
     390  		     )
     391          	{
     392                    _ThreadIndex __victim;
     393                    __victim = __rng(__num_threads);
     394  
     395                    // Large pieces.
     396                    __successfully_stolen = (__victim != __iam)
     397  		    && __tls[__victim]->_M_leftover_parts.pop_back(__current);
     398                    if (!__successfully_stolen)
     399                      __yield();
     400  #if !defined(__ICC) && !defined(__ECC)
     401  #                 pragma omp flush
     402  #endif
     403          	}
     404  
     405  #if _GLIBCXX_PARALLEL_ASSERTIONS
     406                if (omp_get_wtime() >= (__search_start + 1.0))
     407          	{
     408                    sleep(1);
     409                    _GLIBCXX_PARALLEL_ASSERT(omp_get_wtime()
     410                                             < (__search_start + 1.0));
     411          	}
     412  #endif
     413                if (!__successfully_stolen)
     414          	{
     415  #if _GLIBCXX_PARALLEL_ASSERTIONS
     416                    _GLIBCXX_PARALLEL_ASSERT(*__tl._M_elements_leftover == 0);
     417  #endif
     418                    return;
     419          	}
     420              }
     421  	}
     422      }
     423  
     424    /** @brief Top-level quicksort routine.
     425      *  @param __begin Begin iterator of sequence.
     426      *  @param __end End iterator of sequence.
     427      *  @param __comp Comparator.
     428      *  @param __num_threads Number of threads that are allowed to work on
     429      *  this part.
     430      */
     431    template<typename _RAIter, typename _Compare>
     432      void
     433      __parallel_sort_qsb(_RAIter __begin, _RAIter __end,
     434  			_Compare __comp, _ThreadIndex __num_threads)
     435      {
     436        _GLIBCXX_CALL(__end - __begin)
     437  
     438        typedef std::iterator_traits<_RAIter> _TraitsType;
     439        typedef typename _TraitsType::value_type _ValueType;
     440        typedef typename _TraitsType::difference_type _DifferenceType;
     441        typedef std::pair<_RAIter, _RAIter> _Piece;
     442  
     443        typedef _QSBThreadLocal<_RAIter> _TLSType;
     444  
     445        _DifferenceType __n = __end - __begin;
     446  
     447        if (__n <= 1)
     448  	return;
     449  
     450        // At least one element per processor.
     451        if (__num_threads > __n)
     452  	__num_threads = static_cast<_ThreadIndex>(__n);
     453  
     454        // Initialize thread local storage
     455        _TLSType** __tls = new _TLSType*[__num_threads];
     456        _DifferenceType __queue_size = (__num_threads
     457  				      * (_ThreadIndex)(__rd_log2(__n) + 1));
     458        for (_ThreadIndex __t = 0; __t < __num_threads; ++__t)
     459  	__tls[__t] = new _QSBThreadLocal<_RAIter>(__queue_size);
     460  
     461        // There can never be more than ceil(__rd_log2(__n)) ranges on the
     462        // stack, because
     463        // 1. Only one processor pushes onto the stack
     464        // 2. The largest range has at most length __n
     465        // 3. Each range is larger than half of the range remaining
     466        volatile _DifferenceType __elements_leftover = __n;
     467        for (_ThreadIndex __i = 0; __i < __num_threads; ++__i)
     468  	{
     469            __tls[__i]->_M_elements_leftover = &__elements_leftover;
     470            __tls[__i]->_M_num_threads = __num_threads;
     471            __tls[__i]->_M_global = std::make_pair(__begin, __end);
     472  
     473            // Just in case nothing is left to assign.
     474            __tls[__i]->_M_initial = std::make_pair(__end, __end);
     475  	}
     476  
     477        // Main recursion call.
     478        __qsb_conquer(__tls, __begin, __begin + __n, __comp, 0,
     479  		    __num_threads, true);
     480  
     481  #if _GLIBCXX_PARALLEL_ASSERTIONS
     482        // All stack must be empty.
     483        _Piece __dummy;
     484        for (_ThreadIndex __i = 1; __i < __num_threads; ++__i)
     485  	_GLIBCXX_PARALLEL_ASSERT(
     486            !__tls[__i]->_M_leftover_parts.pop_back(__dummy));
     487  #endif
     488  
     489        for (_ThreadIndex __i = 0; __i < __num_threads; ++__i)
     490  	delete __tls[__i];
     491        delete[] __tls;
     492      }
     493  } // namespace __gnu_parallel
     494  
     495  #endif /* _GLIBCXX_PARALLEL_BALANCED_QUICKSORT_H */