// gcc-13.2.0/libstdc++-v3/include/pstl/parallel_backend_tbb.h
// -*- C++ -*-
//===-- parallel_backend_tbb.h --------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef _PSTL_PARALLEL_BACKEND_TBB_H
#define _PSTL_PARALLEL_BACKEND_TBB_H

#include <algorithm>
#include <atomic> // for std::atomic, used by the task machinery below
#include <type_traits>

#include "parallel_backend_utils.h"

// Bring in minimal required subset of Intel TBB
#include <tbb/blocked_range.h>
#include <tbb/parallel_for.h>
#include <tbb/parallel_reduce.h>
#include <tbb/parallel_scan.h>
#include <tbb/parallel_invoke.h>
#include <tbb/task_arena.h>
#include <tbb/tbb_allocator.h>
#include <tbb/task.h>

#if TBB_INTERFACE_VERSION < 10000
#    error Intel(R) Threading Building Blocks 2018 is required; older versions are not supported.
#endif

namespace __pstl
{
namespace __tbb_backend
{

//! Raw memory buffer with automatic freeing and no exceptions.
/** Some of our algorithms need to start with a raw memory buffer,
not an initialized array, because initialization/destruction
would make the span cost at least O(N). */
// tbb::tbb_allocator can improve performance in some cases.
template <typename _Tp>
class __buffer
{
    tbb::tbb_allocator<_Tp> _M_allocator;
    _Tp* _M_ptr;
    const std::size_t _M_buf_size;
    __buffer(const __buffer&) = delete;
    void
    operator=(const __buffer&) = delete;

  public:
    //! Try to obtain buffer of given size to store objects of _Tp type
    __buffer(std::size_t n) : _M_allocator(), _M_ptr(_M_allocator.allocate(n)), _M_buf_size(n) {}
    //! True if buffer was successfully obtained, false otherwise.
    operator bool() const { return _M_ptr != NULL; }
    //! Return pointer to buffer, or NULL if buffer could not be obtained.
    _Tp*
    get() const
    {
        return _M_ptr;
    }
    //! Destroy buffer
    ~__buffer() { _M_allocator.deallocate(_M_ptr, _M_buf_size); }
};
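
// A minimal usage sketch (illustrative only, not part of this header; __n and the
// element type are hypothetical): obtain raw storage, construct elements manually,
// and let ~__buffer free the memory.
//
//     __tbb_backend::__buffer<int> __buf(__n); // raw, uninitialized storage
//     if (__buf)                               // true if allocation succeeded
//     {
//         int* __p = __buf.get();
//         for (std::size_t __i = 0; __i != __n; ++__i)
//             ::new (__p + __i) int(0); // placement-construct each element
//         // ... use __p[0, __n); destroy non-trivial elements before __buf frees memory
//     }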

// Cancel execution of the current task group (wrapper over tbb::task / task_group_context).
inline void
__cancel_execution()
{
#if TBB_INTERFACE_VERSION <= 12000
    tbb::task::self().group()->cancel_group_execution();
#else
    tbb::task::current_context()->cancel_group_execution();
#endif
}

//------------------------------------------------------------------------
// parallel_for
//------------------------------------------------------------------------

template <class _Index, class _RealBody>
class __parallel_for_body
{
  public:
    __parallel_for_body(const _RealBody& __body) : _M_body(__body) {}
    __parallel_for_body(const __parallel_for_body& __body) : _M_body(__body._M_body) {}
    void
    operator()(const tbb::blocked_range<_Index>& __range) const
    {
        _M_body(__range.begin(), __range.end());
    }

  private:
    _RealBody _M_body;
};

//! Evaluation of brick f[i,j) for each subrange [i,j) of [first,last)
// wrapper over tbb::parallel_for
template <class _ExecutionPolicy, class _Index, class _Fp>
void
__parallel_for(_ExecutionPolicy&&, _Index __first, _Index __last, _Fp __f)
{
    tbb::this_task_arena::isolate([=]() {
        tbb::parallel_for(tbb::blocked_range<_Index>(__first, __last), __parallel_for_body<_Index, _Fp>(__f));
    });
}
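
// Illustrative call (hypothetical names __in, __out, __f; the policy argument is
// unused by this backend, so any execution policy object works):
//
//     __tbb_backend::__parallel_for(std::execution::par, _DiffType(0), __n,
//                                   [&](_DiffType __i, _DiffType __j) {
//                                       for (; __i != __j; ++__i)
//                                           __out[__i] = __f(__in[__i]);
//                                   });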

//! Parallel reduction over subranges [i,j) of [first,last)
// wrapper over tbb::parallel_reduce
template <class _ExecutionPolicy, class _Value, class _Index, typename _RealBody, typename _Reduction>
_Value
__parallel_reduce(_ExecutionPolicy&&, _Index __first, _Index __last, const _Value& __identity,
                  const _RealBody& __real_body, const _Reduction& __reduction)
{
    return tbb::this_task_arena::isolate([__first, __last, &__identity, &__real_body, &__reduction]() -> _Value {
        return tbb::parallel_reduce(
            tbb::blocked_range<_Index>(__first, __last), __identity,
            [__real_body](const tbb::blocked_range<_Index>& __r, const _Value& __value) -> _Value {
                return __real_body(__r.begin(), __r.end(), __value);
            },
            __reduction);
    });
}
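
// Illustrative reduction (hypothetical array __a of _ValueType): sum __a[0, __n).
//
//     auto __sum = __tbb_backend::__parallel_reduce(
//         std::execution::par, _DiffType(0), __n, _ValueType(0),
//         [&](_DiffType __i, _DiffType __j, _ValueType __init)
//         { return std::accumulate(__a + __i, __a + __j, __init); },
//         std::plus<_ValueType>());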

//------------------------------------------------------------------------
// parallel_transform_reduce
//
// Notation:
//      r(i,j,init) returns reduction of init with reduction over [i,j)
//      u(i) returns f(i,i+1,identity) for a hypothetical left identity element of r
//      c(x,y) combines values x and y that were the result of r or u
//------------------------------------------------------------------------

template <class _Index, class _Up, class _Tp, class _Cp, class _Rp>
struct __par_trans_red_body
{
    alignas(_Tp) char _M_sum_storage[sizeof(_Tp)]; // Holds generalized non-commutative sum when has_sum==true
    _Rp _M_brick_reduce;                           // Most likely to have non-empty layout
    _Up _M_u;
    _Cp _M_combine;
    bool _M_has_sum; // Put last to minimize size of class
    _Tp&
    sum()
    {
        _PSTL_ASSERT_MSG(_M_has_sum, "sum expected");
        return *(_Tp*)_M_sum_storage;
    }
    __par_trans_red_body(_Up __u, _Tp __init, _Cp __c, _Rp __r)
        : _M_brick_reduce(__r), _M_u(__u), _M_combine(__c), _M_has_sum(true)
    {
        new (_M_sum_storage) _Tp(__init);
    }

    __par_trans_red_body(__par_trans_red_body& __left, tbb::split)
        : _M_brick_reduce(__left._M_brick_reduce), _M_u(__left._M_u), _M_combine(__left._M_combine), _M_has_sum(false)
    {
    }

    ~__par_trans_red_body()
    {
        // 17.6.5.12 tells us to not worry about catching exceptions from destructors.
        if (_M_has_sum)
            sum().~_Tp();
    }

    void
    join(__par_trans_red_body& __rhs)
    {
        sum() = _M_combine(sum(), __rhs.sum());
    }

    void
    operator()(const tbb::blocked_range<_Index>& __range)
    {
        _Index __i = __range.begin();
        _Index __j = __range.end();
        if (!_M_has_sum)
        {
            _PSTL_ASSERT_MSG(__range.size() > 1, "there should be at least 2 elements");
            new (&_M_sum_storage)
                _Tp(_M_combine(_M_u(__i), _M_u(__i + 1))); // The condition i+1 < j is provided by the grain size of 3
            _M_has_sum = true;
            std::advance(__i, 2);
            if (__i == __j)
                return;
        }
        sum() = _M_brick_reduce(__i, __j, sum());
    }
};

template <class _ExecutionPolicy, class _Index, class _Up, class _Tp, class _Cp, class _Rp>
_Tp
__parallel_transform_reduce(_ExecutionPolicy&&, _Index __first, _Index __last, _Up __u, _Tp __init, _Cp __combine,
                            _Rp __brick_reduce)
{
    __tbb_backend::__par_trans_red_body<_Index, _Up, _Tp, _Cp, _Rp> __body(__u, __init, __combine, __brick_reduce);
    // The grain size of 3 is used in order to provide a minimum of 2 elements for each body
    tbb::this_task_arena::isolate(
        [__first, __last, &__body]() { tbb::parallel_reduce(tbb::blocked_range<_Index>(__first, __last, 3), __body); });
    return __body.sum();
}
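
// Tying the notation above to a concrete, illustrative instance (hypothetical
// array __a of _ValueType) computing a sum of squares: u(i) = __a[i] * __a[i],
// c = std::plus, and r(i,j,init) accumulates squares serially over [i,j).
//
//     auto __res = __tbb_backend::__parallel_transform_reduce(
//         std::execution::par, _DiffType(0), __n,
//         [&](_DiffType __i) { return __a[__i] * __a[__i]; },     // u
//         _ValueType(0),                                          // init
//         std::plus<_ValueType>(),                                // c
//         [&](_DiffType __i, _DiffType __j, _ValueType __init) {  // r
//             for (; __i != __j; ++__i)
//                 __init += __a[__i] * __a[__i];
//             return __init;
//         });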

//------------------------------------------------------------------------
// parallel_scan
//------------------------------------------------------------------------

template <class _Index, class _Up, class _Tp, class _Cp, class _Rp, class _Sp>
class __trans_scan_body
{
    alignas(_Tp) char _M_sum_storage[sizeof(_Tp)]; // Holds generalized non-commutative sum when has_sum==true
    _Rp _M_brick_reduce;                           // Most likely to have non-empty layout
    _Up _M_u;
    _Cp _M_combine;
    _Sp _M_scan;
    bool _M_has_sum; // Put last to minimize size of class
  public:
    __trans_scan_body(_Up __u, _Tp __init, _Cp __combine, _Rp __reduce, _Sp __scan)
        : _M_brick_reduce(__reduce), _M_u(__u), _M_combine(__combine), _M_scan(__scan), _M_has_sum(true)
    {
        new (_M_sum_storage) _Tp(__init);
    }

    __trans_scan_body(__trans_scan_body& __b, tbb::split)
        : _M_brick_reduce(__b._M_brick_reduce), _M_u(__b._M_u), _M_combine(__b._M_combine), _M_scan(__b._M_scan),
          _M_has_sum(false)
    {
    }

    ~__trans_scan_body()
    {
        // 17.6.5.12 tells us to not worry about catching exceptions from destructors.
        if (_M_has_sum)
            sum().~_Tp();
    }

    _Tp&
    sum() const
    {
        _PSTL_ASSERT_MSG(_M_has_sum, "sum expected");
        return *const_cast<_Tp*>(reinterpret_cast<_Tp const*>(_M_sum_storage));
    }

    void
    operator()(const tbb::blocked_range<_Index>& __range, tbb::pre_scan_tag)
    {
        _Index __i = __range.begin();
        _Index __j = __range.end();
        if (!_M_has_sum)
        {
            new (&_M_sum_storage) _Tp(_M_u(__i));
            _M_has_sum = true;
            ++__i;
            if (__i == __j)
                return;
        }
        sum() = _M_brick_reduce(__i, __j, sum());
    }

    void
    operator()(const tbb::blocked_range<_Index>& __range, tbb::final_scan_tag)
    {
        sum() = _M_scan(__range.begin(), __range.end(), sum());
    }

    void
    reverse_join(__trans_scan_body& __a)
    {
        if (_M_has_sum)
        {
            sum() = _M_combine(__a.sum(), sum());
        }
        else
        {
            new (&_M_sum_storage) _Tp(__a.sum());
            _M_has_sum = true;
        }
    }

    void
    assign(__trans_scan_body& __b)
    {
        sum() = __b.sum();
    }
};

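// __split(__m) returns the largest power of two strictly less than __m (e.g.
// __split(10) == 8, __split(8) == 4); __upsweep and __downsweep below use it to
// bisect a range of __m tiles.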
template <typename _Index>
_Index
__split(_Index __m)
{
    _Index __k = 1;
    while (2 * __k < __m)
        __k *= 2;
    return __k;
}

//------------------------------------------------------------------------
// __parallel_strict_scan
//------------------------------------------------------------------------

template <typename _Index, typename _Tp, typename _Rp, typename _Cp>
void
__upsweep(_Index __i, _Index __m, _Index __tilesize, _Tp* __r, _Index __lastsize, _Rp __reduce, _Cp __combine)
{
    if (__m == 1)
        __r[0] = __reduce(__i * __tilesize, __lastsize);
    else
    {
        _Index __k = __split(__m);
        tbb::parallel_invoke(
            [=] { __tbb_backend::__upsweep(__i, __k, __tilesize, __r, __tilesize, __reduce, __combine); },
            [=] {
                __tbb_backend::__upsweep(__i + __k, __m - __k, __tilesize, __r + __k, __lastsize, __reduce, __combine);
            });
        if (__m == 2 * __k)
            __r[__m - 1] = __combine(__r[__k - 1], __r[__m - 1]);
    }
}

template <typename _Index, typename _Tp, typename _Cp, typename _Sp>
void
__downsweep(_Index __i, _Index __m, _Index __tilesize, _Tp* __r, _Index __lastsize, _Tp __initial, _Cp __combine,
            _Sp __scan)
{
    if (__m == 1)
        __scan(__i * __tilesize, __lastsize, __initial);
    else
    {
        const _Index __k = __split(__m);
        tbb::parallel_invoke(
            [=] { __tbb_backend::__downsweep(__i, __k, __tilesize, __r, __tilesize, __initial, __combine, __scan); },
            // Assumes that __combine never throws.
            //TODO: Consider adding a requirement for user functors to be constant.
            [=, &__combine] {
                __tbb_backend::__downsweep(__i + __k, __m - __k, __tilesize, __r + __k, __lastsize,
                                           __combine(__initial, __r[__k - 1]), __combine, __scan);
            });
    }
}

// Adapted from Intel(R) Cilk(TM) version from cilkpub.
// Let i:len denote a counted interval of length len starting at i.  s denotes a generalized-sum value.
// Expected actions of the functors are:
//     reduce(i,len) -> s  -- return reduction value of i:len.
//     combine(s1,s2) -> s -- return merged sum.
//     apex(s) -- do any processing necessary between reduce and scan.
//     scan(i,len,initial) -- perform scan over i:len starting with initial.
// The initial range 0:n is partitioned into consecutive subranges.
// reduce and scan are each called exactly once per subrange.
// Thus callers can rely upon side effects in reduce.
// combine must not throw an exception.
// apex is called exactly once, after all calls to reduce and before all calls to scan.
// For example, it's useful for allocating a __buffer used by scan but whose size is the sum of all reduction values.
// T must have a trivial constructor and destructor.
template <class _ExecutionPolicy, typename _Index, typename _Tp, typename _Rp, typename _Cp, typename _Sp, typename _Ap>
void
__parallel_strict_scan(_ExecutionPolicy&&, _Index __n, _Tp __initial, _Rp __reduce, _Cp __combine, _Sp __scan,
                       _Ap __apex)
{
    tbb::this_task_arena::isolate([=, &__combine]() {
        if (__n > 1)
        {
            _Index __p = tbb::this_task_arena::max_concurrency();
            const _Index __slack = 4;
            _Index __tilesize = (__n - 1) / (__slack * __p) + 1;
            _Index __m = (__n - 1) / __tilesize;
            __buffer<_Tp> __buf(__m + 1);
            _Tp* __r = __buf.get();
            __tbb_backend::__upsweep(_Index(0), _Index(__m + 1), __tilesize, __r, __n - __m * __tilesize, __reduce,
                                     __combine);

            // When __apex is a no-op and __combine has no side effects, a good optimizer
            // should be able to eliminate all code between here and __apex.
            // Alternatively, provide a default value for __apex that can be
            // recognized by metaprogramming that conditionally executes the following.
            size_t __k = __m + 1;
            _Tp __t = __r[__k - 1];
            while ((__k &= __k - 1))
                __t = __combine(__r[__k - 1], __t);
            __apex(__combine(__initial, __t));
            __tbb_backend::__downsweep(_Index(0), _Index(__m + 1), __tilesize, __r, __n - __m * __tilesize, __initial,
                                       __combine, __scan);
            return;
        }
        // Fewer than 2 elements in sequence, or out of memory.  Handle as a single block.
        _Tp __sum = __initial;
        if (__n)
            __sum = __combine(__sum, __reduce(_Index(0), __n));
        __apex(__sum);
        if (__n)
            __scan(_Index(0), __n, __initial);
    });
}
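
// A small illustrative instantiation of the contract above (hypothetical __in,
// __out, _DiffType, _ValueType): an exclusive prefix sum with a no-op apex.
//
//     __tbb_backend::__parallel_strict_scan(
//         std::execution::par, __n, _ValueType(0),
//         [&](_DiffType __i, _DiffType __len) // reduce
//         { return std::accumulate(__in + __i, __in + __i + __len, _ValueType(0)); },
//         std::plus<_ValueType>(),            // combine
//         [&](_DiffType __i, _DiffType __len, _ValueType __init) { // scan
//             for (_DiffType __k = 0; __k != __len; ++__k)
//             {
//                 __out[__i + __k] = __init;
//                 __init = __init + __in[__i + __k];
//             }
//         },
//         [](_ValueType) {});                 // apex: no-op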

template <class _ExecutionPolicy, class _Index, class _Up, class _Tp, class _Cp, class _Rp, class _Sp>
_Tp
__parallel_transform_scan(_ExecutionPolicy&&, _Index __n, _Up __u, _Tp __init, _Cp __combine, _Rp __brick_reduce,
                          _Sp __scan)
{
    __trans_scan_body<_Index, _Up, _Tp, _Cp, _Rp, _Sp> __body(__u, __init, __combine, __brick_reduce, __scan);
    auto __range = tbb::blocked_range<_Index>(0, __n);
    tbb::this_task_arena::isolate([__range, &__body]() { tbb::parallel_scan(__range, __body); });
    return __body.sum();
}
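
// Illustrative call (hypothetical __in, __out): an inclusive prefix sum expressed
// through the u / combine / brick-reduce / scan functors; note that scan must
// return the running value, which becomes sum() for the final_scan pass.
//
//     __tbb_backend::__parallel_transform_scan(
//         std::execution::par, __n,
//         [&](_DiffType __i) { return __in[__i]; },                // u
//         _ValueType(0), std::plus<_ValueType>(),                  // init, combine
//         [&](_DiffType __i, _DiffType __j, _ValueType __init)     // brick reduce
//         { return std::accumulate(__in + __i, __in + __j, __init); },
//         [&](_DiffType __i, _DiffType __j, _ValueType __init) {   // scan
//             for (; __i != __j; ++__i)
//                 __out[__i] = __init = __init + __in[__i];
//             return __init;
//         });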

//------------------------------------------------------------------------
// parallel_stable_sort
//------------------------------------------------------------------------

//------------------------------------------------------------------------
// stable_sort utilities
//
// These are used by parallel implementations but do not depend on them.
//------------------------------------------------------------------------
#define _PSTL_MERGE_CUT_OFF 2000

template <typename _Func>
class __func_task;
template <typename _Func>
class __root_task;

#if TBB_INTERFACE_VERSION <= 12000
class __task : public tbb::task
{
  public:
    template <typename _Fn>
    __task*
    make_continuation(_Fn&& __f)
    {
        return new (allocate_continuation()) __func_task<typename std::decay<_Fn>::type>(std::forward<_Fn>(__f));
    }

    template <typename _Fn>
    __task*
    make_child_of(__task* parent, _Fn&& __f)
    {
        return new (parent->allocate_child()) __func_task<typename std::decay<_Fn>::type>(std::forward<_Fn>(__f));
    }

    template <typename _Fn>
    __task*
    make_additional_child_of(tbb::task* parent, _Fn&& __f)
    {
        return new (tbb::task::allocate_additional_child_of(*parent))
            __func_task<typename std::decay<_Fn>::type>(std::forward<_Fn>(__f));
    }

    inline void
    recycle_as_continuation()
    {
        tbb::task::recycle_as_continuation();
    }

    inline void
    recycle_as_child_of(__task* parent)
    {
        tbb::task::recycle_as_child_of(*parent);
    }

    inline void
    spawn(__task* __t)
    {
        tbb::task::spawn(*__t);
    }

    template <typename _Fn>
    static inline void
    spawn_root_and_wait(__root_task<_Fn>& __root)
    {
        tbb::task::spawn_root_and_wait(*__root._M_task);
    }
};

template <typename _Func>
class __func_task : public __task
{
    _Func _M_func;

    tbb::task*
    execute()
    {
        return _M_func(this);
    };

  public:
    template <typename _Fn>
    __func_task(_Fn&& __f) : _M_func{std::forward<_Fn>(__f)}
    {
    }

    _Func&
    body()
    {
        return _M_func;
    }
};

template <typename _Func>
class __root_task
{
    tbb::task* _M_task;

  public:
    template <typename... Args>
    __root_task(Args&&... args)
        : _M_task{new (tbb::task::allocate_root()) __func_task<_Func>{_Func(std::forward<Args>(args)...)}}
    {
    }

    friend class __task;
    friend class __func_task<_Func>;
};

#else  // TBB_INTERFACE_VERSION <= 12000
class __task : public tbb::detail::d1::task
{
  protected:
    tbb::detail::d1::small_object_allocator _M_allocator{};
    tbb::detail::d1::execution_data* _M_execute_data{};
    __task* _M_parent{};
    std::atomic<int> _M_refcount{};
    bool _M_recycle{};

    template <typename _Fn>
    __task*
    allocate_func_task(_Fn&& __f)
    {
        _PSTL_ASSERT(_M_execute_data != nullptr);
        tbb::detail::d1::small_object_allocator __alloc{};
        auto __t =
            __alloc.new_object<__func_task<typename std::decay<_Fn>::type>>(*_M_execute_data, std::forward<_Fn>(__f));
        __t->_M_allocator = __alloc;
        return __t;
    }

  public:
    __task*
    parent()
    {
        return _M_parent;
    }

    void
    set_ref_count(int __n)
    {
        _M_refcount.store(__n, std::memory_order_release);
    }

    template <typename _Fn>
    __task*
    make_continuation(_Fn&& __f)
    {
        auto __t = allocate_func_task(std::forward<_Fn&&>(__f));
        __t->_M_parent = _M_parent;
        _M_parent = nullptr;
        return __t;
    }

    template <typename _Fn>
    __task*
    make_child_of(__task* __parent, _Fn&& __f)
    {
        auto __t = allocate_func_task(std::forward<_Fn&&>(__f));
        __t->_M_parent = __parent;
        return __t;
    }

    template <typename _Fn>
    __task*
    make_additional_child_of(__task* __parent, _Fn&& __f)
    {
        auto __t = make_child_of(__parent, std::forward<_Fn>(__f));
        _PSTL_ASSERT(__parent->_M_refcount.load(std::memory_order_relaxed) > 0);
        ++__parent->_M_refcount;
        return __t;
    }

    inline void
    recycle_as_continuation()
    {
        _M_recycle = true;
    }

    inline void
    recycle_as_child_of(__task* parent)
    {
        _M_recycle = true;
        _M_parent = parent;
    }

    inline void
    spawn(__task* __t)
    {
        _PSTL_ASSERT(_M_execute_data != nullptr);
        tbb::detail::d1::spawn(*__t, *_M_execute_data->context);
    }

    template <typename _Fn>
    static inline void
    spawn_root_and_wait(__root_task<_Fn>& __root)
    {
        tbb::detail::d1::execute_and_wait(*__root._M_func_task, __root._M_context, __root._M_wait_object,
                                          __root._M_context);
    }

    template <typename _Func>
    friend class __func_task;
};

template <typename _Func>
class __func_task : public __task
{
    _Func _M_func;

    __task*
    execute(tbb::detail::d1::execution_data& __ed) override
    {
        _M_execute_data = &__ed;
        _M_recycle = false;
        __task* __next = _M_func(this);
        return finalize(__next);
    };

    __task*
    cancel(tbb::detail::d1::execution_data& __ed) override
    {
        return finalize(nullptr);
    }

    __task*
    finalize(__task* __next)
    {
        bool __recycle = _M_recycle;
        _M_recycle = false;

        if (__recycle)
        {
            return __next;
        }

        auto __parent = _M_parent;
        auto __alloc = _M_allocator;
        auto __ed = _M_execute_data;

        this->~__func_task();

        _PSTL_ASSERT(__parent != nullptr);
        _PSTL_ASSERT(__parent->_M_refcount.load(std::memory_order_relaxed) > 0);
        if (--__parent->_M_refcount == 0)
        {
            _PSTL_ASSERT(__next == nullptr);
            __alloc.deallocate(this, *__ed);
            return __parent;
        }

        return __next;
    }

    friend class __root_task<_Func>;

  public:
    template <typename _Fn>
    __func_task(_Fn&& __f) : _M_func(std::forward<_Fn>(__f))
    {
    }

    _Func&
    body()
    {
        return _M_func;
    }
};

template <typename _Func>
class __root_task : public __task
{
    __task*
    execute(tbb::detail::d1::execution_data& __ed) override
    {
        _M_wait_object.release();
        return nullptr;
    };

    __task*
    cancel(tbb::detail::d1::execution_data& __ed) override
    {
        _M_wait_object.release();
        return nullptr;
    }

    __func_task<_Func>* _M_func_task{};
    tbb::detail::d1::wait_context _M_wait_object{0};
    tbb::task_group_context _M_context{};

  public:
    template <typename... Args>
    __root_task(Args&&... args) : _M_wait_object{1}
    {
        tbb::detail::d1::small_object_allocator __alloc{};
        _M_func_task = __alloc.new_object<__func_task<_Func>>(_Func(std::forward<Args>(args)...));
        _M_func_task->_M_allocator = __alloc;
        _M_func_task->_M_parent = this;
        _M_refcount.store(1, std::memory_order_relaxed);
    }

    friend class __task;
};
#endif // TBB_INTERFACE_VERSION <= 12000
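
// Both implementations above expose the same continuation-passing protocol used by
// the sort and merge tasks below: wrap a functor in a __root_task and start it with
// __task::spawn_root_and_wait; each functor receives its own __task* and returns
// either nullptr (finished) or the next task to execute, after optionally creating
// work via make_continuation / make_child_of / make_additional_child_of and
// recycling itself with recycle_as_continuation / recycle_as_child_of.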

template <typename _RandomAccessIterator1, typename _RandomAccessIterator2, typename _Compare, typename _Cleanup,
          typename _LeafMerge>
class __merge_func
{
    typedef typename std::iterator_traits<_RandomAccessIterator1>::difference_type _DifferenceType1;
    typedef typename std::iterator_traits<_RandomAccessIterator2>::difference_type _DifferenceType2;
    typedef typename std::common_type<_DifferenceType1, _DifferenceType2>::type _SizeType;
    typedef typename std::iterator_traits<_RandomAccessIterator1>::value_type _ValueType;

    _RandomAccessIterator1 _M_x_beg;
    _RandomAccessIterator2 _M_z_beg;

    _SizeType _M_xs, _M_xe;
    _SizeType _M_ys, _M_ye;
    _SizeType _M_zs;
    _Compare _M_comp;
    _LeafMerge _M_leaf_merge;
    _SizeType _M_nsort; // number of elements to be sorted for the partial_sort algorithm

    static const _SizeType __merge_cut_off = _PSTL_MERGE_CUT_OFF;

    bool _root;   // true if this task is the root merging task
    bool _x_orig; // true if the X (left)  subrange is in the original container; false if in the buffer
    bool _y_orig; // true if the Y (right) subrange is in the original container; false if in the buffer
    bool _split;  // true if this merge task is a split task for parallel merging; the execution logic differs

    bool
    is_partial() const
    {
        return _M_nsort > 0;
    }

    struct __move_value
    {
        template <typename Iterator1, typename Iterator2>
        void
        operator()(Iterator1 __x, Iterator2 __z)
        {
            *__z = std::move(*__x);
        }
    };

    struct __move_value_construct
    {
        template <typename Iterator1, typename Iterator2>
        void
        operator()(Iterator1 __x, Iterator2 __z)
        {
            ::new (std::addressof(*__z)) _ValueType(std::move(*__x));
        }
    };

    struct __move_range
    {
        template <typename Iterator1, typename Iterator2>
        Iterator2
        operator()(Iterator1 __first1, Iterator1 __last1, Iterator2 __first2)
        {
            if (__last1 - __first1 < __merge_cut_off)
                return std::move(__first1, __last1, __first2);

            auto __n = __last1 - __first1;
            tbb::parallel_for(tbb::blocked_range<_SizeType>(0, __n, __merge_cut_off),
                              [__first1, __first2](const tbb::blocked_range<_SizeType>& __range) {
                                  std::move(__first1 + __range.begin(), __first1 + __range.end(),
                                            __first2 + __range.begin());
                              });
            return __first2 + __n;
        }
    };

    struct __move_range_construct
    {
        template <typename Iterator1, typename Iterator2>
        Iterator2
        operator()(Iterator1 __first1, Iterator1 __last1, Iterator2 __first2)
        {
            if (__last1 - __first1 < __merge_cut_off)
            {
                for (; __first1 != __last1; ++__first1, ++__first2)
                    __move_value_construct()(__first1, __first2);
                return __first2;
            }

            auto __n = __last1 - __first1;
            tbb::parallel_for(tbb::blocked_range<_SizeType>(0, __n, __merge_cut_off),
                              [__first1, __first2](const tbb::blocked_range<_SizeType>& __range) {
                                  for (auto i = __range.begin(); i != __range.end(); ++i)
                                      __move_value_construct()(__first1 + i, __first2 + i);
                              });
            return __first2 + __n;
        }
    };

    struct __cleanup_range
    {
        template <typename _Iterator>
        void
        operator()(_Iterator __first, _Iterator __last)
        {
            if (__last - __first < __merge_cut_off)
                _Cleanup()(__first, __last);
            else
            {
                auto __n = __last - __first;
                tbb::parallel_for(tbb::blocked_range<_SizeType>(0, __n, __merge_cut_off),
                                  [__first](const tbb::blocked_range<_SizeType>& __range) {
                                      _Cleanup()(__first + __range.begin(), __first + __range.end());
                                  });
            }
        }
    };

  public:
    __merge_func(_SizeType __xs, _SizeType __xe, _SizeType __ys, _SizeType __ye, _SizeType __zs, _Compare __comp,
                 _Cleanup, _LeafMerge __leaf_merge, _SizeType __nsort, _RandomAccessIterator1 __x_beg,
                 _RandomAccessIterator2 __z_beg, bool __x_orig, bool __y_orig, bool __root)
        : _M_xs(__xs), _M_xe(__xe), _M_ys(__ys), _M_ye(__ye), _M_zs(__zs), _M_x_beg(__x_beg), _M_z_beg(__z_beg),
          _M_comp(__comp), _M_leaf_merge(__leaf_merge), _M_nsort(__nsort), _root(__root),
          _x_orig(__x_orig), _y_orig(__y_orig), _split(false)
    {
    }

    bool
    is_left(_SizeType __idx) const
    {
        return _M_xs == __idx;
    }

    template <typename IndexType>
    void
    set_odd(IndexType __idx, bool __on_off)
    {
        if (is_left(__idx))
            _x_orig = __on_off;
        else
            _y_orig = __on_off;
    }

    __task*
    operator()(__task* __self);

  private:
    __merge_func*
    parent_merge(__task* __self) const
    {
        return _root ? nullptr : &static_cast<__func_task<__merge_func>*>(__self->parent())->body();
    }
    bool
    x_less_y()
    {
        const auto __nx = (_M_xe - _M_xs);
        const auto __ny = (_M_ye - _M_ys);
        _PSTL_ASSERT(__nx > 0 && __ny > 0);

        _PSTL_ASSERT(_x_orig == _y_orig);
        _PSTL_ASSERT(!is_partial());

        if (_x_orig)
        {
            _PSTL_ASSERT(std::is_sorted(_M_x_beg + _M_xs, _M_x_beg + _M_xe, _M_comp));
            _PSTL_ASSERT(std::is_sorted(_M_x_beg + _M_ys, _M_x_beg + _M_ye, _M_comp));
            return !_M_comp(*(_M_x_beg + _M_ys), *(_M_x_beg + _M_xe - 1));
        }

        _PSTL_ASSERT(std::is_sorted(_M_z_beg + _M_xs, _M_z_beg + _M_xe, _M_comp));
        _PSTL_ASSERT(std::is_sorted(_M_z_beg + _M_ys, _M_z_beg + _M_ye, _M_comp));
        return !_M_comp(*(_M_z_beg + _M_zs + __nx), *(_M_z_beg + _M_zs + __nx - 1));
    }
    void
    move_x_range()
    {
        const auto __nx = (_M_xe - _M_xs);
        const auto __ny = (_M_ye - _M_ys);
        _PSTL_ASSERT(__nx > 0 && __ny > 0);

        if (_x_orig)
            __move_range_construct()(_M_x_beg + _M_xs, _M_x_beg + _M_xe, _M_z_beg + _M_zs);
        else
        {
            __move_range()(_M_z_beg + _M_zs, _M_z_beg + _M_zs + __nx, _M_x_beg + _M_xs);
            __cleanup_range()(_M_z_beg + _M_zs, _M_z_beg + _M_zs + __nx);
        }

        _x_orig = !_x_orig;
    }
    void
    move_y_range()
    {
        const auto __nx = (_M_xe - _M_xs);
        const auto __ny = (_M_ye - _M_ys);

        if (_y_orig)
            __move_range_construct()(_M_x_beg + _M_ys, _M_x_beg + _M_ye, _M_z_beg + _M_zs + __nx);
        else
        {
            __move_range()(_M_z_beg + _M_zs + __nx, _M_z_beg + _M_zs + __nx + __ny, _M_x_beg + _M_ys);
            __cleanup_range()(_M_z_beg + _M_zs + __nx, _M_z_beg + _M_zs + __nx + __ny);
        }

        _y_orig = !_y_orig;
    }
    __task*
    merge_ranges(__task* __self)
    {
        _PSTL_ASSERT(_x_orig == _y_orig); // the two subranges being merged must lie in the same buffer

        const auto __nx = (_M_xe - _M_xs);
        const auto __ny = (_M_ye - _M_ys);
        const auto __n = __nx + __ny;

        // need to merge {x} and {y}
        if (__n > __merge_cut_off)
            return split_merging(__self);

        //merge to buffer
        if (_x_orig)
        {
            _M_leaf_merge(_M_x_beg + _M_xs, _M_x_beg + _M_xe, _M_x_beg + _M_ys, _M_x_beg + _M_ye, _M_z_beg + _M_zs,
                          _M_comp, __move_value_construct(), __move_value_construct(), __move_range_construct(),
                          __move_range_construct());
            _PSTL_ASSERT(parent_merge(__self)); //not root merging task
        }
        //merge to "origin"
        else
        {
            _PSTL_ASSERT(_x_orig == _y_orig);

            _PSTL_ASSERT(is_partial() || std::is_sorted(_M_z_beg + _M_xs, _M_z_beg + _M_xe, _M_comp));
            _PSTL_ASSERT(is_partial() || std::is_sorted(_M_z_beg + _M_ys, _M_z_beg + _M_ye, _M_comp));

            const auto __nx = (_M_xe - _M_xs);
            const auto __ny = (_M_ye - _M_ys);

            _M_leaf_merge(_M_z_beg + _M_xs, _M_z_beg + _M_xe, _M_z_beg + _M_ys, _M_z_beg + _M_ye, _M_x_beg + _M_zs,
                          _M_comp, __move_value(), __move_value(), __move_range(), __move_range());

            __cleanup_range()(_M_z_beg + _M_xs, _M_z_beg + _M_xe);
            __cleanup_range()(_M_z_beg + _M_ys, _M_z_beg + _M_ye);
        }
        return nullptr;
    }

    __task*
    process_ranges(__task* __self)
    {
        _PSTL_ASSERT(_x_orig == _y_orig);
        _PSTL_ASSERT(!_split);

        auto p = parent_merge(__self);

        if (!p)
        { //root merging task

            // optimization, just for the sort algorithm: {x} <= {y}
            if (!is_partial() && x_less_y()) // we have a solution
            {
                if (!_x_orig)
                {                   //we have to move the solution to the origin
                    move_x_range(); //parallel moving
                    move_y_range(); //parallel moving
                }
                return nullptr;
            }
            //else: if we have data in the origin,
            //we have to move data to the buffer for final merging into the origin.
            if (_x_orig)
            {
                move_x_range(); //parallel moving
                move_y_range(); //parallel moving
            }
            // need to merge {x} and {y}.
            return merge_ranges(__self);
        }
        //else: not root merging task (parent_merge() != nullptr)
        //optimization, just for the sort algorithm: {x} <= {y}
        if (!is_partial() && x_less_y())
        {
            const auto id_range = _M_zs;
            p->set_odd(id_range, _x_orig);
            return nullptr;
        }
        //else: we have to revert "_x(y)_orig" flag of the parent merging task
        const auto id_range = _M_zs;
        p->set_odd(id_range, !_x_orig);

        return merge_ranges(__self);
    }

    // splitting the merge task into two tasks of the same level
    __task*
    split_merging(__task* __self)
    {
        _PSTL_ASSERT(_x_orig == _y_orig);
        const auto __nx = (_M_xe - _M_xs);
        const auto __ny = (_M_ye - _M_ys);

        _SizeType __xm{};
        _SizeType __ym{};
        if (__nx < __ny)
        {
            __ym = _M_ys + __ny / 2;

            if (_x_orig)
                __xm = std::upper_bound(_M_x_beg + _M_xs, _M_x_beg + _M_xe, *(_M_x_beg + __ym), _M_comp) - _M_x_beg;
            else
                __xm = std::upper_bound(_M_z_beg + _M_xs, _M_z_beg + _M_xe, *(_M_z_beg + __ym), _M_comp) - _M_z_beg;
        }
        else
        {
            __xm = _M_xs + __nx / 2;

            if (_y_orig)
                __ym = std::lower_bound(_M_x_beg + _M_ys, _M_x_beg + _M_ye, *(_M_x_beg + __xm), _M_comp) - _M_x_beg;
            else
                __ym = std::lower_bound(_M_z_beg + _M_ys, _M_z_beg + _M_ye, *(_M_z_beg + __xm), _M_comp) - _M_z_beg;
        }

        auto __zm = _M_zs + ((__xm - _M_xs) + (__ym - _M_ys));
        __merge_func __right_func(__xm, _M_xe, __ym, _M_ye, __zm, _M_comp, _Cleanup(), _M_leaf_merge, _M_nsort,
                                  _M_x_beg, _M_z_beg, _x_orig, _y_orig, _root);
        __right_func._split = true;
        auto __merge_task = __self->make_additional_child_of(__self->parent(), std::move(__right_func));
        __self->spawn(__merge_task);
        __self->recycle_as_continuation();

        _M_xe = __xm;
        _M_ye = __ym;
        _split = true;

        return __self;
    }
};

template <typename _RandomAccessIterator1, typename _RandomAccessIterator2, typename __M_Compare, typename _Cleanup,
          typename _LeafMerge>
__task*
__merge_func<_RandomAccessIterator1, _RandomAccessIterator2, __M_Compare, _Cleanup, _LeafMerge>::
operator()(__task* __self)
{
    //a. Split the merge task into two of the same level; special logic that skips
    //processing (process_ranges) of the adjacent sub-ranges x and y.
    if (_split)
        return merge_ranges(__self);

    //b. General merging of adjacent sub-ranges x and y (with optimization in case of {x} <= {y} )

    //1. x and y are in the even buffer
    //2. x and y are in the odd buffer
    if (_x_orig == _y_orig)
        return process_ranges(__self);

    //3. x is in even buffer, y is in the odd buffer
    //4. x is in odd buffer, y is in the even buffer
    if (!parent_merge(__self))
    { //root merge task
        if (_x_orig)
            move_x_range();
        else
            move_y_range();
    }
    else
    {
        const _SizeType __nx = (_M_xe - _M_xs);
        const _SizeType __ny = (_M_ye - _M_ys);
        _PSTL_ASSERT(__nx > 0);
        _PSTL_ASSERT(__ny > 0);

        if (__nx < __ny)
            move_x_range();
        else
            move_y_range();
    }

    return process_ranges(__self);
}

template <typename _RandomAccessIterator1, typename _RandomAccessIterator2, typename _Compare, typename _LeafSort>
class __stable_sort_func
{
  public:
    typedef typename std::iterator_traits<_RandomAccessIterator1>::difference_type _DifferenceType1;
    typedef typename std::iterator_traits<_RandomAccessIterator2>::difference_type _DifferenceType2;
    typedef typename std::common_type<_DifferenceType1, _DifferenceType2>::type _SizeType;

  private:
    _RandomAccessIterator1 _M_xs, _M_xe, _M_x_beg;
    _RandomAccessIterator2 _M_zs, _M_z_beg;
    _Compare _M_comp;
    _LeafSort _M_leaf_sort;
    bool _M_root;
    _SizeType _M_nsort; // zero, or the number of elements to be sorted for the partial_sort algorithm

  public:
    __stable_sort_func(_RandomAccessIterator1 __xs, _RandomAccessIterator1 __xe, _RandomAccessIterator2 __zs,
                       bool __root, _Compare __comp, _LeafSort __leaf_sort, _SizeType __nsort,
                       _RandomAccessIterator1 __x_beg, _RandomAccessIterator2 __z_beg)
        : _M_xs(__xs), _M_xe(__xe), _M_x_beg(__x_beg), _M_zs(__zs), _M_z_beg(__z_beg), _M_comp(__comp),
          _M_leaf_sort(__leaf_sort), _M_root(__root), _M_nsort(__nsort)
    {
    }

    __task*
    operator()(__task* __self);
};

#define _PSTL_STABLE_SORT_CUT_OFF 500

template <typename _RandomAccessIterator1, typename _RandomAccessIterator2, typename _Compare, typename _LeafSort>
__task*
__stable_sort_func<_RandomAccessIterator1, _RandomAccessIterator2, _Compare, _LeafSort>::operator()(__task* __self)
{
    typedef __merge_func<_RandomAccessIterator1, _RandomAccessIterator2, _Compare, __utils::__serial_destroy,
                         __utils::__serial_move_merge>
        _MergeTaskType;

    const _SizeType __n = _M_xe - _M_xs;
    const _SizeType __nmerge = _M_nsort > 0 ? _M_nsort : __n;
    const _SizeType __sort_cut_off = _PSTL_STABLE_SORT_CUT_OFF;
    if (__n <= __sort_cut_off)
    {
        _M_leaf_sort(_M_xs, _M_xe, _M_comp);
        _PSTL_ASSERT(!_M_root);
        return nullptr;
    }

    const _RandomAccessIterator1 __xm = _M_xs + __n / 2;
    const _RandomAccessIterator2 __zm = _M_zs + (__xm - _M_xs);
    const _RandomAccessIterator2 __ze = _M_zs + __n;
    _MergeTaskType __m(_MergeTaskType(_M_xs - _M_x_beg, __xm - _M_x_beg, __xm - _M_x_beg, _M_xe - _M_x_beg,
                                      _M_zs - _M_z_beg, _M_comp, __utils::__serial_destroy(),
                                      __utils::__serial_move_merge(__nmerge), _M_nsort, _M_x_beg, _M_z_beg,
                                      /*x_orig*/ true, /*y_orig*/ true, /*root*/ _M_root));
    auto __parent = __self->make_continuation(std::move(__m));
    __parent->set_ref_count(2);
    auto __right = __self->make_child_of(
        __parent, __stable_sort_func(__xm, _M_xe, __zm, false, _M_comp, _M_leaf_sort, _M_nsort, _M_x_beg, _M_z_beg));
    __self->spawn(__right);
    __self->recycle_as_child_of(__parent);
    _M_root = false;
    _M_xe = __xm;

    return __self;
}

template <class _ExecutionPolicy, typename _RandomAccessIterator, typename _Compare, typename _LeafSort>
void
__parallel_stable_sort(_ExecutionPolicy&&, _RandomAccessIterator __xs, _RandomAccessIterator __xe, _Compare __comp,
                       _LeafSort __leaf_sort, std::size_t __nsort = 0)
{
    tbb::this_task_arena::isolate([=, &__nsort]() {
        //sorting based on task tree and parallel merge
        typedef typename std::iterator_traits<_RandomAccessIterator>::value_type _ValueType;
        typedef typename std::iterator_traits<_RandomAccessIterator>::difference_type _DifferenceType;
        const _DifferenceType __n = __xe - __xs;
        if (__nsort == __n)
            __nsort = 0; // 'partial_sort' becomes 'sort'

        const _DifferenceType __sort_cut_off = _PSTL_STABLE_SORT_CUT_OFF;
        if (__n > __sort_cut_off)
        {
            __buffer<_ValueType> __buf(__n);
            __root_task<__stable_sort_func<_RandomAccessIterator, _ValueType*, _Compare, _LeafSort>> __root{
                __xs, __xe, __buf.get(), true, __comp, __leaf_sort, __nsort, __xs, __buf.get()};
            __task::spawn_root_and_wait(__root);
            return;
        }
        //serial sort
        __leaf_sort(__xs, __xe, __comp);
    });
}
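
// Illustrative top-level call (hypothetical iterators __first/__last): a full
// stable sort leaves __nsort at its default of 0; partial_sort-style callers pass
// the number of leading elements that must end up sorted.
//
//     __tbb_backend::__parallel_stable_sort(
//         std::execution::par, __first, __last, std::less<>(),
//         [](auto __b, auto __e, auto __cmp) { std::stable_sort(__b, __e, __cmp); });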

//------------------------------------------------------------------------
// parallel_merge
//------------------------------------------------------------------------
template <typename _RandomAccessIterator1, typename _RandomAccessIterator2, typename _RandomAccessIterator3,
          typename _Compare, typename _LeafMerge>
class __merge_func_static
{
    _RandomAccessIterator1 _M_xs, _M_xe;
    _RandomAccessIterator2 _M_ys, _M_ye;
    _RandomAccessIterator3 _M_zs;
    _Compare _M_comp;
    _LeafMerge _M_leaf_merge;

  public:
    __merge_func_static(_RandomAccessIterator1 __xs, _RandomAccessIterator1 __xe, _RandomAccessIterator2 __ys,
                        _RandomAccessIterator2 __ye, _RandomAccessIterator3 __zs, _Compare __comp,
                        _LeafMerge __leaf_merge)
        : _M_xs(__xs), _M_xe(__xe), _M_ys(__ys), _M_ye(__ye), _M_zs(__zs), _M_comp(__comp), _M_leaf_merge(__leaf_merge)
    {
    }

    __task*
    operator()(__task* __self);
};

//TODO: consider usage of parallel_for with a custom blocked_range
template <typename _RandomAccessIterator1, typename _RandomAccessIterator2, typename _RandomAccessIterator3,
          typename __M_Compare, typename _LeafMerge>
__task*
__merge_func_static<_RandomAccessIterator1, _RandomAccessIterator2, _RandomAccessIterator3, __M_Compare, _LeafMerge>::
operator()(__task* __self)
{
    typedef typename std::iterator_traits<_RandomAccessIterator1>::difference_type _DifferenceType1;
    typedef typename std::iterator_traits<_RandomAccessIterator2>::difference_type _DifferenceType2;
    typedef typename std::common_type<_DifferenceType1, _DifferenceType2>::type _SizeType;
    const _SizeType __n = (_M_xe - _M_xs) + (_M_ye - _M_ys);
    const _SizeType __merge_cut_off = _PSTL_MERGE_CUT_OFF;
    if (__n <= __merge_cut_off)
    {
        _M_leaf_merge(_M_xs, _M_xe, _M_ys, _M_ye, _M_zs, _M_comp);
        return nullptr;
    }

    _RandomAccessIterator1 __xm;
    _RandomAccessIterator2 __ym;
    if (_M_xe - _M_xs < _M_ye - _M_ys)
    {
        __ym = _M_ys + (_M_ye - _M_ys) / 2;
        __xm = std::upper_bound(_M_xs, _M_xe, *__ym, _M_comp);
    }
    else
    {
        __xm = _M_xs + (_M_xe - _M_xs) / 2;
        __ym = std::lower_bound(_M_ys, _M_ye, *__xm, _M_comp);
    }
    const _RandomAccessIterator3 __zm = _M_zs + ((__xm - _M_xs) + (__ym - _M_ys));
    auto __right = __self->make_additional_child_of(
        __self->parent(), __merge_func_static(__xm, _M_xe, __ym, _M_ye, __zm, _M_comp, _M_leaf_merge));
    __self->spawn(__right);
    __self->recycle_as_continuation();
    _M_xe = __xm;
    _M_ye = __ym;

    return __self;
}

template <class _ExecutionPolicy, typename _RandomAccessIterator1, typename _RandomAccessIterator2,
          typename _RandomAccessIterator3, typename _Compare, typename _LeafMerge>
void
__parallel_merge(_ExecutionPolicy&&, _RandomAccessIterator1 __xs, _RandomAccessIterator1 __xe,
                 _RandomAccessIterator2 __ys, _RandomAccessIterator2 __ye, _RandomAccessIterator3 __zs, _Compare __comp,
                 _LeafMerge __leaf_merge)
{
    typedef typename std::iterator_traits<_RandomAccessIterator1>::difference_type _DifferenceType1;
    typedef typename std::iterator_traits<_RandomAccessIterator2>::difference_type _DifferenceType2;
    typedef typename std::common_type<_DifferenceType1, _DifferenceType2>::type _SizeType;
    const _SizeType __n = (__xe - __xs) + (__ye - __ys);
    const _SizeType __merge_cut_off = _PSTL_MERGE_CUT_OFF;
    if (__n <= __merge_cut_off)
    {
        // Fall back on serial merge
        __leaf_merge(__xs, __xe, __ys, __ye, __zs, __comp);
    }
    else
    {
        tbb::this_task_arena::isolate([=]() {
            typedef __merge_func_static<_RandomAccessIterator1, _RandomAccessIterator2, _RandomAccessIterator3,
                                        _Compare, _LeafMerge>
                _TaskType;
            __root_task<_TaskType> __root{__xs, __xe, __ys, __ye, __zs, __comp, __leaf_merge};
            __task::spawn_root_and_wait(__root);
        });
    }
}
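
// Illustrative top-level call (hypothetical sorted inputs [__xs, __xe) and
// [__ys, __ye), output __zs): merge with a serial std::merge at the leaves.
//
//     __tbb_backend::__parallel_merge(
//         std::execution::par, __xs, __xe, __ys, __ye, __zs, std::less<>(),
//         [](auto __xb, auto __xe2, auto __yb, auto __ye2, auto __o, auto __cmp)
//         { std::merge(__xb, __xe2, __yb, __ye2, __o, __cmp); });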

//------------------------------------------------------------------------
// parallel_invoke
//------------------------------------------------------------------------
template <class _ExecutionPolicy, typename _F1, typename _F2>
void
__parallel_invoke(_ExecutionPolicy&&, _F1&& __f1, _F2&& __f2)
{
    //TODO: a version of tbb::this_task_arena::isolate with variadic arguments pack should be added in the future
    tbb::this_task_arena::isolate([&]() { tbb::parallel_invoke(std::forward<_F1>(__f1), std::forward<_F2>(__f2)); });
}

} // namespace __tbb_backend
} // namespace __pstl

#endif /* _PSTL_PARALLEL_BACKEND_TBB_H */