(root)/
gcc-13.2.0/
libstdc++-v3/
include/
pstl/
unseq_backend_simd.h
       1  // -*- C++ -*-
       2  //===-- unseq_backend_simd.h ----------------------------------------------===//
       3  //
       4  // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
       5  // See https://llvm.org/LICENSE.txt for license information.
       6  // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
       7  //
       8  //===----------------------------------------------------------------------===//
       9  
      10  #ifndef _PSTL_UNSEQ_BACKEND_SIMD_H
      11  #define _PSTL_UNSEQ_BACKEND_SIMD_H
      12  
      13  #include <type_traits>
      14  
      15  #include "utils.h"
      16  
      17  // This header defines the minimum set of vector routines required
      18  // to support parallel STL.
      19  namespace __pstl
      20  {
      21  namespace __unseq_backend
      22  {
      23  
      24  // Expect vector width up to 64 (or 512 bit)
      25  const std::size_t __lane_size = 64;
      26  
      27  template <class _Iterator, class _DifferenceType, class _Function>
      28  _Iterator
      29  __simd_walk_1(_Iterator __first, _DifferenceType __n, _Function __f) noexcept
      30  {
      31      _PSTL_PRAGMA_SIMD
      32      for (_DifferenceType __i = 0; __i < __n; ++__i)
      33          __f(__first[__i]);
      34  
      35      return __first + __n;
      36  }
      37  
      38  template <class _Iterator1, class _DifferenceType, class _Iterator2, class _Function>
      39  _Iterator2
      40  __simd_walk_2(_Iterator1 __first1, _DifferenceType __n, _Iterator2 __first2, _Function __f) noexcept
      41  {
      42      _PSTL_PRAGMA_SIMD
      43      for (_DifferenceType __i = 0; __i < __n; ++__i)
      44          __f(__first1[__i], __first2[__i]);
      45      return __first2 + __n;
      46  }
      47  
      48  template <class _Iterator1, class _DifferenceType, class _Iterator2, class _Iterator3, class _Function>
      49  _Iterator3
      50  __simd_walk_3(_Iterator1 __first1, _DifferenceType __n, _Iterator2 __first2, _Iterator3 __first3,
      51                _Function __f) noexcept
      52  {
      53      _PSTL_PRAGMA_SIMD
      54      for (_DifferenceType __i = 0; __i < __n; ++__i)
      55          __f(__first1[__i], __first2[__i], __first3[__i]);
      56      return __first3 + __n;
      57  }
      58  
      59  // TODO: check whether __simd_first() can be used here
      60  template <class _Index, class _DifferenceType, class _Pred>
      61  bool
      62  __simd_or(_Index __first, _DifferenceType __n, _Pred __pred) noexcept
      63  {
      64  #if _PSTL_EARLYEXIT_PRESENT
      65      _DifferenceType __i;
      66      _PSTL_PRAGMA_VECTOR_UNALIGNED
      67      _PSTL_PRAGMA_SIMD_EARLYEXIT
      68      for (__i = 0; __i < __n; ++__i)
      69          if (__pred(__first[__i]))
      70              break;
      71      return __i < __n;
      72  #else
      73      _DifferenceType __block_size = 4 < __n ? 4 : __n;
      74      const _Index __last = __first + __n;
      75      while (__last != __first)
      76      {
      77          __INT32_TYPE__ __flag = 1;
      78          _PSTL_PRAGMA_SIMD_REDUCTION(& : __flag)
      79          for (_DifferenceType __i = 0; __i < __block_size; ++__i)
      80              if (__pred(*(__first + __i)))
      81                  __flag = 0;
      82          if (!__flag)
      83              return true;
      84  
      85          __first += __block_size;
      86          if (__last - __first >= __block_size << 1)
      87          {
      88              // Double the block _Size.  Any unnecessary iterations can be amortized against work done so far.
      89              __block_size <<= 1;
      90          }
      91          else
      92          {
      93              __block_size = __last - __first;
      94          }
      95      }
      96      return false;
      97  #endif
      98  }
      99  
     100  template <class _Index, class _DifferenceType, class _Compare>
     101  _Index
     102  __simd_first(_Index __first, _DifferenceType __begin, _DifferenceType __end, _Compare __comp) noexcept
     103  {
     104  #if _PSTL_EARLYEXIT_PRESENT
     105      _DifferenceType __i = __begin;
     106      _PSTL_PRAGMA_VECTOR_UNALIGNED // Do not generate peel loop part
     107          _PSTL_PRAGMA_SIMD_EARLYEXIT for (; __i < __end; ++__i)
     108      {
     109          if (__comp(__first, __i))
     110          {
     111              break;
     112          }
     113      }
     114      return __first + __i;
     115  #else
     116      // Experiments show good block sizes like this
     117      const _DifferenceType __block_size = 8;
     118      alignas(__lane_size) _DifferenceType __lane[__block_size] = {0};
     119      while (__end - __begin >= __block_size)
     120      {
     121          _DifferenceType __found = 0;
     122          _PSTL_PRAGMA_VECTOR_UNALIGNED // Do not generate peel loop part
     123              _PSTL_PRAGMA_SIMD_REDUCTION(|
     124                                          : __found) for (_DifferenceType __i = __begin; __i < __begin + __block_size;
     125                                                          ++__i)
     126          {
     127              const _DifferenceType __t = __comp(__first, __i);
     128              __lane[__i - __begin] = __t;
     129              __found |= __t;
     130          }
     131          if (__found)
     132          {
     133              _DifferenceType __i;
     134              // This will vectorize
     135              for (__i = 0; __i < __block_size; ++__i)
     136              {
     137                  if (__lane[__i])
     138                  {
     139                      break;
     140                  }
     141              }
     142              return __first + __begin + __i;
     143          }
     144          __begin += __block_size;
     145      }
     146  
     147      //Keep remainder scalar
     148      while (__begin != __end)
     149      {
     150          if (__comp(__first, __begin))
     151          {
     152              return __first + __begin;
     153          }
     154          ++__begin;
     155      }
     156      return __first + __end;
     157  #endif //_PSTL_EARLYEXIT_PRESENT
     158  }
     159  
     160  template <class _Index1, class _DifferenceType, class _Index2, class _Pred>
     161  std::pair<_Index1, _Index2>
     162  __simd_first(_Index1 __first1, _DifferenceType __n, _Index2 __first2, _Pred __pred) noexcept
     163  {
     164  #if _PSTL_EARLYEXIT_PRESENT
     165      _DifferenceType __i = 0;
     166      _PSTL_PRAGMA_VECTOR_UNALIGNED
     167      _PSTL_PRAGMA_SIMD_EARLYEXIT
     168      for (; __i < __n; ++__i)
     169          if (__pred(__first1[__i], __first2[__i]))
     170              break;
     171      return std::make_pair(__first1 + __i, __first2 + __i);
     172  #else
     173      const _Index1 __last1 = __first1 + __n;
     174      const _Index2 __last2 = __first2 + __n;
     175      // Experiments show good block sizes like this
     176      const _DifferenceType __block_size = 8;
     177      alignas(__lane_size) _DifferenceType __lane[__block_size] = {0};
     178      while (__last1 - __first1 >= __block_size)
     179      {
     180          _DifferenceType __found = 0;
     181          _DifferenceType __i;
     182          _PSTL_PRAGMA_VECTOR_UNALIGNED // Do not generate peel loop part
     183              _PSTL_PRAGMA_SIMD_REDUCTION(|
     184                                          : __found) for (__i = 0; __i < __block_size; ++__i)
     185          {
     186              const _DifferenceType __t = __pred(__first1[__i], __first2[__i]);
     187              __lane[__i] = __t;
     188              __found |= __t;
     189          }
     190          if (__found)
     191          {
     192              _DifferenceType __i2;
     193              // This will vectorize
     194              for (__i2 = 0; __i2 < __block_size; ++__i2)
     195              {
     196                  if (__lane[__i2])
     197                      break;
     198              }
     199              return std::make_pair(__first1 + __i2, __first2 + __i2);
     200          }
     201          __first1 += __block_size;
     202          __first2 += __block_size;
     203      }
     204  
     205      //Keep remainder scalar
     206      for (; __last1 != __first1; ++__first1, ++__first2)
     207          if (__pred(*(__first1), *(__first2)))
     208              return std::make_pair(__first1, __first2);
     209  
     210      return std::make_pair(__last1, __last2);
     211  #endif //_PSTL_EARLYEXIT_PRESENT
     212  }
     213  
     214  template <class _Index, class _DifferenceType, class _Pred>
     215  _DifferenceType
     216  __simd_count(_Index __index, _DifferenceType __n, _Pred __pred) noexcept
     217  {
     218      _DifferenceType __count = 0;
     219      _PSTL_PRAGMA_SIMD_REDUCTION(+ : __count)
     220      for (_DifferenceType __i = 0; __i < __n; ++__i)
     221          if (__pred(*(__index + __i)))
     222              ++__count;
     223  
     224      return __count;
     225  }
     226  
     227  template <class _InputIterator, class _DifferenceType, class _OutputIterator, class _BinaryPredicate>
     228  _OutputIterator
     229  __simd_unique_copy(_InputIterator __first, _DifferenceType __n, _OutputIterator __result,
     230                     _BinaryPredicate __pred) noexcept
     231  {
     232      if (__n == 0)
     233          return __result;
     234  
     235      _DifferenceType __cnt = 1;
     236      __result[0] = __first[0];
     237  
     238      _PSTL_PRAGMA_SIMD
     239      for (_DifferenceType __i = 1; __i < __n; ++__i)
     240      {
     241          _PSTL_PRAGMA_SIMD_ORDERED_MONOTONIC(__cnt : 1)
     242          if (!__pred(__first[__i], __first[__i - 1]))
     243          {
     244              __result[__cnt] = __first[__i];
     245              ++__cnt;
     246          }
     247      }
     248      return __result + __cnt;
     249  }
     250  
     251  template <class _InputIterator, class _DifferenceType, class _OutputIterator, class _Assigner>
     252  _OutputIterator
     253  __simd_assign(_InputIterator __first, _DifferenceType __n, _OutputIterator __result, _Assigner __assigner) noexcept
     254  {
     255      _PSTL_USE_NONTEMPORAL_STORES_IF_ALLOWED
     256      _PSTL_PRAGMA_SIMD
     257      for (_DifferenceType __i = 0; __i < __n; ++__i)
     258          __assigner(__first + __i, __result + __i);
     259      return __result + __n;
     260  }
     261  
     262  template <class _InputIterator, class _DifferenceType, class _OutputIterator, class _UnaryPredicate>
     263  _OutputIterator
     264  __simd_copy_if(_InputIterator __first, _DifferenceType __n, _OutputIterator __result, _UnaryPredicate __pred) noexcept
     265  {
     266      _DifferenceType __cnt = 0;
     267  
     268      _PSTL_PRAGMA_SIMD
     269      for (_DifferenceType __i = 0; __i < __n; ++__i)
     270      {
     271          _PSTL_PRAGMA_SIMD_ORDERED_MONOTONIC(__cnt : 1)
     272          if (__pred(__first[__i]))
     273          {
     274              __result[__cnt] = __first[__i];
     275              ++__cnt;
     276          }
     277      }
     278      return __result + __cnt;
     279  }
     280  
     281  template <class _InputIterator, class _DifferenceType, class _BinaryPredicate>
     282  _DifferenceType
     283  __simd_calc_mask_2(_InputIterator __first, _DifferenceType __n, bool* __mask, _BinaryPredicate __pred) noexcept
     284  {
     285      _DifferenceType __count = 0;
     286  
     287      _PSTL_PRAGMA_SIMD_REDUCTION(+ : __count)
     288      for (_DifferenceType __i = 0; __i < __n; ++__i)
     289      {
     290          __mask[__i] = !__pred(__first[__i], __first[__i - 1]);
     291          __count += __mask[__i];
     292      }
     293      return __count;
     294  }
     295  
     296  template <class _InputIterator, class _DifferenceType, class _UnaryPredicate>
     297  _DifferenceType
     298  __simd_calc_mask_1(_InputIterator __first, _DifferenceType __n, bool* __mask, _UnaryPredicate __pred) noexcept
     299  {
     300      _DifferenceType __count = 0;
     301  
     302      _PSTL_PRAGMA_SIMD_REDUCTION(+ : __count)
     303      for (_DifferenceType __i = 0; __i < __n; ++__i)
     304      {
     305          __mask[__i] = __pred(__first[__i]);
     306          __count += __mask[__i];
     307      }
     308      return __count;
     309  }
     310  
     311  template <class _InputIterator, class _DifferenceType, class _OutputIterator, class _Assigner>
     312  void
     313  __simd_copy_by_mask(_InputIterator __first, _DifferenceType __n, _OutputIterator __result, bool* __mask,
     314                      _Assigner __assigner) noexcept
     315  {
     316      _DifferenceType __cnt = 0;
     317      _PSTL_PRAGMA_SIMD
     318      for (_DifferenceType __i = 0; __i < __n; ++__i)
     319      {
     320          if (__mask[__i])
     321          {
     322              _PSTL_PRAGMA_SIMD_ORDERED_MONOTONIC(__cnt : 1)
     323              {
     324                  __assigner(__first + __i, __result + __cnt);
     325                  ++__cnt;
     326              }
     327          }
     328      }
     329  }
     330  
     331  template <class _InputIterator, class _DifferenceType, class _OutputIterator1, class _OutputIterator2>
     332  void
     333  __simd_partition_by_mask(_InputIterator __first, _DifferenceType __n, _OutputIterator1 __out_true,
     334                           _OutputIterator2 __out_false, bool* __mask) noexcept
     335  {
     336      _DifferenceType __cnt_true = 0, __cnt_false = 0;
     337      _PSTL_PRAGMA_SIMD
     338      for (_DifferenceType __i = 0; __i < __n; ++__i)
     339      {
     340          _PSTL_PRAGMA_SIMD_ORDERED_MONOTONIC_2ARGS(__cnt_true : 1, __cnt_false : 1)
     341          if (__mask[__i])
     342          {
     343              __out_true[__cnt_true] = __first[__i];
     344              ++__cnt_true;
     345          }
     346          else
     347          {
     348              __out_false[__cnt_false] = __first[__i];
     349              ++__cnt_false;
     350          }
     351      }
     352  }
     353  
     354  template <class _Index, class _DifferenceType, class _Tp>
     355  _Index
     356  __simd_fill_n(_Index __first, _DifferenceType __n, const _Tp& __value) noexcept
     357  {
     358      _PSTL_USE_NONTEMPORAL_STORES_IF_ALLOWED
     359      _PSTL_PRAGMA_SIMD
     360      for (_DifferenceType __i = 0; __i < __n; ++__i)
     361          __first[__i] = __value;
     362      return __first + __n;
     363  }
     364  
     365  template <class _Index, class _DifferenceType, class _Generator>
     366  _Index
     367  __simd_generate_n(_Index __first, _DifferenceType __size, _Generator __g) noexcept
     368  {
     369      _PSTL_USE_NONTEMPORAL_STORES_IF_ALLOWED
     370      _PSTL_PRAGMA_SIMD
     371      for (_DifferenceType __i = 0; __i < __size; ++__i)
     372          __first[__i] = __g();
     373      return __first + __size;
     374  }
     375  
     376  template <class _Index, class _BinaryPredicate>
     377  _Index
     378  __simd_adjacent_find(_Index __first, _Index __last, _BinaryPredicate __pred, bool __or_semantic) noexcept
     379  {
     380      if (__last - __first < 2)
     381          return __last;
     382  
     383      typedef typename std::iterator_traits<_Index>::difference_type _DifferenceType;
     384      _DifferenceType __i = 0;
     385  
     386  #if _PSTL_EARLYEXIT_PRESENT
     387      //Some compiler versions fail to compile the following loop when iterators are used. Indices are used instead
     388      const _DifferenceType __n = __last - __first - 1;
     389      _PSTL_PRAGMA_VECTOR_UNALIGNED
     390      _PSTL_PRAGMA_SIMD_EARLYEXIT
     391      for (; __i < __n; ++__i)
     392          if (__pred(__first[__i], __first[__i + 1]))
     393              break;
     394  
     395      return __i < __n ? __first + __i : __last;
     396  #else
     397      // Experiments show good block sizes like this
     398      //TODO: to consider tuning block_size for various data types
     399      const _DifferenceType __block_size = 8;
     400      alignas(__lane_size) _DifferenceType __lane[__block_size] = {0};
     401      while (__last - __first >= __block_size)
     402      {
     403          _DifferenceType __found = 0;
     404          _PSTL_PRAGMA_VECTOR_UNALIGNED // Do not generate peel loop part
     405              _PSTL_PRAGMA_SIMD_REDUCTION(|
     406                                          : __found) for (__i = 0; __i < __block_size - 1; ++__i)
     407          {
     408              //TODO: to improve SIMD vectorization
     409              const _DifferenceType __t = __pred(*(__first + __i), *(__first + __i + 1));
     410              __lane[__i] = __t;
     411              __found |= __t;
     412          }
     413  
     414          //Process a pair of elements on a boundary of a data block
     415          if (__first + __block_size < __last && __pred(*(__first + __i), *(__first + __i + 1)))
     416              __lane[__i] = __found = 1;
     417  
     418          if (__found)
     419          {
     420              if (__or_semantic)
     421                  return __first;
     422  
     423              // This will vectorize
     424              for (__i = 0; __i < __block_size; ++__i)
     425                  if (__lane[__i])
     426                      break;
     427              return __first + __i; //As far as found is true a __result (__lane[__i] is true) is guaranteed
     428          }
     429          __first += __block_size;
     430      }
     431      //Process the rest elements
     432      for (; __last - __first > 1; ++__first)
     433          if (__pred(*__first, *(__first + 1)))
     434              return __first;
     435  
     436      return __last;
     437  #endif
     438  }
     439  
     440  // It was created to reduce the code inside std::enable_if
     441  template <typename _Tp, typename _BinaryOperation>
     442  using is_arithmetic_plus = std::integral_constant<bool, std::is_arithmetic<_Tp>::value &&
     443                                                              std::is_same<_BinaryOperation, std::plus<_Tp>>::value>;
     444  
     445  template <typename _DifferenceType, typename _Tp, typename _BinaryOperation, typename _UnaryOperation>
     446  typename std::enable_if<is_arithmetic_plus<_Tp, _BinaryOperation>::value, _Tp>::type
     447  __simd_transform_reduce(_DifferenceType __n, _Tp __init, _BinaryOperation, _UnaryOperation __f) noexcept
     448  {
     449      _PSTL_PRAGMA_SIMD_REDUCTION(+ : __init)
     450      for (_DifferenceType __i = 0; __i < __n; ++__i)
     451          __init += __f(__i);
     452      return __init;
     453  }
     454  
     455  template <typename _Size, typename _Tp, typename _BinaryOperation, typename _UnaryOperation>
     456  typename std::enable_if<!is_arithmetic_plus<_Tp, _BinaryOperation>::value, _Tp>::type
     457  __simd_transform_reduce(_Size __n, _Tp __init, _BinaryOperation __binary_op, _UnaryOperation __f) noexcept
     458  {
     459      const _Size __block_size = __lane_size / sizeof(_Tp);
     460      if (__n > 2 * __block_size && __block_size > 1)
     461      {
     462          alignas(__lane_size) char __lane_[__lane_size];
     463          _Tp* __lane = reinterpret_cast<_Tp*>(__lane_);
     464  
     465          // initializer
     466          _PSTL_PRAGMA_SIMD
     467          for (_Size __i = 0; __i < __block_size; ++__i)
     468          {
     469              ::new (__lane + __i) _Tp(__binary_op(__f(__i), __f(__block_size + __i)));
     470          }
     471          // main loop
     472          _Size __i = 2 * __block_size;
     473          const _Size last_iteration = __block_size * (__n / __block_size);
     474          for (; __i < last_iteration; __i += __block_size)
     475          {
     476              _PSTL_PRAGMA_SIMD
     477              for (_Size __j = 0; __j < __block_size; ++__j)
     478              {
     479                  __lane[__j] = __binary_op(__lane[__j], __f(__i + __j));
     480              }
     481          }
     482          // remainder
     483          _PSTL_PRAGMA_SIMD
     484          for (_Size __j = 0; __j < __n - last_iteration; ++__j)
     485          {
     486              __lane[__j] = __binary_op(__lane[__j], __f(last_iteration + __j));
     487          }
     488          // combiner
     489          for (_Size __j = 0; __j < __block_size; ++__j)
     490          {
     491              __init = __binary_op(__init, __lane[__j]);
     492          }
     493          // destroyer
     494          _PSTL_PRAGMA_SIMD
     495          for (_Size __j = 0; __j < __block_size; ++__j)
     496          {
     497              __lane[__j].~_Tp();
     498          }
     499      }
     500      else
     501      {
     502          for (_Size __i = 0; __i < __n; ++__i)
     503          {
     504              __init = __binary_op(__init, __f(__i));
     505          }
     506      }
     507      return __init;
     508  }
     509  
     510  // Exclusive scan for "+" and arithmetic types
     511  template <class _InputIterator, class _Size, class _OutputIterator, class _UnaryOperation, class _Tp,
     512            class _BinaryOperation>
     513  typename std::enable_if<is_arithmetic_plus<_Tp, _BinaryOperation>::value, std::pair<_OutputIterator, _Tp>>::type
     514  __simd_scan(_InputIterator __first, _Size __n, _OutputIterator __result, _UnaryOperation __unary_op, _Tp __init,
     515              _BinaryOperation, /*Inclusive*/ std::false_type)
     516  {
     517      _PSTL_PRAGMA_SIMD_SCAN(+ : __init)
     518      for (_Size __i = 0; __i < __n; ++__i)
     519      {
     520          __result[__i] = __init;
     521          _PSTL_PRAGMA_SIMD_EXCLUSIVE_SCAN(__init)
     522          __init += __unary_op(__first[__i]);
     523      }
     524      return std::make_pair(__result + __n, __init);
     525  }
     526  
     527  // As soon as we cannot call __binary_op in "combiner" we create a wrapper over _Tp to encapsulate __binary_op
     528  template <typename _Tp, typename _BinaryOp>
     529  struct _Combiner
     530  {
     531      _Tp __value;
     532      _BinaryOp* __bin_op; // Here is a pointer to function because of default ctor
     533  
     534      _Combiner() : __value{}, __bin_op(nullptr) {}
     535      _Combiner(const _Tp& value, const _BinaryOp* __bin_op) : __value(value), __bin_op(const_cast<_BinaryOp*>(__bin_op)) {}
     536      _Combiner(const _Combiner& __obj) : __value{}, __bin_op(__obj.__bin_op) {}
     537  
     538      void
     539      operator()(const _Combiner& __obj)
     540      {
     541          __value = (*__bin_op)(__value, __obj.__value);
     542      }
     543  };
     544  
     545  // Exclusive scan for other binary operations and types
     546  template <class _InputIterator, class _Size, class _OutputIterator, class _UnaryOperation, class _Tp,
     547            class _BinaryOperation>
     548  typename std::enable_if<!is_arithmetic_plus<_Tp, _BinaryOperation>::value, std::pair<_OutputIterator, _Tp>>::type
     549  __simd_scan(_InputIterator __first, _Size __n, _OutputIterator __result, _UnaryOperation __unary_op, _Tp __init,
     550              _BinaryOperation __binary_op, /*Inclusive*/ std::false_type)
     551  {
     552      typedef _Combiner<_Tp, _BinaryOperation> _CombinerType;
     553      _CombinerType __init_{__init, &__binary_op};
     554  
     555      _PSTL_PRAGMA_DECLARE_REDUCTION(__bin_op, _CombinerType)
     556  
     557      _PSTL_PRAGMA_SIMD_SCAN(__bin_op : __init_)
     558      for (_Size __i = 0; __i < __n; ++__i)
     559      {
     560          __result[__i] = __init_.__value;
     561          _PSTL_PRAGMA_SIMD_EXCLUSIVE_SCAN(__init_)
     562          _PSTL_PRAGMA_FORCEINLINE
     563          __init_.__value = __binary_op(__init_.__value, __unary_op(__first[__i]));
     564      }
     565      return std::make_pair(__result + __n, __init_.__value);
     566  }
     567  
     568  // Inclusive scan for "+" and arithmetic types
     569  template <class _InputIterator, class _Size, class _OutputIterator, class _UnaryOperation, class _Tp,
     570            class _BinaryOperation>
     571  typename std::enable_if<is_arithmetic_plus<_Tp, _BinaryOperation>::value, std::pair<_OutputIterator, _Tp>>::type
     572  __simd_scan(_InputIterator __first, _Size __n, _OutputIterator __result, _UnaryOperation __unary_op, _Tp __init,
     573              _BinaryOperation, /*Inclusive*/ std::true_type)
     574  {
     575      _PSTL_PRAGMA_SIMD_SCAN(+ : __init)
     576      for (_Size __i = 0; __i < __n; ++__i)
     577      {
     578          __init += __unary_op(__first[__i]);
     579          _PSTL_PRAGMA_SIMD_INCLUSIVE_SCAN(__init)
     580          __result[__i] = __init;
     581      }
     582      return std::make_pair(__result + __n, __init);
     583  }
     584  
     585  // Inclusive scan for other binary operations and types
     586  template <class _InputIterator, class _Size, class _OutputIterator, class _UnaryOperation, class _Tp,
     587            class _BinaryOperation>
     588  typename std::enable_if<!is_arithmetic_plus<_Tp, _BinaryOperation>::value, std::pair<_OutputIterator, _Tp>>::type
     589  __simd_scan(_InputIterator __first, _Size __n, _OutputIterator __result, _UnaryOperation __unary_op, _Tp __init,
     590              _BinaryOperation __binary_op, std::true_type)
     591  {
     592      typedef _Combiner<_Tp, _BinaryOperation> _CombinerType;
     593      _CombinerType __init_{__init, &__binary_op};
     594  
     595      _PSTL_PRAGMA_DECLARE_REDUCTION(__bin_op, _CombinerType)
     596  
     597      _PSTL_PRAGMA_SIMD_SCAN(__bin_op : __init_)
     598      for (_Size __i = 0; __i < __n; ++__i)
     599      {
     600          _PSTL_PRAGMA_FORCEINLINE
     601          __init_.__value = __binary_op(__init_.__value, __unary_op(__first[__i]));
     602          _PSTL_PRAGMA_SIMD_INCLUSIVE_SCAN(__init_)
     603          __result[__i] = __init_.__value;
     604      }
     605      return std::make_pair(__result + __n, __init_.__value);
     606  }
     607  
     608  // [restriction] - std::iterator_traits<_ForwardIterator>::value_type should be DefaultConstructible.
     609  // complexity [violation] - We will have at most (__n-1 + number_of_lanes) comparisons instead of at most __n-1.
     610  template <typename _ForwardIterator, typename _Size, typename _Compare>
     611  _ForwardIterator
     612  __simd_min_element(_ForwardIterator __first, _Size __n, _Compare __comp) noexcept
     613  {
     614      if (__n == 0)
     615      {
     616          return __first;
     617      }
     618  
     619      typedef typename std::iterator_traits<_ForwardIterator>::value_type _ValueType;
     620      struct _ComplexType
     621      {
     622          _ValueType __min_val;
     623          _Size __min_ind;
     624          _Compare* __min_comp;
     625  
     626          _ComplexType() : __min_val{}, __min_ind{}, __min_comp(nullptr) {}
     627          _ComplexType(const _ValueType& __val, const _Compare* comp)
     628              : __min_val(__val), __min_ind(0), __min_comp(const_cast<_Compare*>(comp))
     629          {
     630          }
     631          _ComplexType(const _ComplexType& __obj)
     632              : __min_val(__obj.__min_val), __min_ind(__obj.__min_ind), __min_comp(__obj.__min_comp)
     633          {
     634          }
     635  
     636          _PSTL_PRAGMA_DECLARE_SIMD
     637          void
     638          operator()(const _ComplexType& __obj)
     639          {
     640              if (!(*__min_comp)(__min_val, __obj.__min_val) &&
     641                  ((*__min_comp)(__obj.__min_val, __min_val) || __obj.__min_ind - __min_ind < 0))
     642              {
     643                  __min_val = __obj.__min_val;
     644                  __min_ind = __obj.__min_ind;
     645              }
     646          }
     647      };
     648  
     649      _ComplexType __init{*__first, &__comp};
     650  
     651      _PSTL_PRAGMA_DECLARE_REDUCTION(__min_func, _ComplexType)
     652  
     653      _PSTL_PRAGMA_SIMD_REDUCTION(__min_func : __init)
     654      for (_Size __i = 1; __i < __n; ++__i)
     655      {
     656          const _ValueType __min_val = __init.__min_val;
     657          const _ValueType __current = __first[__i];
     658          if (__comp(__current, __min_val))
     659          {
     660              __init.__min_val = __current;
     661              __init.__min_ind = __i;
     662          }
     663      }
     664      return __first + __init.__min_ind;
     665  }
     666  
     667  // [restriction] - std::iterator_traits<_ForwardIterator>::value_type should be DefaultConstructible.
     668  // complexity [violation] - We will have at most (2*(__n-1) + 4*number_of_lanes) comparisons instead of at most [1.5*(__n-1)].
     669  template <typename _ForwardIterator, typename _Size, typename _Compare>
     670  std::pair<_ForwardIterator, _ForwardIterator>
     671  __simd_minmax_element(_ForwardIterator __first, _Size __n, _Compare __comp) noexcept
     672  {
     673      if (__n == 0)
     674      {
     675          return std::make_pair(__first, __first);
     676      }
     677      typedef typename std::iterator_traits<_ForwardIterator>::value_type _ValueType;
     678  
     679      struct _ComplexType
     680      {
     681          _ValueType __min_val;
     682          _ValueType __max_val;
     683          _Size __min_ind;
     684          _Size __max_ind;
     685          _Compare* __minmax_comp;
     686  
     687          _ComplexType() : __min_val{}, __max_val{}, __min_ind{}, __max_ind{}, __minmax_comp(nullptr) {}
     688          _ComplexType(const _ValueType& __min_val, const _ValueType& __max_val, const _Compare* comp)
     689              : __min_val(__min_val), __max_val(__max_val), __min_ind(0), __max_ind(0),
     690                __minmax_comp(const_cast<_Compare*>(comp))
     691          {
     692          }
     693          _ComplexType(const _ComplexType& __obj)
     694              : __min_val(__obj.__min_val), __max_val(__obj.__max_val), __min_ind(__obj.__min_ind),
     695                __max_ind(__obj.__max_ind), __minmax_comp(__obj.__minmax_comp)
     696          {
     697          }
     698  
     699          void
     700          operator()(const _ComplexType& __obj)
     701          {
     702              // min
     703              if ((*__minmax_comp)(__obj.__min_val, __min_val))
     704              {
     705                  __min_val = __obj.__min_val;
     706                  __min_ind = __obj.__min_ind;
     707              }
     708              else if (!(*__minmax_comp)(__min_val, __obj.__min_val))
     709              {
     710                  __min_val = __obj.__min_val;
     711                  __min_ind = (__min_ind - __obj.__min_ind < 0) ? __min_ind : __obj.__min_ind;
     712              }
     713  
     714              // max
     715              if ((*__minmax_comp)(__max_val, __obj.__max_val))
     716              {
     717                  __max_val = __obj.__max_val;
     718                  __max_ind = __obj.__max_ind;
     719              }
     720              else if (!(*__minmax_comp)(__obj.__max_val, __max_val))
     721              {
     722                  __max_val = __obj.__max_val;
     723                  __max_ind = (__max_ind - __obj.__max_ind < 0) ? __obj.__max_ind : __max_ind;
     724              }
     725          }
     726      };
     727  
     728      _ComplexType __init{*__first, *__first, &__comp};
     729  
     730      _PSTL_PRAGMA_DECLARE_REDUCTION(__min_func, _ComplexType);
     731  
     732      _PSTL_PRAGMA_SIMD_REDUCTION(__min_func : __init)
     733      for (_Size __i = 1; __i < __n; ++__i)
     734      {
     735          auto __min_val = __init.__min_val;
     736          auto __max_val = __init.__max_val;
     737          auto __current = __first + __i;
     738          if (__comp(*__current, __min_val))
     739          {
     740              __init.__min_val = *__current;
     741              __init.__min_ind = __i;
     742          }
     743          else if (!__comp(*__current, __max_val))
     744          {
     745              __init.__max_val = *__current;
     746              __init.__max_ind = __i;
     747          }
     748      }
     749      return std::make_pair(__first + __init.__min_ind, __first + __init.__max_ind);
     750  }
     751  
     752  template <class _InputIterator, class _DifferenceType, class _OutputIterator1, class _OutputIterator2,
     753            class _UnaryPredicate>
     754  std::pair<_OutputIterator1, _OutputIterator2>
     755  __simd_partition_copy(_InputIterator __first, _DifferenceType __n, _OutputIterator1 __out_true,
     756                        _OutputIterator2 __out_false, _UnaryPredicate __pred) noexcept
     757  {
     758      _DifferenceType __cnt_true = 0, __cnt_false = 0;
     759  
     760      _PSTL_PRAGMA_SIMD
     761      for (_DifferenceType __i = 0; __i < __n; ++__i)
     762      {
     763          _PSTL_PRAGMA_SIMD_ORDERED_MONOTONIC_2ARGS(__cnt_true : 1, __cnt_false : 1)
     764          if (__pred(__first[__i]))
     765          {
     766              __out_true[__cnt_true] = __first[__i];
     767              ++__cnt_true;
     768          }
     769          else
     770          {
     771              __out_false[__cnt_false] = __first[__i];
     772              ++__cnt_false;
     773          }
     774      }
     775      return std::make_pair(__out_true + __cnt_true, __out_false + __cnt_false);
     776  }
     777  
     778  template <class _ForwardIterator1, class _ForwardIterator2, class _BinaryPredicate>
     779  _ForwardIterator1
     780  __simd_find_first_of(_ForwardIterator1 __first, _ForwardIterator1 __last, _ForwardIterator2 __s_first,
     781                       _ForwardIterator2 __s_last, _BinaryPredicate __pred) noexcept
     782  {
     783      typedef typename std::iterator_traits<_ForwardIterator1>::difference_type _DifferencType;
     784  
     785      const _DifferencType __n1 = __last - __first;
     786      const _DifferencType __n2 = __s_last - __s_first;
     787      if (__n1 == 0 || __n2 == 0)
     788      {
     789          return __last; // according to the standard
     790      }
     791  
     792      // Common case
     793      // If first sequence larger than second then we'll run simd_first with parameters of first sequence.
     794      // Otherwise, vice versa.
     795      if (__n1 < __n2)
     796      {
     797          for (; __first != __last; ++__first)
     798          {
     799              if (__unseq_backend::__simd_or(
     800                      __s_first, __n2,
     801                      __internal::__equal_value_by_pred<decltype(*__first), _BinaryPredicate>(*__first, __pred)))
     802              {
     803                  return __first;
     804              }
     805          }
     806      }
     807      else
     808      {
     809          for (; __s_first != __s_last; ++__s_first)
     810          {
     811              const auto __result = __unseq_backend::__simd_first(
     812                  __first, _DifferencType(0), __n1, [__s_first, &__pred](_ForwardIterator1 __it, _DifferencType __i) {
     813                      return __pred(__it[__i], *__s_first);
     814                  });
     815              if (__result != __last)
     816              {
     817                  return __result;
     818              }
     819          }
     820      }
     821      return __last;
     822  }
     823  
     824  template <class _RandomAccessIterator, class _DifferenceType, class _UnaryPredicate>
     825  _RandomAccessIterator
     826  __simd_remove_if(_RandomAccessIterator __first, _DifferenceType __n, _UnaryPredicate __pred) noexcept
     827  {
     828      // find first element we need to remove
     829      auto __current = __unseq_backend::__simd_first(
     830          __first, _DifferenceType(0), __n,
     831          [&__pred](_RandomAccessIterator __it, _DifferenceType __i) { return __pred(__it[__i]); });
     832      __n -= __current - __first;
     833  
     834      // if we have in sequence only one element that pred(__current[1]) != false we can exit the function
     835      if (__n < 2)
     836      {
     837          return __current;
     838      }
     839  
     840      _DifferenceType __cnt = 0;
     841      _PSTL_PRAGMA_SIMD
     842      for (_DifferenceType __i = 1; __i < __n; ++__i)
     843      {
     844          _PSTL_PRAGMA_SIMD_ORDERED_MONOTONIC(__cnt : 1)
     845          if (!__pred(__current[__i]))
     846          {
     847              __current[__cnt] = std::move(__current[__i]);
     848              ++__cnt;
     849          }
     850      }
     851      return __current + __cnt;
     852  }
     853  } // namespace __unseq_backend
     854  } // namespace __pstl
     855  
     856  #endif /* _PSTL_UNSEQ_BACKEND_SIMD_H */