(root)/
gcc-13.2.0/
libstdc++-v3/
include/
parallel/
settings.h
       1  // -*- C++ -*-
       2  
       3  // Copyright (C) 2007-2023 Free Software Foundation, Inc.
       4  //
       5  // This file is part of the GNU ISO C++ Library.  This library is free
       6  // software; you can redistribute it and/or modify it under the terms
       7  // of the GNU General Public License as published by the Free Software
       8  // Foundation; either version 3, or (at your option) any later
       9  // version.
      10  
      11  // This library is distributed in the hope that it will be useful, but
      12  // WITHOUT ANY WARRANTY; without even the implied warranty of
      13  // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
      14  // General Public License for more details.
      15  
      16  // Under Section 7 of GPL version 3, you are granted additional
      17  // permissions described in the GCC Runtime Library Exception, version
      18  // 3.1, as published by the Free Software Foundation.
      19  
      20  // You should have received a copy of the GNU General Public License and
      21  // a copy of the GCC Runtime Library Exception along with this program;
      22  // see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
      23  // <http://www.gnu.org/licenses/>.
      24  
      25  /** @file parallel/settings.h
      26   *  @brief Runtime settings and tuning parameters, heuristics to decide
      27   *  whether to use parallelized algorithms.
      28   *
      29   *  This file is a GNU parallel extension to the Standard C++ Library.
      30   *
      31   *  @section parallelization_decision Deciding whether to run an algorithm in parallel.
      32   *
      33   *  There are several ways the user can switch on and off the parallel
      34   *  execution of an algorithm, both at compile- and run-time.
      35   *
      36   *  Only sequential execution can be forced at compile-time.  This
      37   *  reduces code size and protects code parts that have
      38   *  non-thread-safe side effects.
      39   *
      40   *  Ultimately, forcing parallel execution at compile-time makes
      41   *  sense.  Often, the sequential algorithm implementation is used as
      42   *  a subroutine, so no reduction in code size can be achieved.  Also,
      43   *  the machine the program is run on might have only one processor
      44   *  core, so to avoid overhead, the algorithm is executed
      45   *  sequentially.
      46   *
      47   *  To force sequential execution of an algorithm ultimately at
      48   *  compile-time, the user must add the tag
      49  *  gnu_parallel::sequential_tag() to the end of the parameter list,
      50   *  e. g.
      51   *
      52   *  \code
      53   *  std::sort(__v.begin(), __v.end(), __gnu_parallel::sequential_tag());
      54   *  \endcode
      55   *
      56   *  This is compatible with all overloaded algorithm variants.  No
      57   *  additional code will be instantiated, at all.  The same holds for
      58   *  most algorithm calls with iterators not providing random access.
      59   *
      60   *  If the algorithm call is not forced to be executed sequentially
      61   *  at compile-time, the decision is made at run-time.
      62   *  The global variable __gnu_parallel::_Settings::algorithm_strategy
      63   *  is checked. It is a tristate variable corresponding to:
      64   *    - a. force_sequential, meaning the sequential algorithm is executed.
      65   *    - b. force_parallel, meaning the parallel algorithm is executed.
      66   *    - c. heuristic
      67   *
      68   *  For heuristic, the parallel algorithm implementation is called
      69   *  only if the input size is sufficiently large.  For most
      70   *  algorithms, the input size is the (combined) length of the input
      71   *  sequence(__s).  The threshold can be set by the user, individually
      72   *  for each algorithm.  The according variables are called
      73   *  gnu_parallel::_Settings::[algorithm]_minimal_n .
      74   *
      75   *  For some of the algorithms, there are even more tuning options,
      76   *  e. g. the ability to choose from multiple algorithm variants.  See
      77   *  below for details.
      78   */
      79  
      80  // Written by Johannes Singler and Felix Putze.
      81  
      82  #ifndef _GLIBCXX_PARALLEL_SETTINGS_H
      83  #define _GLIBCXX_PARALLEL_SETTINGS_H 1
      84  
      85  #include <parallel/types.h>
      86  
      87  /** 
      88    * @brief Determine at compile(?)-time if the parallel variant of an
      89    * algorithm should be called.
      90    * @param __c A condition that is convertible to bool that is overruled by
      91    * __gnu_parallel::_Settings::algorithm_strategy. Usually a decision
      92    * based on the input size.
      93    */
      94  #define _GLIBCXX_PARALLEL_CONDITION(__c) \
      95    (__gnu_parallel::_Settings::get().algorithm_strategy \
      96      != __gnu_parallel::force_sequential \
      97    && ((__gnu_parallel::__get_max_threads() > 1 && (__c)) \
      98       || __gnu_parallel::_Settings::get().algorithm_strategy \
      99          == __gnu_parallel::force_parallel))
     100  
     101  /*
     102  inline bool
     103  parallel_condition(bool __c)
     104  {
     105    bool ret = false;
     106    const _Settings& __s = _Settings::get();
     107    if (__s.algorithm_strategy != force_seqential)
     108      {
     109        if (__s.algorithm_strategy == force_parallel)
     110          ret = true;
     111        else
     112          ret = __get_max_threads() > 1 && __c;
     113      }
     114    return ret;
     115  }
     116  */
     117  
     118  namespace __gnu_parallel
     119  {
     120    /// class _Settings
     121    /// Run-time settings for the parallel mode including all tunable parameters.
     122    struct _Settings
     123    {
     124      _AlgorithmStrategy          algorithm_strategy;
     125      
     126      _SortAlgorithm              sort_algorithm;
     127      _PartialSumAlgorithm        partial_sum_algorithm;
     128      _MultiwayMergeAlgorithm     multiway_merge_algorithm;
     129      _FindAlgorithm              find_algorithm;
     130  
     131      _SplittingAlgorithm         sort_splitting;
     132      _SplittingAlgorithm         merge_splitting;
     133      _SplittingAlgorithm         multiway_merge_splitting;
     134  
     135      // Per-algorithm settings.
     136  
     137      /// Minimal input size for accumulate.
     138      _SequenceIndex              accumulate_minimal_n;
     139  
     140      /// Minimal input size for adjacent_difference.
     141      unsigned int                adjacent_difference_minimal_n;
     142  
     143      /// Minimal input size for count and count_if.
     144      _SequenceIndex              count_minimal_n;
     145  
     146      /// Minimal input size for fill.
     147      _SequenceIndex              fill_minimal_n;
     148  
     149      /// Block size increase factor for find.
     150      double                      find_increasing_factor;
     151  
     152      /// Initial block size for find.
     153      _SequenceIndex              find_initial_block_size;
     154  
     155      /// Maximal block size for find.
     156      _SequenceIndex              find_maximum_block_size;
     157  
     158      /// Start with looking for this many elements sequentially, for find.
     159      _SequenceIndex              find_sequential_search_size;
     160  
     161      /// Minimal input size for for_each.
     162      _SequenceIndex              for_each_minimal_n;
     163  
     164      /// Minimal input size for generate.
     165      _SequenceIndex              generate_minimal_n;
     166  
     167      /// Minimal input size for max_element.
     168      _SequenceIndex              max_element_minimal_n;
     169  
     170      /// Minimal input size for merge.
     171      _SequenceIndex              merge_minimal_n;
     172  
     173      /// Oversampling factor for merge.
     174      unsigned int                merge_oversampling;
     175  
     176      /// Minimal input size for min_element.
     177      _SequenceIndex              min_element_minimal_n;
     178  
     179      /// Minimal input size for multiway_merge.
     180      _SequenceIndex              multiway_merge_minimal_n;
     181  
     182      /// Oversampling factor for multiway_merge.
     183      int                         multiway_merge_minimal_k;
     184  
     185      /// Oversampling factor for multiway_merge.
     186      unsigned int                multiway_merge_oversampling;
     187  
     188      /// Minimal input size for nth_element.
     189      _SequenceIndex              nth_element_minimal_n;
     190  
     191      /// Chunk size for partition.
     192      _SequenceIndex              partition_chunk_size;
     193  
     194      /// Chunk size for partition, relative to input size.  If > 0.0,
     195      /// this value overrides partition_chunk_size.
     196      double                      partition_chunk_share;
     197  
     198      /// Minimal input size for partition.
     199      _SequenceIndex              partition_minimal_n;
     200  
     201      /// Minimal input size for partial_sort.
     202      _SequenceIndex              partial_sort_minimal_n;
     203  
     204      /// Ratio for partial_sum. Assume "sum and write result" to be
     205      /// this factor slower than just "sum".
     206      float                       partial_sum_dilation;
     207  
     208      /// Minimal input size for partial_sum.
     209      unsigned int                partial_sum_minimal_n;
     210  
     211      /// Minimal input size for random_shuffle.
     212      unsigned int                random_shuffle_minimal_n;
     213  
     214      /// Minimal input size for replace and replace_if.
     215      _SequenceIndex              replace_minimal_n;
     216  
     217      /// Minimal input size for set_difference.
     218      _SequenceIndex              set_difference_minimal_n;
     219  
     220      /// Minimal input size for set_intersection.
     221      _SequenceIndex              set_intersection_minimal_n;
     222  
     223      /// Minimal input size for set_symmetric_difference.
     224      _SequenceIndex              set_symmetric_difference_minimal_n;
     225  
     226      /// Minimal input size for set_union.
     227      _SequenceIndex              set_union_minimal_n;
     228  
     229      /// Minimal input size for parallel sorting.
     230      _SequenceIndex              sort_minimal_n;
     231  
     232      /// Oversampling factor for parallel std::sort (MWMS).
     233      unsigned int                sort_mwms_oversampling;
     234  
     235      /// Such many samples to take to find a good pivot (quicksort).
     236      unsigned int                sort_qs_num_samples_preset;
     237  
     238      /// Maximal subsequence __length to switch to unbalanced __base case.
     239      /// Applies to std::sort with dynamically load-balanced quicksort.
     240      _SequenceIndex              sort_qsb_base_case_maximal_n;
     241  
     242      /// Minimal input size for parallel std::transform.
     243      _SequenceIndex              transform_minimal_n;
     244  
     245      /// Minimal input size for unique_copy. 
     246      _SequenceIndex              unique_copy_minimal_n;
     247  
     248      _SequenceIndex              workstealing_chunk_size;
     249  
     250      // Hardware dependent tuning parameters.
     251  
     252      /// size of the L1 cache in bytes (underestimation).
     253      unsigned long long          L1_cache_size;
     254  
     255      /// size of the L2 cache in bytes (underestimation).
     256      unsigned long long          L2_cache_size;
     257  
     258      /// size of the Translation Lookaside Buffer (underestimation).
     259      unsigned int                TLB_size;
     260  
     261      /// Overestimation of cache line size.  Used to avoid false
     262      /// sharing, i.e. elements of different threads are at least this
     263      /// amount apart.
     264      unsigned int                cache_line_size;
     265  
     266      // Statistics.
     267  
     268      /// The number of stolen ranges in load-balanced quicksort.
     269      _SequenceIndex              qsb_steals;
     270  
     271      /// Minimal input size for search and search_n.
     272      _SequenceIndex              search_minimal_n;
     273  
     274      /// Block size scale-down factor with respect to current position.
     275      float                       find_scale_factor;
     276  
     277      /// Get the global settings.
     278      _GLIBCXX_CONST static const _Settings&
     279      get() throw();
     280  
     281      /// Set the global settings.
     282      static void
     283      set(_Settings&) throw();
     284  
     285      explicit 
     286      _Settings() :
     287              algorithm_strategy(heuristic),
     288              sort_algorithm(MWMS),
     289              partial_sum_algorithm(LINEAR),
     290              multiway_merge_algorithm(LOSER_TREE),
     291              find_algorithm(CONSTANT_SIZE_BLOCKS),
     292              sort_splitting(EXACT),
     293              merge_splitting(EXACT),
     294              multiway_merge_splitting(EXACT),
     295              accumulate_minimal_n(1000),
     296              adjacent_difference_minimal_n(1000),
     297              count_minimal_n(1000),
     298              fill_minimal_n(1000),
     299              find_increasing_factor(2.0),
     300              find_initial_block_size(256),
     301              find_maximum_block_size(8192),
     302              find_sequential_search_size(256),
     303              for_each_minimal_n(1000),
     304              generate_minimal_n(1000),
     305              max_element_minimal_n(1000),
     306              merge_minimal_n(1000),
     307              merge_oversampling(10),
     308              min_element_minimal_n(1000),
     309              multiway_merge_minimal_n(1000),
     310              multiway_merge_minimal_k(2), multiway_merge_oversampling(10),
     311              nth_element_minimal_n(1000),
     312              partition_chunk_size(1000),
     313              partition_chunk_share(0.0),
     314              partition_minimal_n(1000),
     315              partial_sort_minimal_n(1000),
     316              partial_sum_dilation(1.0f),
     317              partial_sum_minimal_n(1000),
     318              random_shuffle_minimal_n(1000),
     319              replace_minimal_n(1000),
     320              set_difference_minimal_n(1000),
     321              set_intersection_minimal_n(1000),
     322              set_symmetric_difference_minimal_n(1000),
     323              set_union_minimal_n(1000),
     324              sort_minimal_n(1000),
     325              sort_mwms_oversampling(10),
     326              sort_qs_num_samples_preset(100),
     327              sort_qsb_base_case_maximal_n(100),
     328              transform_minimal_n(1000),
     329              unique_copy_minimal_n(10000),
     330              workstealing_chunk_size(100),
     331              L1_cache_size(16 << 10),
     332              L2_cache_size(256 << 10),
     333              TLB_size(128),
     334              cache_line_size(64),
     335              qsb_steals(0),
     336              search_minimal_n(1000),
     337              find_scale_factor(0.01f)
     338      { }
     339    };
     340  }
     341  
     342  #endif /* _GLIBCXX_PARALLEL_SETTINGS_H */