1  // Optimizations for random number functions, x86 version -*- C++ -*-
       2  
       3  // Copyright (C) 2012-2023 Free Software Foundation, Inc.
       4  //
       5  // This file is part of the GNU ISO C++ Library.  This library is free
       6  // software; you can redistribute it and/or modify it under the
       7  // terms of the GNU General Public License as published by the
       8  // Free Software Foundation; either version 3, or (at your option)
       9  // any later version.
      10  
      11  // This library is distributed in the hope that it will be useful,
      12  // but WITHOUT ANY WARRANTY; without even the implied warranty of
      13  // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      14  // GNU General Public License for more details.
      15  
      16  // Under Section 7 of GPL version 3, you are granted additional
      17  // permissions described in the GCC Runtime Library Exception, version
      18  // 3.1, as published by the Free Software Foundation.
      19  
      20  // You should have received a copy of the GNU General Public License and
      21  // a copy of the GCC Runtime Library Exception along with this program;
      22  // see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
      23  // <http://www.gnu.org/licenses/>.
      24  
      25  /** @file bits/opt_random.h
      26   *  This is an internal header file, included by other library headers.
      27   *  Do not attempt to use it directly. @headername{random}
      28   */
      29  
      30  #ifndef _BITS_OPT_RANDOM_H
      31  #define _BITS_OPT_RANDOM_H 1
      32  
      33  #ifdef __SSE3__
      34  #include <pmmintrin.h>
      35  #endif
      36  
      37  
      38  #pragma GCC system_header
      39  
      40  
      41  namespace std _GLIBCXX_VISIBILITY(default)
      42  {
      43  _GLIBCXX_BEGIN_NAMESPACE_VERSION
      44  
      45  #ifdef __SSE3__
      46    template<>
      47      template<typename _UniformRandomNumberGenerator>
      48        void
      49        normal_distribution<double>::
      50        __generate(typename normal_distribution<double>::result_type* __f,
      51  		 typename normal_distribution<double>::result_type* __t,
      52  		 _UniformRandomNumberGenerator& __urng,
      53  		 const param_type& __param)
      54        {
      55  	typedef uint64_t __uctype;
      56  
      57  	if (__f == __t)
      58  	  return;
      59  
      60  	if (_M_saved_available)
      61  	  {
      62  	    _M_saved_available = false;
      63  	    *__f++ = _M_saved * __param.stddev() + __param.mean();
      64  
      65  	    if (__f == __t)
      66  	      return;
      67  	  }
      68  
      69  	constexpr uint64_t __maskval = 0xfffffffffffffull;
      70  	static const __m128i __mask = _mm_set1_epi64x(__maskval);
      71  	static const __m128i __two = _mm_set1_epi64x(0x4000000000000000ull);
      72  	static const __m128d __three = _mm_set1_pd(3.0);
      73  	const __m128d __av = _mm_set1_pd(__param.mean());
      74  
      75  	const __uctype __urngmin = __urng.min();
      76  	const __uctype __urngmax = __urng.max();
      77  	const __uctype __urngrange = __urngmax - __urngmin;
      78  	const __uctype __uerngrange = __urngrange + 1;
      79  
      80  	while (__f + 1 < __t)
      81  	  {
      82  	    double __le;
      83  	    __m128d __x;
      84  	    do
      85  	      {
      86                  union
      87                  {
      88                    __m128i __i;
      89                    __m128d __d;
      90  		} __v;
      91  
      92  		if (__urngrange > __maskval)
      93  		  {
      94  		    if (__detail::_Power_of_2(__uerngrange))
      95  		      __v.__i = _mm_and_si128(_mm_set_epi64x(__urng(),
      96  							     __urng()),
      97  					      __mask);
      98  		    else
      99  		      {
     100  			const __uctype __uerange = __maskval + 1;
     101  			const __uctype __scaling = __urngrange / __uerange;
     102  			const __uctype __past = __uerange * __scaling;
     103  			uint64_t __v1;
     104  			do
     105  			  __v1 = __uctype(__urng()) - __urngmin;
     106  			while (__v1 >= __past);
     107  			__v1 /= __scaling;
     108  			uint64_t __v2;
     109  			do
     110  			  __v2 = __uctype(__urng()) - __urngmin;
     111  			while (__v2 >= __past);
     112  			__v2 /= __scaling;
     113  
     114  			__v.__i = _mm_set_epi64x(__v1, __v2);
     115  		      }
     116  		  }
     117  		else if (__urngrange == __maskval)
     118  		  __v.__i = _mm_set_epi64x(__urng(), __urng());
     119  		else if ((__urngrange + 2) * __urngrange >= __maskval
     120  			 && __detail::_Power_of_2(__uerngrange))
     121  		  {
     122  		    uint64_t __v1 = __urng() * __uerngrange + __urng();
     123  		    uint64_t __v2 = __urng() * __uerngrange + __urng();
     124  
     125  		    __v.__i = _mm_and_si128(_mm_set_epi64x(__v1, __v2),
     126  					    __mask);
     127  		  }
     128  		else
     129  		  {
     130  		    size_t __nrng = 2;
     131  		    __uctype __high = __maskval / __uerngrange / __uerngrange;
     132  		    while (__high > __uerngrange)
     133  		      {
     134  			++__nrng;
     135  			__high /= __uerngrange;
     136  		      }
     137  		    const __uctype __highrange = __high + 1;
     138  		    const __uctype __scaling = __urngrange / __highrange;
     139  		    const __uctype __past = __highrange * __scaling;
     140  		    __uctype __tmp;
     141  
     142  		    uint64_t __v1;
     143  		    do
     144  		      {
     145  			do
     146  			  __tmp = __uctype(__urng()) - __urngmin;
     147  			while (__tmp >= __past);
     148  			__v1 = __tmp / __scaling;
     149  			for (size_t __cnt = 0; __cnt < __nrng; ++__cnt)
     150  			  {
     151  			    __tmp = __v1;
     152  			    __v1 *= __uerngrange;
     153  			    __v1 += __uctype(__urng()) - __urngmin;
     154  			  }
     155  		      }
     156  		    while (__v1 > __maskval || __v1 < __tmp);
     157  
     158  		    uint64_t __v2;
     159  		    do
     160  		      {
     161  			do
     162  			  __tmp = __uctype(__urng()) - __urngmin;
     163  			while (__tmp >= __past);
     164  			__v2 = __tmp / __scaling;
     165  			for (size_t __cnt = 0; __cnt < __nrng; ++__cnt)
     166  			  {
     167  			    __tmp = __v2;
     168  			    __v2 *= __uerngrange;
     169  			    __v2 += __uctype(__urng()) - __urngmin;
     170  			  }
     171  		      }
     172  		    while (__v2 > __maskval || __v2 < __tmp);
     173  
     174  		    __v.__i = _mm_set_epi64x(__v1, __v2);
     175  		  }
     176  
     177  		__v.__i = _mm_or_si128(__v.__i, __two);
     178  		__x = _mm_sub_pd(__v.__d, __three);
     179  		__m128d __m = _mm_mul_pd(__x, __x);
     180  		__le = _mm_cvtsd_f64(_mm_hadd_pd (__m, __m));
     181                }
     182              while (__le == 0.0 || __le >= 1.0);
     183  
     184              double __mult = (std::sqrt(-2.0 * std::log(__le) / __le)
     185                               * __param.stddev());
     186  
     187              __x = _mm_add_pd(_mm_mul_pd(__x, _mm_set1_pd(__mult)), __av);
     188  
     189              _mm_storeu_pd(__f, __x);
     190              __f += 2;
     191            }
     192  
     193          if (__f != __t)
     194            {
     195              result_type __x, __y, __r2;
     196  
     197              __detail::_Adaptor<_UniformRandomNumberGenerator, result_type>
     198                __aurng(__urng);
     199  
     200              do
     201                {
     202                  __x = result_type(2.0) * __aurng() - 1.0;
     203                  __y = result_type(2.0) * __aurng() - 1.0;
     204                  __r2 = __x * __x + __y * __y;
     205                }
     206              while (__r2 > 1.0 || __r2 == 0.0);
     207  
     208              const result_type __mult = std::sqrt(-2 * std::log(__r2) / __r2);
     209              _M_saved = __x * __mult;
     210              _M_saved_available = true;
     211              *__f = __y * __mult * __param.stddev() + __param.mean();
     212            }
     213        }
     214  #endif
     215  
     216  
     217  _GLIBCXX_END_NAMESPACE_VERSION
     218  } // namespace
     219  
     220  
     221  #endif // _BITS_OPT_RANDOM_H