/* Copyright (C) 2003-2023 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0.  */

#ifndef _EMMINTRIN_H_INCLUDED
#define _EMMINTRIN_H_INCLUDED

/* We need definitions from the SSE header files.  */
#include <xmmintrin.h>

#ifndef __SSE2__
#pragma GCC push_options
#pragma GCC target("sse2")
#define __DISABLE_SSE2__
#endif /* __SSE2__ */

/* SSE2 */
typedef double __v2df __attribute__ ((__vector_size__ (16)));
typedef long long __v2di __attribute__ ((__vector_size__ (16)));
typedef unsigned long long __v2du __attribute__ ((__vector_size__ (16)));
typedef int __v4si __attribute__ ((__vector_size__ (16)));
typedef unsigned int __v4su __attribute__ ((__vector_size__ (16)));
typedef short __v8hi __attribute__ ((__vector_size__ (16)));
typedef unsigned short __v8hu __attribute__ ((__vector_size__ (16)));
typedef char __v16qi __attribute__ ((__vector_size__ (16)));
typedef signed char __v16qs __attribute__ ((__vector_size__ (16)));
typedef unsigned char __v16qu __attribute__ ((__vector_size__ (16)));

/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  */
typedef long long __m128i __attribute__ ((__vector_size__ (16), __may_alias__));
typedef double __m128d __attribute__ ((__vector_size__ (16), __may_alias__));

/* Unaligned versions of the same types.  */
typedef long long __m128i_u __attribute__ ((__vector_size__ (16), __may_alias__, __aligned__ (1)));
typedef double __m128d_u __attribute__ ((__vector_size__ (16), __may_alias__, __aligned__ (1)));

/* Create a selector for use with the SHUFPD instruction.  */
#define _MM_SHUFFLE2(fp1,fp0) \
 (((fp1) << 1) | (fp0))
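
/* Editorial note (not in the original header): _MM_SHUFFLE2 (fp1, fp0)
   packs two one-bit selectors.  Bit 0 (fp0) chooses which element of the
   first shuffle operand becomes result element 0; bit 1 (fp1) chooses
   which element of the second operand becomes result element 1.  For
   example, _MM_SHUFFLE2 (0, 1) == (0 << 1) | 1 == 1, which _mm_loadr_pd
   and _mm_storer_pd below apply with identical operands to swap the two
   elements of a vector.  */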

/* Create a vector with element 0 as F and the rest zero.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_sd (double __F)
{
  return __extension__ (__m128d){ __F, 0.0 };
}

/* Create a vector with both elements equal to F.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_pd (double __F)
{
  return __extension__ (__m128d){ __F, __F };
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pd1 (double __F)
{
  return _mm_set1_pd (__F);
}

/* Create a vector with the lower value X and upper value W.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pd (double __W, double __X)
{
  return __extension__ (__m128d){ __X, __W };
}

/* Create a vector with the lower value W and upper value X.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_pd (double __W, double __X)
{
  return __extension__ (__m128d){ __W, __X };
}
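
/* Editorial sketch (not part of the original header; compiled out): shows
   the element order produced by the two constructors above.  The function
   name is hypothetical.  */
#if 0
static double
__set_pd_order_example (void)
{
  __m128d __a = _mm_set_pd (2.0, 1.0);   /* element 0 = 1.0, element 1 = 2.0 */
  __m128d __b = _mm_setr_pd (1.0, 2.0);  /* same layout, arguments reversed  */
  /* Both vectors are {1.0, 2.0}; the sum below is 1.0 + 2.0 == 3.0.  */
  return ((__v2df)__a)[0] + ((__v2df)__b)[1];
}
#endif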

/* Create an undefined vector.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_undefined_pd (void)
{
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Winit-self"
  __m128d __Y = __Y;
#pragma GCC diagnostic pop
  return __Y;
}

/* Create a vector of zeros.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setzero_pd (void)
{
  return __extension__ (__m128d){ 0.0, 0.0 };
}

/* Sets the low DPFP value of A from the low value of B.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_move_sd (__m128d __A, __m128d __B)
{
  return __extension__ (__m128d) __builtin_shuffle ((__v2df)__A, (__v2df)__B, (__v2di){2, 1});
}

/* Load two DPFP values from P.  The address must be 16-byte aligned.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_pd (double const *__P)
{
  return *(__m128d *)__P;
}

/* Load two DPFP values from P.  The address need not be 16-byte aligned.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadu_pd (double const *__P)
{
  return *(__m128d_u *)__P;
}

/* Create a vector with both elements equal to *P.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load1_pd (double const *__P)
{
  return _mm_set1_pd (*__P);
}

/* Create a vector with element 0 as *P and the rest zero.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_sd (double const *__P)
{
  return _mm_set_sd (*__P);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_pd1 (double const *__P)
{
  return _mm_load1_pd (__P);
}

/* Load two DPFP values in reverse order.  The address must be aligned.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadr_pd (double const *__P)
{
  __m128d __tmp = _mm_load_pd (__P);
  return __builtin_ia32_shufpd (__tmp, __tmp, _MM_SHUFFLE2 (0,1));
}

/* Store two DPFP values.  The address must be 16-byte aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_pd (double *__P, __m128d __A)
{
  *(__m128d *)__P = __A;
}

/* Store two DPFP values.  The address need not be 16-byte aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storeu_pd (double *__P, __m128d __A)
{
  *(__m128d_u *)__P = __A;
}
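
/* Editorial sketch (not part of the original header; compiled out):
   round-trips two doubles through the unaligned load/store pair above.
   The function name is hypothetical.  */
#if 0
static void
__loadu_storeu_example (double *__dst, double const *__src)
{
  /* Neither pointer needs 16-byte alignment for the _u variants.  */
  __m128d __v = _mm_loadu_pd (__src);
  _mm_storeu_pd (__dst, __v);
}
#endif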

/* Stores the lower DPFP value.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_sd (double *__P, __m128d __A)
{
  *__P = ((__v2df)__A)[0];
}

extern __inline double __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsd_f64 (__m128d __A)
{
  return ((__v2df)__A)[0];
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storel_pd (double *__P, __m128d __A)
{
  _mm_store_sd (__P, __A);
}

/* Stores the upper DPFP value.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storeh_pd (double *__P, __m128d __A)
{
  *__P = ((__v2df)__A)[1];
}

/* Store the lower DPFP value in both elements of the destination.
   The address must be 16-byte aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store1_pd (double *__P, __m128d __A)
{
  _mm_store_pd (__P, __builtin_ia32_shufpd (__A, __A, _MM_SHUFFLE2 (0,0)));
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_pd1 (double *__P, __m128d __A)
{
  _mm_store1_pd (__P, __A);
}

/* Store two DPFP values in reverse order.  The address must be aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storer_pd (double *__P, __m128d __A)
{
  _mm_store_pd (__P, __builtin_ia32_shufpd (__A, __A, _MM_SHUFFLE2 (0,1)));
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi128_si32 (__m128i __A)
{
  return __builtin_ia32_vec_ext_v4si ((__v4si)__A, 0);
}

#ifdef __x86_64__
/* Intel intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi128_si64 (__m128i __A)
{
  return ((__v2di)__A)[0];
}

/* Microsoft intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi128_si64x (__m128i __A)
{
  return ((__v2di)__A)[0];
}
#endif

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pd (__m128d __A, __m128d __B)
{
  return (__m128d) ((__v2df)__A + (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_sd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_addsd ((__v2df)__A, (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pd (__m128d __A, __m128d __B)
{
  return (__m128d) ((__v2df)__A - (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_sd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_subsd ((__v2df)__A, (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_pd (__m128d __A, __m128d __B)
{
  return (__m128d) ((__v2df)__A * (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_sd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_mulsd ((__v2df)__A, (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_div_pd (__m128d __A, __m128d __B)
{
  return (__m128d) ((__v2df)__A / (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_div_sd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_divsd ((__v2df)__A, (__v2df)__B);
}
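
/* Editorial sketch (not part of the original header; compiled out): the
   packed (_pd) forms above operate on both elements, while the scalar
   (_sd) forms combine only element 0 and copy element 1 from the first
   operand.  The function name is hypothetical.  */
#if 0
static double
__add_sd_example (void)
{
  __m128d __a = _mm_set_pd (10.0, 1.0);  /* {1.0, 10.0} */
  __m128d __b = _mm_set_pd (20.0, 2.0);  /* {2.0, 20.0} */
  __m128d __r = _mm_add_sd (__a, __b);   /* {3.0, 10.0}: upper element from __a */
  return ((__v2df)__r)[1];               /* 10.0 */
}
#endif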

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sqrt_pd (__m128d __A)
{
  return (__m128d)__builtin_ia32_sqrtpd ((__v2df)__A);
}

/* Return pair {sqrt (B[0]), A[1]}.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sqrt_sd (__m128d __A, __m128d __B)
{
  __v2df __tmp = __builtin_ia32_movsd ((__v2df)__A, (__v2df)__B);
  return (__m128d)__builtin_ia32_sqrtsd ((__v2df)__tmp);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_minpd ((__v2df)__A, (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_sd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_minsd ((__v2df)__A, (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_maxpd ((__v2df)__A, (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_sd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_maxsd ((__v2df)__A, (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_and_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_andpd ((__v2df)__A, (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_andnot_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_andnpd ((__v2df)__A, (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_or_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_orpd ((__v2df)__A, (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_xor_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_xorpd ((__v2df)__A, (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpeqpd ((__v2df)__A, (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpltpd ((__v2df)__A, (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmple_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmplepd ((__v2df)__A, (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpgtpd ((__v2df)__A, (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpge_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpgepd ((__v2df)__A, (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpneq_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpneqpd ((__v2df)__A, (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnlt_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpnltpd ((__v2df)__A, (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnle_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpnlepd ((__v2df)__A, (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpngt_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpngtpd ((__v2df)__A, (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnge_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpngepd ((__v2df)__A, (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpord_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpordpd ((__v2df)__A, (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpunord_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpunordpd ((__v2df)__A, (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_sd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpeqsd ((__v2df)__A, (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_sd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpltsd ((__v2df)__A, (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmple_sd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmplesd ((__v2df)__A, (__v2df)__B);
}

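/* Editorial note (not in the original header): there are no cmpgtsd or
   cmpgesd builtins, so the greater-than forms below are composed from the
   reversed less-than/less-equal compares, with MOVSD merging the compare
   result into element 0 while keeping element 1 of the first operand.  */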
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_sd (__m128d __A, __m128d __B)
{
  return (__m128d) __builtin_ia32_movsd ((__v2df) __A,
					 (__v2df)
					 __builtin_ia32_cmpltsd ((__v2df) __B,
								 (__v2df)
								 __A));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpge_sd (__m128d __A, __m128d __B)
{
  return (__m128d) __builtin_ia32_movsd ((__v2df) __A,
					 (__v2df)
					 __builtin_ia32_cmplesd ((__v2df) __B,
								 (__v2df)
								 __A));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpneq_sd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpneqsd ((__v2df)__A, (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnlt_sd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpnltsd ((__v2df)__A, (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnle_sd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpnlesd ((__v2df)__A, (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpngt_sd (__m128d __A, __m128d __B)
{
  return (__m128d) __builtin_ia32_movsd ((__v2df) __A,
					 (__v2df)
					 __builtin_ia32_cmpnltsd ((__v2df) __B,
								  (__v2df)
								  __A));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnge_sd (__m128d __A, __m128d __B)
{
  return (__m128d) __builtin_ia32_movsd ((__v2df) __A,
					 (__v2df)
					 __builtin_ia32_cmpnlesd ((__v2df) __B,
								  (__v2df)
								  __A));
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpord_sd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpordsd ((__v2df)__A, (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpunord_sd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_cmpunordsd ((__v2df)__A, (__v2df)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comieq_sd (__m128d __A, __m128d __B)
{
  return __builtin_ia32_comisdeq ((__v2df)__A, (__v2df)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comilt_sd (__m128d __A, __m128d __B)
{
  return __builtin_ia32_comisdlt ((__v2df)__A, (__v2df)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comile_sd (__m128d __A, __m128d __B)
{
  return __builtin_ia32_comisdle ((__v2df)__A, (__v2df)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comigt_sd (__m128d __A, __m128d __B)
{
  return __builtin_ia32_comisdgt ((__v2df)__A, (__v2df)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comige_sd (__m128d __A, __m128d __B)
{
  return __builtin_ia32_comisdge ((__v2df)__A, (__v2df)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comineq_sd (__m128d __A, __m128d __B)
{
  return __builtin_ia32_comisdneq ((__v2df)__A, (__v2df)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomieq_sd (__m128d __A, __m128d __B)
{
  return __builtin_ia32_ucomisdeq ((__v2df)__A, (__v2df)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomilt_sd (__m128d __A, __m128d __B)
{
  return __builtin_ia32_ucomisdlt ((__v2df)__A, (__v2df)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomile_sd (__m128d __A, __m128d __B)
{
  return __builtin_ia32_ucomisdle ((__v2df)__A, (__v2df)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomigt_sd (__m128d __A, __m128d __B)
{
  return __builtin_ia32_ucomisdgt ((__v2df)__A, (__v2df)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomige_sd (__m128d __A, __m128d __B)
{
  return __builtin_ia32_ucomisdge ((__v2df)__A, (__v2df)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomineq_sd (__m128d __A, __m128d __B)
{
  return __builtin_ia32_ucomisdneq ((__v2df)__A, (__v2df)__B);
}
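
/* Editorial sketch (not part of the original header; compiled out): the
   comi/ucomi intrinsics above compare element 0 of each operand and return
   an int suitable for branching.  The function name is hypothetical.  */
#if 0
static double
__comi_example (__m128d __a, __m128d __b)
{
  /* NaNs aside, returns the smaller of the two low elements.  */
  return _mm_comilt_sd (__a, __b) ? ((__v2df)__a)[0] : ((__v2df)__b)[0];
}
#endif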

/* Create a vector of Qi, where i is the element number.  */

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_epi64x (long long __q1, long long __q0)
{
  return __extension__ (__m128i)(__v2di){ __q0, __q1 };
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_epi64 (__m64 __q1,  __m64 __q0)
{
  return _mm_set_epi64x ((long long)__q1, (long long)__q0);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_epi32 (int __q3, int __q2, int __q1, int __q0)
{
  return __extension__ (__m128i)(__v4si){ __q0, __q1, __q2, __q3 };
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_epi16 (short __q7, short __q6, short __q5, short __q4,
	       short __q3, short __q2, short __q1, short __q0)
{
  return __extension__ (__m128i)(__v8hi){
    __q0, __q1, __q2, __q3, __q4, __q5, __q6, __q7 };
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_epi8 (char __q15, char __q14, char __q13, char __q12,
	      char __q11, char __q10, char __q09, char __q08,
	      char __q07, char __q06, char __q05, char __q04,
	      char __q03, char __q02, char __q01, char __q00)
{
  return __extension__ (__m128i)(__v16qi){
    __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
    __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15
  };
}

/* Set all of the elements of the vector to A.  */

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_epi64x (long long __A)
{
  return _mm_set_epi64x (__A, __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_epi64 (__m64 __A)
{
  return _mm_set_epi64 (__A, __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_epi32 (int __A)
{
  return _mm_set_epi32 (__A, __A, __A, __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_epi16 (short __A)
{
  return _mm_set_epi16 (__A, __A, __A, __A, __A, __A, __A, __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_epi8 (char __A)
{
  return _mm_set_epi8 (__A, __A, __A, __A, __A, __A, __A, __A,
		       __A, __A, __A, __A, __A, __A, __A, __A);
}

/* Create a vector of Qi, where i is the element number.
   The parameter order is reversed from the _mm_set_epi* functions.  */

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_epi64 (__m64 __q0, __m64 __q1)
{
  return _mm_set_epi64 (__q1, __q0);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_epi32 (int __q0, int __q1, int __q2, int __q3)
{
  return _mm_set_epi32 (__q3, __q2, __q1, __q0);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_epi16 (short __q0, short __q1, short __q2, short __q3,
	        short __q4, short __q5, short __q6, short __q7)
{
  return _mm_set_epi16 (__q7, __q6, __q5, __q4, __q3, __q2, __q1, __q0);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_epi8 (char __q00, char __q01, char __q02, char __q03,
	       char __q04, char __q05, char __q06, char __q07,
	       char __q08, char __q09, char __q10, char __q11,
	       char __q12, char __q13, char __q14, char __q15)
{
  return _mm_set_epi8 (__q15, __q14, __q13, __q12, __q11, __q10, __q09, __q08,
		       __q07, __q06, __q05, __q04, __q03, __q02, __q01, __q00);
}
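
/* Editorial sketch (not part of the original header; compiled out): as with
   the _pd constructors, _mm_set_epi32 takes its arguments from the highest
   element down, while _mm_setr_epi32 takes them from element 0 up.  The
   function name is hypothetical.  */
#if 0
static int
__set_epi32_order_example (void)
{
  __m128i __a = _mm_set_epi32 (3, 2, 1, 0);   /* elements {0, 1, 2, 3} */
  __m128i __b = _mm_setr_epi32 (0, 1, 2, 3);  /* same vector            */
  return ((__v4si)__a)[0] + ((__v4si)__b)[3]; /* 0 + 3 == 3             */
}
#endif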

/* Load 128 bits of integer data.  For _mm_load_si128 the address must be
   16-byte aligned; the _u variants below have no alignment requirement.  */

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_si128 (__m128i const *__P)
{
  return *__P;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadu_si128 (__m128i_u const *__P)
{
  return *__P;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadl_epi64 (__m128i_u const *__P)
{
  return _mm_set_epi64 ((__m64)0LL, *(__m64_u *)__P);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadu_si64 (void const *__P)
{
  return _mm_loadl_epi64 ((__m128i_u *)__P);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadu_si32 (void const *__P)
{
  return _mm_set_epi32 (0, 0, 0, (*(__m32_u *)__P)[0]);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadu_si16 (void const *__P)
{
  return _mm_set_epi16 (0, 0, 0, 0, 0, 0, 0, (*(__m16_u *)__P)[0]);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_si128 (__m128i *__P, __m128i __B)
{
  *__P = __B;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storeu_si128 (__m128i_u *__P, __m128i __B)
{
  *__P = __B;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storel_epi64 (__m128i_u *__P, __m128i __B)
{
  *(__m64_u *)__P = (__m64) ((__v2di)__B)[0];
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storeu_si64 (void *__P, __m128i __B)
{
  _mm_storel_epi64 ((__m128i_u *)__P, __B);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storeu_si32 (void *__P, __m128i __B)
{
  *(__m32_u *)__P = (__m32) ((__v4si)__B)[0];
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storeu_si16 (void *__P, __m128i __B)
{
  *(__m16_u *)__P = (__m16) ((__v8hi)__B)[0];
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movepi64_pi64 (__m128i __B)
{
  return (__m64) ((__v2di)__B)[0];
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movpi64_epi64 (__m64 __A)
{
  return _mm_set_epi64 ((__m64)0LL, __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_move_epi64 (__m128i __A)
{
  return (__m128i)__builtin_ia32_movq128 ((__v2di) __A);
}

/* Create an undefined vector.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_undefined_si128 (void)
{
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Winit-self"
  __m128i __Y = __Y;
#pragma GCC diagnostic pop
  return __Y;
}

/* Create a vector of zeros.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setzero_si128 (void)
{
  return __extension__ (__m128i)(__v4si){ 0, 0, 0, 0 };
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepi32_pd (__m128i __A)
{
  return (__m128d)__builtin_ia32_cvtdq2pd ((__v4si) __A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepi32_ps (__m128i __A)
{
  return (__m128)__builtin_ia32_cvtdq2ps ((__v4si) __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpd_epi32 (__m128d __A)
{
  return (__m128i)__builtin_ia32_cvtpd2dq ((__v2df) __A);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpd_pi32 (__m128d __A)
{
  return (__m64)__builtin_ia32_cvtpd2pi ((__v2df) __A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpd_ps (__m128d __A)
{
  return (__m128)__builtin_ia32_cvtpd2ps ((__v2df) __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttpd_epi32 (__m128d __A)
{
  return (__m128i)__builtin_ia32_cvttpd2dq ((__v2df) __A);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttpd_pi32 (__m128d __A)
{
  return (__m64)__builtin_ia32_cvttpd2pi ((__v2df) __A);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpi32_pd (__m64 __A)
{
  return (__m128d)__builtin_ia32_cvtpi2pd ((__v2si) __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtps_epi32 (__m128 __A)
{
  return (__m128i)__builtin_ia32_cvtps2dq ((__v4sf) __A);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttps_epi32 (__m128 __A)
{
  return (__m128i)__builtin_ia32_cvttps2dq ((__v4sf) __A);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtps_pd (__m128 __A)
{
  return (__m128d)__builtin_ia32_cvtps2pd ((__v4sf) __A);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsd_si32 (__m128d __A)
{
  return __builtin_ia32_cvtsd2si ((__v2df) __A);
}

#ifdef __x86_64__
/* Intel intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsd_si64 (__m128d __A)
{
  return __builtin_ia32_cvtsd2si64 ((__v2df) __A);
}

/* Microsoft intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsd_si64x (__m128d __A)
{
  return __builtin_ia32_cvtsd2si64 ((__v2df) __A);
}
#endif

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttsd_si32 (__m128d __A)
{
  return __builtin_ia32_cvttsd2si ((__v2df) __A);
}

#ifdef __x86_64__
/* Intel intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttsd_si64 (__m128d __A)
{
  return __builtin_ia32_cvttsd2si64 ((__v2df) __A);
}

/* Microsoft intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttsd_si64x (__m128d __A)
{
  return __builtin_ia32_cvttsd2si64 ((__v2df) __A);
}
#endif

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsd_ss (__m128 __A, __m128d __B)
{
  return (__m128)__builtin_ia32_cvtsd2ss ((__v4sf) __A, (__v2df) __B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi32_sd (__m128d __A, int __B)
{
  return (__m128d)__builtin_ia32_cvtsi2sd ((__v2df) __A, __B);
}

#ifdef __x86_64__
/* Intel intrinsic.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_sd (__m128d __A, long long __B)
{
  return (__m128d)__builtin_ia32_cvtsi642sd ((__v2df) __A, __B);
}

/* Microsoft intrinsic.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64x_sd (__m128d __A, long long __B)
{
  return (__m128d)__builtin_ia32_cvtsi642sd ((__v2df) __A, __B);
}
#endif

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtss_sd (__m128d __A, __m128 __B)
{
  return (__m128d)__builtin_ia32_cvtss2sd ((__v2df) __A, (__v4sf)__B);
}

#ifdef __OPTIMIZE__
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_pd(__m128d __A, __m128d __B, const int __mask)
{
  return (__m128d)__builtin_ia32_shufpd ((__v2df)__A, (__v2df)__B, __mask);
}
#else
#define _mm_shuffle_pd(A, B, N)						\
  ((__m128d)__builtin_ia32_shufpd ((__v2df)(__m128d)(A),		\
				   (__v2df)(__m128d)(B), (int)(N)))
#endif

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_unpckhpd ((__v2df)__A, (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pd (__m128d __A, __m128d __B)
{
  return (__m128d)__builtin_ia32_unpcklpd ((__v2df)__A, (__v2df)__B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadh_pd (__m128d __A, double const *__B)
{
  return (__m128d)__builtin_ia32_loadhpd ((__v2df)__A, __B);
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadl_pd (__m128d __A, double const *__B)
{
  return (__m128d)__builtin_ia32_loadlpd ((__v2df)__A, __B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movemask_pd (__m128d __A)
{
  return __builtin_ia32_movmskpd ((__v2df)__A);
}
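
/* Editorial sketch (not part of the original header; compiled out):
   combines a packed compare with _mm_movemask_pd, which gathers the sign
   bit of each element into the low two bits of an int.  The function name
   is hypothetical.  */
#if 0
static int
__movemask_pd_example (__m128d __a, __m128d __b)
{
  /* Bit i of the result is set when __a[i] < __b[i].  */
  return _mm_movemask_pd (_mm_cmplt_pd (__a, __b));
}
#endif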

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_packsswb128 ((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_packssdw128 ((__v4si)__A, (__v4si)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packus_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_packuswb128 ((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_punpckhbw128 ((__v16qi)__A, (__v16qi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_punpckhwd128 ((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_punpckhdq128 ((__v4si)__A, (__v4si)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_epi64 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_punpckhqdq128 ((__v2di)__A, (__v2di)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_punpcklbw128 ((__v16qi)__A, (__v16qi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_punpcklwd128 ((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_punpckldq128 ((__v4si)__A, (__v4si)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_epi64 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_punpcklqdq128 ((__v2di)__A, (__v2di)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v16qu)__A + (__v16qu)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v8hu)__A + (__v8hu)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v4su)__A + (__v4su)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_epi64 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v2du)__A + (__v2du)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_paddsb128 ((__v16qi)__A, (__v16qi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_paddsw128 ((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_epu8 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_paddusb128 ((__v16qi)__A, (__v16qi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_epu16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_paddusw128 ((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v16qu)__A - (__v16qu)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v8hu)__A - (__v8hu)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v4su)__A - (__v4su)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_epi64 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v2du)__A - (__v2du)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psubsb128 ((__v16qi)__A, (__v16qi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psubsw128 ((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_epu8 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psubusb128 ((__v16qi)__A, (__v16qi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_epu16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psubusw128 ((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_madd_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pmaddwd128 ((__v8hi)__A, (__v8hi)__B);
}
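
/* Editorial sketch (not part of the original header; compiled out):
   _mm_madd_epi16 multiplies corresponding signed 16-bit elements and sums
   each adjacent pair of 32-bit products, which makes it a convenient
   building block for dot products.  The function name is hypothetical.  */
#if 0
static int
__madd_epi16_example (void)
{
  __m128i __a = _mm_set1_epi16 (2);
  __m128i __b = _mm_set1_epi16 (3);
  /* Every 32-bit result element is 2*3 + 2*3 == 12.  */
  return ((__v4si)_mm_madd_epi16 (__a, __b))[0];
}
#endif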

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhi_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pmulhw128 ((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mullo_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v8hu)__A * (__v8hu)__B);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_su32 (__m64 __A, __m64 __B)
{
  return (__m64)__builtin_ia32_pmuludq ((__v2si)__A, (__v2si)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_epu32 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pmuludq128 ((__v4si)__A, (__v4si)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_epi16 (__m128i __A, int __B)
{
  return (__m128i)__builtin_ia32_psllwi128 ((__v8hi)__A, __B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_epi32 (__m128i __A, int __B)
{
  return (__m128i)__builtin_ia32_pslldi128 ((__v4si)__A, __B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_epi64 (__m128i __A, int __B)
{
  return (__m128i)__builtin_ia32_psllqi128 ((__v2di)__A, __B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srai_epi16 (__m128i __A, int __B)
{
  return (__m128i)__builtin_ia32_psrawi128 ((__v8hi)__A, __B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srai_epi32 (__m128i __A, int __B)
{
  return (__m128i)__builtin_ia32_psradi128 ((__v4si)__A, __B);
}

#ifdef __OPTIMIZE__
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_bsrli_si128 (__m128i __A, const int __N)
{
  return (__m128i)__builtin_ia32_psrldqi128 (__A, __N * 8);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_bslli_si128 (__m128i __A, const int __N)
{
  return (__m128i)__builtin_ia32_pslldqi128 (__A, __N * 8);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_si128 (__m128i __A, const int __N)
{
  return (__m128i)__builtin_ia32_psrldqi128 (__A, __N * 8);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_si128 (__m128i __A, const int __N)
{
  return (__m128i)__builtin_ia32_pslldqi128 (__A, __N * 8);
}
#else
#define _mm_bsrli_si128(A, N) \
  ((__m128i)__builtin_ia32_psrldqi128 ((__m128i)(A), (int)(N) * 8))
#define _mm_bslli_si128(A, N) \
  ((__m128i)__builtin_ia32_pslldqi128 ((__m128i)(A), (int)(N) * 8))
#define _mm_srli_si128(A, N) \
  ((__m128i)__builtin_ia32_psrldqi128 ((__m128i)(A), (int)(N) * 8))
#define _mm_slli_si128(A, N) \
  ((__m128i)__builtin_ia32_pslldqi128 ((__m128i)(A), (int)(N) * 8))
#endif
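
/* Editorial sketch (not part of the original header; compiled out): the
   si128 shifts above move the whole 128-bit value by N *bytes*, unlike the
   per-element bit shifts above and below.  The function name is
   hypothetical.  */
#if 0
static int
__bsrli_example (void)
{
  __m128i __a = _mm_set_epi32 (4, 3, 2, 1);       /* elements {1, 2, 3, 4} */
  /* Shifting right by 4 bytes discards element 0 and shifts in zeros,
     giving elements {2, 3, 4, 0}.  */
  return ((__v4si)_mm_bsrli_si128 (__a, 4))[0];   /* 2 */
}
#endif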
    1247  
    1248  extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    1249  _mm_srli_epi16 (__m128i __A, int __B)
    1250  {
    1251    return (__m128i)__builtin_ia32_psrlwi128 ((__v8hi)__A, __B);
    1252  }
    1253  
    1254  extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    1255  _mm_srli_epi32 (__m128i __A, int __B)
    1256  {
    1257    return (__m128i)__builtin_ia32_psrldi128 ((__v4si)__A, __B);
    1258  }
    1259  
    1260  extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    1261  _mm_srli_epi64 (__m128i __A, int __B)
    1262  {
    1263    return (__m128i)__builtin_ia32_psrlqi128 ((__v2di)__A, __B);
    1264  }
    1265  
    1266  extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    1267  _mm_sll_epi16 (__m128i __A, __m128i __B)
    1268  {
    1269    return (__m128i)__builtin_ia32_psllw128((__v8hi)__A, (__v8hi)__B);
    1270  }
    1271  
    1272  extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    1273  _mm_sll_epi32 (__m128i __A, __m128i __B)
    1274  {
    1275    return (__m128i)__builtin_ia32_pslld128((__v4si)__A, (__v4si)__B);
    1276  }
    1277  
    1278  extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    1279  _mm_sll_epi64 (__m128i __A, __m128i __B)
    1280  {
    1281    return (__m128i)__builtin_ia32_psllq128((__v2di)__A, (__v2di)__B);
    1282  }
    1283  
    1284  extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    1285  _mm_sra_epi16 (__m128i __A, __m128i __B)
    1286  {
    1287    return (__m128i)__builtin_ia32_psraw128 ((__v8hi)__A, (__v8hi)__B);
    1288  }
    1289  
    1290  extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    1291  _mm_sra_epi32 (__m128i __A, __m128i __B)
    1292  {
    1293    return (__m128i)__builtin_ia32_psrad128 ((__v4si)__A, (__v4si)__B);
    1294  }
    1295  
    1296  extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    1297  _mm_srl_epi16 (__m128i __A, __m128i __B)
    1298  {
    1299    return (__m128i)__builtin_ia32_psrlw128 ((__v8hi)__A, (__v8hi)__B);
    1300  }
    1301  
    1302  extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    1303  _mm_srl_epi32 (__m128i __A, __m128i __B)
    1304  {
    1305    return (__m128i)__builtin_ia32_psrld128 ((__v4si)__A, (__v4si)__B);
    1306  }
    1307  
    1308  extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    1309  _mm_srl_epi64 (__m128i __A, __m128i __B)
    1310  {
    1311    return (__m128i)__builtin_ia32_psrlq128 ((__v2di)__A, (__v2di)__B);
    1312  }

/* Bitwise logic on the full 128-bit value; _mm_andnot_si128 computes
   (~__A) & __B.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_and_si128 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v2du)__A & (__v2du)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_andnot_si128 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pandn128 ((__v2di)__A, (__v2di)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_or_si128 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v2du)__A | (__v2du)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_xor_si128 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v2du)__A ^ (__v2du)__B);
}
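/* Illustrative sketch, not part of the Intel API: the classic SSE2 select
   idiom built from the logic intrinsics above.  Each result bit is taken
   from __a where __mask has a one bit and from __b elsewhere.  The helper
   name is hypothetical.  */
static __inline __m128i __attribute__((__unused__))
__emm_example_select (__m128i __mask, __m128i __a, __m128i __b)
{
  /* (__mask & __a) | (~__mask & __b); _mm_andnot_si128 supplies the
     complement of its first operand.  */
  return _mm_or_si128 (_mm_and_si128 (__mask, __a),
		       _mm_andnot_si128 (__mask, __b));
}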

/* Element-wise comparisons; each lane of the result is set to all ones
   where the relation holds and to all zeros where it does not.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v16qi)__A == (__v16qi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v8hi)__A == (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v4si)__A == (__v4si)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v16qs)__A < (__v16qs)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v8hi)__A < (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v4si)__A < (__v4si)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v16qs)__A > (__v16qs)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v8hi)__A > (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_epi32 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v4si)__A > (__v4si)__B);
}
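/* Illustrative sketch, not part of the Intel API: SSE2 compares are signed,
   but an unsigned 8-bit "greater than" can be synthesized by flipping the
   sign bit of both operands first.  This assumes the _mm_set1_epi8
   initializer defined earlier in this header; the helper name is
   hypothetical.  */
static __inline __m128i __attribute__((__unused__))
__emm_example_cmpgt_epu8 (__m128i __a, __m128i __b)
{
  /* XORing with 0x80 maps unsigned order onto signed order.  */
  __m128i __bias = _mm_set1_epi8 ((char) 0x80);
  return _mm_cmpgt_epi8 (_mm_xor_si128 (__a, __bias),
			 _mm_xor_si128 (__b, __bias));
}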

/* Extract the 16-bit element selected by __N (zero-extended to int), or
   insert __D at that position; the selector __N is an immediate in the
   range 0..7.  */
#ifdef __OPTIMIZE__
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_extract_epi16 (__m128i const __A, int const __N)
{
  return (unsigned short) __builtin_ia32_vec_ext_v8hi ((__v8hi)__A, __N);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_insert_epi16 (__m128i const __A, int const __D, int const __N)
{
  return (__m128i) __builtin_ia32_vec_set_v8hi ((__v8hi)__A, __D, __N);
}
#else
#define _mm_extract_epi16(A, N) \
  ((int) (unsigned short) __builtin_ia32_vec_ext_v8hi ((__v8hi)(__m128i)(A), (int)(N)))
#define _mm_insert_epi16(A, D, N)				\
  ((__m128i) __builtin_ia32_vec_set_v8hi ((__v8hi)(__m128i)(A),	\
					  (int)(D), (int)(N)))
#endif
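/* Illustrative sketch, not part of the Intel API: reading one 16-bit lane
   with a constant selector.  The helper name is hypothetical.  */
static __inline int __attribute__((__unused__))
__emm_example_top_word (__m128i __v)
{
  /* Element 7 (the most significant 16-bit lane) is returned zero-extended
     to int.  */
  return _mm_extract_epi16 (__v, 7);
}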

/* Element-wise maxima and minima: signed for 16-bit elements, unsigned
   for 8-bit elements.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pmaxsw128 ((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_epu8 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pmaxub128 ((__v16qi)__A, (__v16qi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pminsw128 ((__v8hi)__A, (__v8hi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_epu8 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pminub128 ((__v16qi)__A, (__v16qi)__B);
}
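/* Illustrative sketch, not part of the Intel API: clamping each unsigned
   byte of __v into the range [__lo, __hi] with the unsigned min/max
   intrinsics above.  This assumes the _mm_set1_epi8 initializer defined
   earlier in this header; the helper name is hypothetical.  */
static __inline __m128i __attribute__((__unused__))
__emm_example_clamp_epu8 (__m128i __v, unsigned char __lo, unsigned char __hi)
{
  __m128i __vlo = _mm_set1_epi8 ((char) __lo);
  __m128i __vhi = _mm_set1_epi8 ((char) __hi);
  /* Raise to the lower bound, then cap at the upper bound.  */
  return _mm_min_epu8 (_mm_max_epu8 (__v, __vlo), __vhi);
}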

/* Gather the most significant bit of each byte of __A into a 16-bit mask.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movemask_epi8 (__m128i __A)
{
  return __builtin_ia32_pmovmskb128 ((__v16qi)__A);
}
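/* Illustrative sketch, not part of the Intel API: testing whether any byte
   of __v equals __c by combining _mm_cmpeq_epi8 with _mm_movemask_epi8.
   This assumes the _mm_set1_epi8 initializer defined earlier in this header;
   the helper name is hypothetical.  */
static __inline int __attribute__((__unused__))
__emm_example_contains_byte (__m128i __v, char __c)
{
  /* Matching lanes become 0xFF; their sign bits are gathered into a 16-bit
     mask, which is nonzero exactly when at least one lane matched.  */
  return _mm_movemask_epi8 (_mm_cmpeq_epi8 (__v, _mm_set1_epi8 (__c))) != 0;
}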

/* Return the high 16 bits of each 32-bit unsigned product of __A and __B.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhi_epu16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pmulhuw128 ((__v8hi)__A, (__v8hi)__B);
}
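/* Illustrative sketch, not part of the Intel API: taking the high halves of
   the unsigned products makes _mm_mulhi_epu16 convenient for 0.16 fixed-point
   scaling.  This assumes the _mm_set1_epi16 initializer defined earlier in
   this header; the helper name is hypothetical.  */
static __inline __m128i __attribute__((__unused__))
__emm_example_scale_epu16 (__m128i __v, unsigned short __frac)
{
  /* Each lane becomes (lane * __frac) >> 16, i.e. lane scaled by
     __frac / 65536.  */
  return _mm_mulhi_epu16 (__v, _mm_set1_epi16 ((short) __frac));
}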

/* Shuffle elements according to the 2-bit index fields of the constant
   __mask; _mm_shufflehi_epi16 and _mm_shufflelo_epi16 rearrange only the
   high or low four 16-bit elements and copy the other half through
   unchanged.  */
#ifdef __OPTIMIZE__
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shufflehi_epi16 (__m128i __A, const int __mask)
{
  return (__m128i)__builtin_ia32_pshufhw ((__v8hi)__A, __mask);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shufflelo_epi16 (__m128i __A, const int __mask)
{
  return (__m128i)__builtin_ia32_pshuflw ((__v8hi)__A, __mask);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_epi32 (__m128i __A, const int __mask)
{
  return (__m128i)__builtin_ia32_pshufd ((__v4si)__A, __mask);
}
#else
#define _mm_shufflehi_epi16(A, N) \
  ((__m128i)__builtin_ia32_pshufhw ((__v8hi)(__m128i)(A), (int)(N)))
#define _mm_shufflelo_epi16(A, N) \
  ((__m128i)__builtin_ia32_pshuflw ((__v8hi)(__m128i)(A), (int)(N)))
#define _mm_shuffle_epi32(A, N) \
  ((__m128i)__builtin_ia32_pshufd ((__v4si)(__m128i)(A), (int)(N)))
#endif
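/* Illustrative sketch, not part of the Intel API: broadcasting element 0 of
   __v to all four 32-bit positions with _mm_shuffle_epi32 and the
   _MM_SHUFFLE selector macro from <xmmintrin.h>.  The helper name is
   hypothetical.  */
static __inline __m128i __attribute__((__unused__))
__emm_example_broadcast_epi32 (__m128i __v)
{
  /* Every 2-bit field of the selector picks source element 0.  */
  return _mm_shuffle_epi32 (__v, _MM_SHUFFLE (0, 0, 0, 0));
}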

/* Store the bytes of __A selected by __B to the unaligned address __C:
   a byte is written only where the corresponding byte of __B has its most
   significant bit set (non-temporal hint).  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskmoveu_si128 (__m128i __A, __m128i __B, char *__C)
{
  __builtin_ia32_maskmovdqu ((__v16qi)__A, (__v16qi)__B, __C);
}

/* Rounded averages of unsigned elements: (a + b + 1) >> 1.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_avg_epu8 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pavgb128 ((__v16qi)__A, (__v16qi)__B);
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_avg_epu16 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_pavgw128 ((__v8hi)__A, (__v8hi)__B);
}

/* Sum the absolute differences of the unsigned bytes of __A and __B;
   each group of eight produces a 16-bit total stored in the low 16 bits
   of the corresponding 64-bit half of the result.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sad_epu8 (__m128i __A, __m128i __B)
{
  return (__m128i)__builtin_ia32_psadbw128 ((__v16qi)__A, (__v16qi)__B);
}
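/* Illustrative sketch, not part of the Intel API: summing all sixteen
   unsigned bytes of __v.  _mm_sad_epu8 against zero yields one partial sum
   per 64-bit half; the two are then combined.  This assumes _mm_setzero_si128,
   _mm_add_epi32 and _mm_cvtsi128_si32 defined earlier in this header and the
   _mm_srli_si128 byte shift defined above; the helper name is hypothetical.  */
static __inline int __attribute__((__unused__))
__emm_example_sum_epu8 (__m128i __v)
{
  __m128i __sums = _mm_sad_epu8 (__v, _mm_setzero_si128 ());
  /* Bring the upper partial sum down beside the lower one and add; the
     total is at most 16 * 255, so it fits comfortably in 32 bits.  */
  __sums = _mm_add_epi32 (__sums, _mm_srli_si128 (__sums, 8));
  return _mm_cvtsi128_si32 (__sums);
}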

/* Non-temporal stores: write __B to *__A with a hint that the data should
   bypass the caches.  The 128-bit forms require __A to be 16-byte aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_si32 (int *__A, int __B)
{
  __builtin_ia32_movnti (__A, __B);
}

#ifdef __x86_64__
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_si64 (long long int *__A, long long int __B)
{
  __builtin_ia32_movnti64 (__A, __B);
}
#endif

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_si128 (__m128i *__A, __m128i __B)
{
  __builtin_ia32_movntdq ((__v2di *)__A, (__v2di)__B);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_pd (double *__A, __m128d __B)
{
  __builtin_ia32_movntpd (__A, (__v2df)__B);
}
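/* Illustrative sketch, not part of the Intel API: filling a buffer with
   non-temporal stores.  __dst must be 16-byte aligned and __n a multiple of
   four ints; _mm_sfence comes from <xmmintrin.h> and _mm_set1_epi32 is
   defined earlier in this header.  The helper name is hypothetical.  */
static __inline void __attribute__((__unused__))
__emm_example_stream_fill (int *__dst, int __value, unsigned long __n)
{
  __m128i __v = _mm_set1_epi32 (__value);
  unsigned long __i;
  /* Each store carries a hint that the data should bypass the caches,
     which avoids polluting them when the buffer will not be read soon.  */
  for (__i = 0; __i < __n; __i += 4)
    _mm_stream_si128 ((__m128i *) (__dst + __i), __v);
  /* Order the streamed stores before any subsequent stores.  */
  _mm_sfence ();
}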

/* Flush the cache line containing __A.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_clflush (void const *__A)
{
  __builtin_ia32_clflush (__A);
}

/* Serialize load operations.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_lfence (void)
{
  __builtin_ia32_lfence ();
}

/* Serialize all load and store operations.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mfence (void)
{
  __builtin_ia32_mfence ();
}

/* Move the 32-bit integer __A into the low element of a vector whose
   remaining elements are zero.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi32_si128 (int __A)
{
  return _mm_set_epi32 (0, 0, 0, __A);
}
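/* Illustrative sketch, not part of the Intel API: _mm_cvtsi32_si128 pairs
   with the _mm_cvtsi128_si32 accessor defined earlier in this header to move
   a scalar into and out of the low 32-bit lane.  The helper name is
   hypothetical.  */
static __inline int __attribute__((__unused__))
__emm_example_roundtrip_si32 (int __x)
{
  /* The upper 96 bits are zeroed on the way in and ignored on the way out.  */
  return _mm_cvtsi128_si32 (_mm_cvtsi32_si128 (__x));
}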

#ifdef __x86_64__
/* Intel intrinsic.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_si128 (long long __A)
{
  return _mm_set_epi64x (0, __A);
}

/* Microsoft intrinsic.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64x_si128 (long long __A)
{
  return _mm_set_epi64x (0, __A);
}
#endif

/* Casts between various SP, DP, INT vector types.  Note that these do no
   conversion of values, they just change the type.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castpd_ps(__m128d __A)
{
  return (__m128) __A;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castpd_si128(__m128d __A)
{
  return (__m128i) __A;
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castps_pd(__m128 __A)
{
  return (__m128d) __A;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castps_si128(__m128 __A)
{
  return (__m128i) __A;
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castsi128_ps(__m128i __A)
{
  return (__m128) __A;
}

extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_castsi128_pd(__m128i __A)
{
  return (__m128d) __A;
}
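/* Illustrative sketch, not part of the Intel API: because the casts are pure
   reinterpretations, integer logic can operate on floating-point data.  Here
   the sign bits of two packed doubles are cleared to obtain absolute values.
   This assumes the _mm_set1_pd initializer defined earlier in this header;
   the helper name is hypothetical.  */
static __inline __m128d __attribute__((__unused__))
__emm_example_abs_pd (__m128d __x)
{
  /* -0.0 has only the sign bit set; _mm_andnot_si128 clears exactly those
     bits in __x.  */
  __m128i __sign = _mm_castpd_si128 (_mm_set1_pd (-0.0));
  return _mm_castsi128_pd (_mm_andnot_si128 (__sign, _mm_castpd_si128 (__x)));
}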

#ifdef __DISABLE_SSE2__
#undef __DISABLE_SSE2__
#pragma GCC pop_options
#endif /* __DISABLE_SSE2__ */

#endif /* _EMMINTRIN_H_INCLUDED */