/* Copyright (C) 2002-2023 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0.  */

#ifndef _XMMINTRIN_H_INCLUDED
#define _XMMINTRIN_H_INCLUDED

/* We need type definitions from the MMX header file.  */
#include <mmintrin.h>

/* Get _mm_malloc () and _mm_free ().  */
#include <mm_malloc.h>

/* Constants for use with _mm_prefetch.  */
enum _mm_hint
{
  _MM_HINT_IT0 = 19,
  _MM_HINT_IT1 = 18,
  /* _MM_HINT_ET is _MM_HINT_T with the 3rd bit set.  */
  _MM_HINT_ET0 = 7,
  _MM_HINT_ET1 = 6,
  _MM_HINT_T0 = 3,
  _MM_HINT_T1 = 2,
  _MM_HINT_T2 = 1,
  _MM_HINT_NTA = 0
};

/* Loads one cache line from address P to a location "closer" to the
   processor.  The selector I specifies the type of prefetch operation.  */
#ifdef __OPTIMIZE__
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_prefetch (const void *__P, enum _mm_hint __I)
{
  __builtin_ia32_prefetch (__P, (__I & 0x4) >> 2,
			   __I & 0x3, (__I & 0x10) >> 4);
}
#else
#define _mm_prefetch(P, I) \
  __builtin_ia32_prefetch ((P), ((I) & 0x4) >> 2, ((I) & 0x3), ((I) & 0x10) >> 4)
#endif
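
/* Usage sketch (illustrative only, not part of this header): hide memory
   latency in a streaming loop by prefetching data that will be needed a
   few iterations ahead.  The array and prefetch distance are assumptions
   made for the example.

     void scale (float *a, size_t n, float s)
     {
       for (size_t i = 0; i < n; i++)
	 {
	   if (i + 64 < n)
	     _mm_prefetch (&a[i + 64], _MM_HINT_T0);
	   a[i] *= s;
	 }
     }
*/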

#ifndef __SSE__
#pragma GCC push_options
#pragma GCC target("sse")
#define __DISABLE_SSE__
#endif /* __SSE__ */

/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  */
typedef float __m128 __attribute__ ((__vector_size__ (16), __may_alias__));

/* Unaligned version of the same type.  */
typedef float __m128_u __attribute__ ((__vector_size__ (16), __may_alias__, __aligned__ (1)));

/* Internal data types for implementing the intrinsics.  */
typedef float __v4sf __attribute__ ((__vector_size__ (16)));

/* Create a selector for use with the SHUFPS instruction.  */
#define _MM_SHUFFLE(fp3,fp2,fp1,fp0) \
 (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0))
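
/* Each argument selects a source element with a 2-bit index, packed from
   the high pair down.  For example, _MM_SHUFFLE (3,2,1,0) evaluates to
   (3 << 6) | (2 << 4) | (1 << 2) | 0 == 0xE4, the identity selector.  */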

/* Bits in the MXCSR.  */
#define _MM_EXCEPT_MASK       0x003f
#define _MM_EXCEPT_INVALID    0x0001
#define _MM_EXCEPT_DENORM     0x0002
#define _MM_EXCEPT_DIV_ZERO   0x0004
#define _MM_EXCEPT_OVERFLOW   0x0008
#define _MM_EXCEPT_UNDERFLOW  0x0010
#define _MM_EXCEPT_INEXACT    0x0020

#define _MM_MASK_MASK         0x1f80
#define _MM_MASK_INVALID      0x0080
#define _MM_MASK_DENORM       0x0100
#define _MM_MASK_DIV_ZERO     0x0200
#define _MM_MASK_OVERFLOW     0x0400
#define _MM_MASK_UNDERFLOW    0x0800
#define _MM_MASK_INEXACT      0x1000

#define _MM_ROUND_MASK        0x6000
#define _MM_ROUND_NEAREST     0x0000
#define _MM_ROUND_DOWN        0x2000
#define _MM_ROUND_UP          0x4000
#define _MM_ROUND_TOWARD_ZERO 0x6000

#define _MM_FLUSH_ZERO_MASK   0x8000
#define _MM_FLUSH_ZERO_ON     0x8000
#define _MM_FLUSH_ZERO_OFF    0x0000

/* Create an undefined vector.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_undefined_ps (void)
{
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Winit-self"
  __m128 __Y = __Y;
#pragma GCC diagnostic pop
  return __Y;
}

/* Create a vector of zeros.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setzero_ps (void)
{
  return __extension__ (__m128){ 0.0f, 0.0f, 0.0f, 0.0f };
}

/* Perform the respective operation on the lower SPFP (single-precision
   floating-point) values of A and B; the upper three SPFP values are
   passed through from A.  */

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_addss ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_subss ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_mulss ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_div_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_divss ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sqrt_ss (__m128 __A)
{
  return (__m128) __builtin_ia32_sqrtss ((__v4sf)__A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_rcp_ss (__m128 __A)
{
  return (__m128) __builtin_ia32_rcpss ((__v4sf)__A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_rsqrt_ss (__m128 __A)
{
  return (__m128) __builtin_ia32_rsqrtss ((__v4sf)__A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_minss ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_maxss ((__v4sf)__A, (__v4sf)__B);
}

/* Perform the respective operation on the four SPFP values in A and B.  */

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_ps (__m128 __A, __m128 __B)
{
  return (__m128) ((__v4sf)__A + (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_ps (__m128 __A, __m128 __B)
{
  return (__m128) ((__v4sf)__A - (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_ps (__m128 __A, __m128 __B)
{
  return (__m128) ((__v4sf)__A * (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_div_ps (__m128 __A, __m128 __B)
{
  return (__m128) ((__v4sf)__A / (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sqrt_ps (__m128 __A)
{
  return (__m128) __builtin_ia32_sqrtps ((__v4sf)__A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_rcp_ps (__m128 __A)
{
  return (__m128) __builtin_ia32_rcpps ((__v4sf)__A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_rsqrt_ps (__m128 __A)
{
  return (__m128) __builtin_ia32_rsqrtps ((__v4sf)__A);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_minps ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_maxps ((__v4sf)__A, (__v4sf)__B);
}
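
/* Usage sketch (illustrative only): _mm_rcp_ps and _mm_rsqrt_ps are fast
   low-precision estimates (relative error on the order of 2**-12), so a
   common pattern replaces a packed divide with the estimate plus one
   Newton-Raphson refinement step, as in this hypothetical helper.

     __m128 fast_div (__m128 a, __m128 b)
     {
       __m128 e = _mm_rcp_ps (b);
       e = _mm_sub_ps (_mm_add_ps (e, e), _mm_mul_ps (b, _mm_mul_ps (e, e)));
       return _mm_mul_ps (a, e);
     }
*/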

/* Perform logical bit-wise operations on 128-bit values.  */

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_and_ps (__m128 __A, __m128 __B)
{
  return __builtin_ia32_andps (__A, __B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_andnot_ps (__m128 __A, __m128 __B)
{
  return __builtin_ia32_andnps (__A, __B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_or_ps (__m128 __A, __m128 __B)
{
  return __builtin_ia32_orps (__A, __B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_xor_ps (__m128 __A, __m128 __B)
{
  return __builtin_ia32_xorps (__A, __B);
}
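
/* Usage sketch (illustrative only): these operations treat the register
   as 128 raw bits, which makes sign manipulation cheap.  Since -0.0f has
   only the sign bit set in each lane, clearing it with andnot gives an
   element-wise absolute value.

     __m128 abs_ps (__m128 x)
     {
       return _mm_andnot_ps (_mm_set1_ps (-0.0f), x);
     }
*/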

/* Perform a comparison on the lower SPFP values of A and B.  If the
   comparison is true, place a mask of all ones in the result, otherwise a
   mask of zeros.  The upper three SPFP values are passed through from A.  */

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpeqss ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpltss ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmple_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpless ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_movss ((__v4sf) __A,
					(__v4sf)
					__builtin_ia32_cmpltss ((__v4sf) __B,
								(__v4sf)
								__A));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpge_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_movss ((__v4sf) __A,
					(__v4sf)
					__builtin_ia32_cmpless ((__v4sf) __B,
								(__v4sf)
								__A));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpneq_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpneqss ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnlt_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpnltss ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnle_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpnless ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpngt_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_movss ((__v4sf) __A,
					(__v4sf)
					__builtin_ia32_cmpnltss ((__v4sf) __B,
								 (__v4sf)
								 __A));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnge_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_movss ((__v4sf) __A,
					(__v4sf)
					__builtin_ia32_cmpnless ((__v4sf) __B,
								 (__v4sf)
								 __A));
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpord_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpordss ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpunord_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpunordss ((__v4sf)__A, (__v4sf)__B);
}

/* Perform a comparison on the four SPFP values of A and B.  For each
   element, if the comparison is true, place a mask of all ones in the
   result, otherwise a mask of zeros.  */

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpeqps ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpltps ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmple_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpleps ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpgtps ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpge_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpgeps ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpneq_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpneqps ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnlt_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpnltps ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnle_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpnleps ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpngt_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpngtps ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpnge_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpngeps ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpord_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpordps ((__v4sf)__A, (__v4sf)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpunord_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpunordps ((__v4sf)__A, (__v4sf)__B);
}
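
/* Usage sketch (illustrative only): the all-ones/all-zeros masks produced
   by the packed comparisons combine with the bitwise operations above to
   select between two vectors without branching, here recreating an
   element-wise maximum by hand.

     __m128 select_max (__m128 a, __m128 b)
     {
       __m128 mask = _mm_cmpgt_ps (a, b);
       return _mm_or_ps (_mm_and_ps (mask, a),
			 _mm_andnot_ps (mask, b));
     }
*/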

/* Compare the lower SPFP values of A and B and return 1 if true
   and 0 if false.  */

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comieq_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_comieq ((__v4sf)__A, (__v4sf)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comilt_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_comilt ((__v4sf)__A, (__v4sf)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comile_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_comile ((__v4sf)__A, (__v4sf)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comigt_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_comigt ((__v4sf)__A, (__v4sf)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comige_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_comige ((__v4sf)__A, (__v4sf)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_comineq_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_comineq ((__v4sf)__A, (__v4sf)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomieq_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_ucomieq ((__v4sf)__A, (__v4sf)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomilt_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_ucomilt ((__v4sf)__A, (__v4sf)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomile_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_ucomile ((__v4sf)__A, (__v4sf)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomigt_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_ucomigt ((__v4sf)__A, (__v4sf)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomige_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_ucomige ((__v4sf)__A, (__v4sf)__B);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_ucomineq_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_ucomineq ((__v4sf)__A, (__v4sf)__B);
}
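
/* The two families differ only in NaN signaling: the _mm_comi* forms
   (COMISS) raise the invalid-operation exception for any NaN operand,
   while the _mm_ucomi* ("unordered") forms (UCOMISS) raise it only for
   signaling NaNs.  */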

/* Convert the lower SPFP value to a 32-bit integer according to the current
   rounding mode.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtss_si32 (__m128 __A)
{
  return __builtin_ia32_cvtss2si ((__v4sf) __A);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvt_ss2si (__m128 __A)
{
  return _mm_cvtss_si32 (__A);
}

#ifdef __x86_64__
/* Convert the lower SPFP value to a 32-bit integer according to the
   current rounding mode.  */

/* Intel intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtss_si64 (__m128 __A)
{
  return __builtin_ia32_cvtss2si64 ((__v4sf) __A);
}

/* Microsoft intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtss_si64x (__m128 __A)
{
  return __builtin_ia32_cvtss2si64 ((__v4sf) __A);
}
#endif

/* Convert the two lower SPFP values to 32-bit integers according to the
   current rounding mode.  Return the integers in packed form.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtps_pi32 (__m128 __A)
{
  return (__m64) __builtin_ia32_cvtps2pi ((__v4sf) __A);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvt_ps2pi (__m128 __A)
{
  return _mm_cvtps_pi32 (__A);
}

/* Truncate the lower SPFP value to a 32-bit integer.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttss_si32 (__m128 __A)
{
  return __builtin_ia32_cvttss2si ((__v4sf) __A);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtt_ss2si (__m128 __A)
{
  return _mm_cvttss_si32 (__A);
}

#ifdef __x86_64__
/* Truncate the lower SPFP value to a 32-bit integer.  */

/* Intel intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttss_si64 (__m128 __A)
{
  return __builtin_ia32_cvttss2si64 ((__v4sf) __A);
}

/* Microsoft intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttss_si64x (__m128 __A)
{
  return __builtin_ia32_cvttss2si64 ((__v4sf) __A);
}
#endif

/* Truncate the two lower SPFP values to 32-bit integers.  Return the
   integers in packed form.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvttps_pi32 (__m128 __A)
{
  return (__m64) __builtin_ia32_cvttps2pi ((__v4sf) __A);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtt_ps2pi (__m128 __A)
{
  return _mm_cvttps_pi32 (__A);
}

/* Convert B to a SPFP value and insert it as element zero in A.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi32_ss (__m128 __A, int __B)
{
  return (__m128) __builtin_ia32_cvtsi2ss ((__v4sf) __A, __B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvt_si2ss (__m128 __A, int __B)
{
  return _mm_cvtsi32_ss (__A, __B);
}

#ifdef __x86_64__
/* Convert B to a SPFP value and insert it as element zero in A.  */

/* Intel intrinsic.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_ss (__m128 __A, long long __B)
{
  return (__m128) __builtin_ia32_cvtsi642ss ((__v4sf) __A, __B);
}

/* Microsoft intrinsic.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64x_ss (__m128 __A, long long __B)
{
  return (__m128) __builtin_ia32_cvtsi642ss ((__v4sf) __A, __B);
}
#endif

/* Convert the two 32-bit values in B to SPFP form and insert them
   as the two lower elements in A.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpi32_ps (__m128 __A, __m64 __B)
{
  return (__m128) __builtin_ia32_cvtpi2ps ((__v4sf) __A, (__v2si)__B);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvt_pi2ps (__m128 __A, __m64 __B)
{
  return _mm_cvtpi32_ps (__A, __B);
}

/* Convert the four signed 16-bit values in A to SPFP form.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpi16_ps (__m64 __A)
{
  __v4hi __sign;
  __v2si __hisi, __losi;
  __v4sf __zero, __ra, __rb;

  /* This comparison against zero gives us a mask that can be used to
     fill in the missing sign bits in the unpack operations below, so
     that we get signed values after unpacking.  */
  __sign = __builtin_ia32_pcmpgtw ((__v4hi)0LL, (__v4hi)__A);

  /* Convert the four words to doublewords.  */
  __losi = (__v2si) __builtin_ia32_punpcklwd ((__v4hi)__A, __sign);
  __hisi = (__v2si) __builtin_ia32_punpckhwd ((__v4hi)__A, __sign);

  /* Convert the doublewords to floating point two at a time.  */
  __zero = (__v4sf) _mm_setzero_ps ();
  __ra = __builtin_ia32_cvtpi2ps (__zero, __losi);
  __rb = __builtin_ia32_cvtpi2ps (__ra, __hisi);

  return (__m128) __builtin_ia32_movlhps (__ra, __rb);
}

/* Convert the four unsigned 16-bit values in A to SPFP form.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpu16_ps (__m64 __A)
{
  __v2si __hisi, __losi;
  __v4sf __zero, __ra, __rb;

  /* Convert the four words to doublewords.  */
  __losi = (__v2si) __builtin_ia32_punpcklwd ((__v4hi)__A, (__v4hi)0LL);
  __hisi = (__v2si) __builtin_ia32_punpckhwd ((__v4hi)__A, (__v4hi)0LL);

  /* Convert the doublewords to floating point two at a time.  */
  __zero = (__v4sf) _mm_setzero_ps ();
  __ra = __builtin_ia32_cvtpi2ps (__zero, __losi);
  __rb = __builtin_ia32_cvtpi2ps (__ra, __hisi);

  return (__m128) __builtin_ia32_movlhps (__ra, __rb);
}

/* Convert the low four signed 8-bit values in A to SPFP form.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpi8_ps (__m64 __A)
{
  __v8qi __sign;

  /* This comparison against zero gives us a mask that can be used to
     fill in the missing sign bits in the unpack operations below, so
     that we get signed values after unpacking.  */
  __sign = __builtin_ia32_pcmpgtb ((__v8qi)0LL, (__v8qi)__A);

  /* Convert the four low bytes to words.  */
  __A = (__m64) __builtin_ia32_punpcklbw ((__v8qi)__A, __sign);

  return _mm_cvtpi16_ps(__A);
}

/* Convert the low four unsigned 8-bit values in A to SPFP form.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpu8_ps(__m64 __A)
{
  __A = (__m64) __builtin_ia32_punpcklbw ((__v8qi)__A, (__v8qi)0LL);
  return _mm_cvtpu16_ps(__A);
}

/* Convert the four signed 32-bit values in A and B to SPFP form.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtpi32x2_ps(__m64 __A, __m64 __B)
{
  __v4sf __zero = (__v4sf) _mm_setzero_ps ();
  __v4sf __sfa = __builtin_ia32_cvtpi2ps (__zero, (__v2si)__A);
  __v4sf __sfb = __builtin_ia32_cvtpi2ps (__sfa, (__v2si)__B);
  return (__m128) __builtin_ia32_movlhps (__sfa, __sfb);
}

/* Convert the four SPFP values in A to four signed 16-bit integers.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtps_pi16(__m128 __A)
{
  __v4sf __hisf = (__v4sf)__A;
  __v4sf __losf = __builtin_ia32_movhlps (__hisf, __hisf);
  __v2si __hisi = __builtin_ia32_cvtps2pi (__hisf);
  __v2si __losi = __builtin_ia32_cvtps2pi (__losf);
  return (__m64) __builtin_ia32_packssdw (__hisi, __losi);
}

/* Convert the four SPFP values in A to four signed 8-bit integers.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtps_pi8(__m128 __A)
{
  __v4hi __tmp = (__v4hi) _mm_cvtps_pi16 (__A);
  return (__m64) __builtin_ia32_packsswb (__tmp, (__v4hi)0LL);
}

/* Selects four specific SPFP values from A and B based on MASK.  */
#ifdef __OPTIMIZE__
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_ps (__m128 __A, __m128 __B, int const __mask)
{
  return (__m128) __builtin_ia32_shufps ((__v4sf)__A, (__v4sf)__B, __mask);
}
#else
#define _mm_shuffle_ps(A, B, MASK)					\
  ((__m128) __builtin_ia32_shufps ((__v4sf)(__m128)(A),			\
				   (__v4sf)(__m128)(B), (int)(MASK)))
#endif
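
/* Usage sketch (illustrative only): combined with _MM_SHUFFLE, a single
   shuffle of a register with itself can broadcast one lane across all
   four, here lane 0.

     __m128 splat0 (__m128 v)
     {
       return _mm_shuffle_ps (v, v, _MM_SHUFFLE (0, 0, 0, 0));
     }
*/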

/* Selects and interleaves the upper two SPFP values from A and B.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_unpckhps ((__v4sf)__A, (__v4sf)__B);
}

/* Selects and interleaves the lower two SPFP values from A and B.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_unpcklps ((__v4sf)__A, (__v4sf)__B);
}
/* Sets the upper two SPFP values with 64 bits of data loaded from P;
   the lower two values are passed through from A.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadh_pi (__m128 __A, __m64 const *__P)
{
  return (__m128) __builtin_ia32_loadhps ((__v4sf)__A, (const __v2sf *)__P);
}

/* Stores the upper two SPFP values of A into P.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storeh_pi (__m64 *__P, __m128 __A)
{
  __builtin_ia32_storehps ((__v2sf *)__P, (__v4sf)__A);
}

/* Moves the upper two values of B into the lower two values of A.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movehl_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_movhlps ((__v4sf)__A, (__v4sf)__B);
}

/* Moves the lower two values of B into the upper two values of A.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movelh_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_movlhps ((__v4sf)__A, (__v4sf)__B);
}
/* Sets the lower two SPFP values with 64 bits of data loaded from P;
   the upper two values are passed through from A.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadl_pi (__m128 __A, __m64 const *__P)
{
  return (__m128) __builtin_ia32_loadlps ((__v4sf)__A, (const __v2sf *)__P);
}

/* Stores the lower two SPFP values of A into P.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storel_pi (__m64 *__P, __m128 __A)
{
  __builtin_ia32_storelps ((__v2sf *)__P, (__v4sf)__A);
}

/* Creates a 4-bit mask from the most significant bits of the SPFP values.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movemask_ps (__m128 __A)
{
  return __builtin_ia32_movmskps ((__v4sf)__A);
}
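
/* Usage sketch (illustrative only): the mask maps the sign bit of lane i
   to bit i of the result, so a quick "any element negative?" test is:

     int any_negative (__m128 v)
     {
       return _mm_movemask_ps (v) != 0;
     }
*/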

/* Return the contents of the control register.  */
extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_getcsr (void)
{
  return __builtin_ia32_stmxcsr ();
}

/* Read exception bits from the control register.  */
extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_MM_GET_EXCEPTION_STATE (void)
{
  return _mm_getcsr() & _MM_EXCEPT_MASK;
}

extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_MM_GET_EXCEPTION_MASK (void)
{
  return _mm_getcsr() & _MM_MASK_MASK;
}

extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_MM_GET_ROUNDING_MODE (void)
{
  return _mm_getcsr() & _MM_ROUND_MASK;
}

extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_MM_GET_FLUSH_ZERO_MODE (void)
{
  return _mm_getcsr() & _MM_FLUSH_ZERO_MASK;
}

/* Set the control register to I.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setcsr (unsigned int __I)
{
  __builtin_ia32_ldmxcsr (__I);
}

/* Set exception bits in the control register.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_MM_SET_EXCEPTION_STATE(unsigned int __mask)
{
  _mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | __mask);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_MM_SET_EXCEPTION_MASK (unsigned int __mask)
{
  _mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | __mask);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_MM_SET_ROUNDING_MODE (unsigned int __mode)
{
  _mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | __mode);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_MM_SET_FLUSH_ZERO_MODE (unsigned int __mode)
{
  _mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | __mode);
}
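
/* Usage sketch (illustrative only): these helpers read-modify-write the
   MXCSR, so a typical pattern saves the old state around a temporary
   change, e.g. enabling flush-to-zero for a denormal-heavy kernel.

     unsigned int saved = _MM_GET_FLUSH_ZERO_MODE ();
     _MM_SET_FLUSH_ZERO_MODE (_MM_FLUSH_ZERO_ON);
     ...                                  run the kernel here
     _MM_SET_FLUSH_ZERO_MODE (saved);
*/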

/* Create a vector with element 0 as F and the rest zero.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_ss (float __F)
{
  return __extension__ (__m128)(__v4sf){ __F, 0.0f, 0.0f, 0.0f };
}

/* Create a vector with all four elements equal to F.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_ps (float __F)
{
  return __extension__ (__m128)(__v4sf){ __F, __F, __F, __F };
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_ps1 (float __F)
{
  return _mm_set1_ps (__F);
}

/* Create a vector with element 0 as *P and the rest zero.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_ss (float const *__P)
{
  return _mm_set_ss (*__P);
}

/* Create a vector with all four elements equal to *P.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load1_ps (float const *__P)
{
  return _mm_set1_ps (*__P);
}

extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_ps1 (float const *__P)
{
  return _mm_load1_ps (__P);
}

/* Load four SPFP values from P.  The address must be 16-byte aligned.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_load_ps (float const *__P)
{
  return *(__m128 *)__P;
}

/* Load four SPFP values from P.  The address need not be 16-byte aligned.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadu_ps (float const *__P)
{
  return *(__m128_u *)__P;
}

/* Load four SPFP values in reverse order.  The address must be aligned.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loadr_ps (float const *__P)
{
  __v4sf __tmp = *(__v4sf *)__P;
  return (__m128) __builtin_ia32_shufps (__tmp, __tmp, _MM_SHUFFLE (0,1,2,3));
}

/* Create the vector [Z Y X W].  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_ps (const float __Z, const float __Y, const float __X, const float __W)
{
  return __extension__ (__m128)(__v4sf){ __W, __X, __Y, __Z };
}

/* Create the vector [W X Y Z].  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_ps (float __Z, float __Y, float __X, float __W)
{
  return __extension__ (__m128)(__v4sf){ __Z, __Y, __X, __W };
}
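
/* Note: _mm_set_ps takes its arguments from the highest element down,
   while _mm_setr_ps takes them in memory order.  For example,
   _mm_set_ps (3.f, 2.f, 1.f, 0.f) and _mm_setr_ps (0.f, 1.f, 2.f, 3.f)
   build the same vector, with 0.f in element 0.  */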

/* Stores the lower SPFP value.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_ss (float *__P, __m128 __A)
{
  *__P = ((__v4sf)__A)[0];
}

extern __inline float __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtss_f32 (__m128 __A)
{
  return ((__v4sf)__A)[0];
}

/* Store four SPFP values.  The address must be 16-byte aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_ps (float *__P, __m128 __A)
{
  *(__m128 *)__P = __A;
}

/* Store four SPFP values.  The address need not be 16-byte aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storeu_ps (float *__P, __m128 __A)
{
  *(__m128_u *)__P = __A;
}

/* Store the lower SPFP value across four words.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store1_ps (float *__P, __m128 __A)
{
  __v4sf __va = (__v4sf)__A;
  __v4sf __tmp = __builtin_ia32_shufps (__va, __va, _MM_SHUFFLE (0,0,0,0));
  _mm_storeu_ps (__P, __tmp);
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_ps1 (float *__P, __m128 __A)
{
  _mm_store1_ps (__P, __A);
}

/* Store four SPFP values in reverse order.  The address must be aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storer_ps (float *__P, __m128 __A)
{
  __v4sf __va = (__v4sf)__A;
  __v4sf __tmp = __builtin_ia32_shufps (__va, __va, _MM_SHUFFLE (0,1,2,3));
  _mm_store_ps (__P, __tmp);
}

/* Sets the low SPFP value of A from the low value of B.  */
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_move_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_shuffle ((__v4sf)__A, (__v4sf)__B,
                                     __extension__
                                     (__attribute__((__vector_size__ (16))) int)
                                     {4,1,2,3});
}

/* Extracts one of the four words of A.  The selector N must be immediate.  */
#ifdef __OPTIMIZE__
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_extract_pi16 (__m64 const __A, int const __N)
{
  return (unsigned short) __builtin_ia32_vec_ext_v4hi ((__v4hi)__A, __N);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pextrw (__m64 const __A, int const __N)
{
  return _mm_extract_pi16 (__A, __N);
}
#else
#define _mm_extract_pi16(A, N)	\
  ((int) (unsigned short) __builtin_ia32_vec_ext_v4hi ((__v4hi)(__m64)(A), (int)(N)))

#define _m_pextrw(A, N) _mm_extract_pi16(A, N)
#endif

/* Inserts word D into one of four words of A.  The selector N must be
   immediate.  */
#ifdef __OPTIMIZE__
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_insert_pi16 (__m64 const __A, int const __D, int const __N)
{
  return (__m64) __builtin_ia32_vec_set_v4hi ((__v4hi)__A, __D, __N);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pinsrw (__m64 const __A, int const __D, int const __N)
{
  return _mm_insert_pi16 (__A, __D, __N);
}
#else
#define _mm_insert_pi16(A, D, N)				\
  ((__m64) __builtin_ia32_vec_set_v4hi ((__v4hi)(__m64)(A),	\
					(int)(D), (int)(N)))

#define _m_pinsrw(A, D, N) _mm_insert_pi16(A, D, N)
#endif

/* Compute the element-wise maximum of signed 16-bit values.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_pi16 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pmaxsw ((__v4hi)__A, (__v4hi)__B);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmaxsw (__m64 __A, __m64 __B)
{
  return _mm_max_pi16 (__A, __B);
}

/* Compute the element-wise maximum of unsigned 8-bit values.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_pu8 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pmaxub ((__v8qi)__A, (__v8qi)__B);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmaxub (__m64 __A, __m64 __B)
{
  return _mm_max_pu8 (__A, __B);
}

/* Compute the element-wise minimum of signed 16-bit values.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_pi16 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pminsw ((__v4hi)__A, (__v4hi)__B);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pminsw (__m64 __A, __m64 __B)
{
  return _mm_min_pi16 (__A, __B);
}

/* Compute the element-wise minimum of unsigned 8-bit values.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_pu8 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pminub ((__v8qi)__A, (__v8qi)__B);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pminub (__m64 __A, __m64 __B)
{
  return _mm_min_pu8 (__A, __B);
}

/* Create an 8-bit mask of the signs of 8-bit values.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movemask_pi8 (__m64 __A)
{
  return __builtin_ia32_pmovmskb ((__v8qi)__A);
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmovmskb (__m64 __A)
{
  return _mm_movemask_pi8 (__A);
}

/* Multiply four unsigned 16-bit values in A by four unsigned 16-bit values
   in B and produce the high 16 bits of the 32-bit results.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhi_pu16 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pmulhuw ((__v4hi)__A, (__v4hi)__B);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmulhuw (__m64 __A, __m64 __B)
{
  return _mm_mulhi_pu16 (__A, __B);
}

/* Return a combination of the four 16-bit values in A.  The selector
   must be an immediate.  */
#ifdef __OPTIMIZE__
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_pi16 (__m64 __A, int const __N)
{
  return (__m64) __builtin_ia32_pshufw ((__v4hi)__A, __N);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pshufw (__m64 __A, int const __N)
{
  return _mm_shuffle_pi16 (__A, __N);
}
#else
#define _mm_shuffle_pi16(A, N) \
  ((__m64) __builtin_ia32_pshufw ((__v4hi)(__m64)(A), (int)(N)))

#define _m_pshufw(A, N) _mm_shuffle_pi16 (A, N)
#endif

/* Conditionally store byte elements of A into P.  The high bit of each
   byte in the selector N determines whether the corresponding byte from
   A is stored.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskmove_si64 (__m64 __A, __m64 __N, char *__P)
{
#ifdef __MMX_WITH_SSE__
  /* Emulate MMX maskmovq with SSE2 maskmovdqu and handle unmapped bits
     64:127 at address __P.  */
  typedef long long __v2di __attribute__ ((__vector_size__ (16)));
  typedef char __v16qi __attribute__ ((__vector_size__ (16)));
  /* Zero-extend __A and __N to 128 bits.  */
  __v2di __A128 = __extension__ (__v2di) { ((__v1di) __A)[0], 0 };
  __v2di __N128 = __extension__ (__v2di) { ((__v1di) __N)[0], 0 };

  /* Check the alignment of __P.  */
  __SIZE_TYPE__ offset = ((__SIZE_TYPE__) __P) & 0xf;
  if (offset)
    {
      /* If the misalignment of __P is greater than 8, move __P back
	 by 8 bytes.  Otherwise, move __P back by the misalignment.  */
      if (offset > 8)
	offset = 8;
      __P = (char *) (((__SIZE_TYPE__) __P) - offset);

      /* Shift __A128 and __N128 to the left by the adjustment.  */
      switch (offset)
	{
	case 1:
	  __A128 = __builtin_ia32_pslldqi128 (__A128, 8);
	  __N128 = __builtin_ia32_pslldqi128 (__N128, 8);
	  break;
	case 2:
	  __A128 = __builtin_ia32_pslldqi128 (__A128, 2 * 8);
	  __N128 = __builtin_ia32_pslldqi128 (__N128, 2 * 8);
	  break;
	case 3:
	  __A128 = __builtin_ia32_pslldqi128 (__A128, 3 * 8);
	  __N128 = __builtin_ia32_pslldqi128 (__N128, 3 * 8);
	  break;
	case 4:
	  __A128 = __builtin_ia32_pslldqi128 (__A128, 4 * 8);
	  __N128 = __builtin_ia32_pslldqi128 (__N128, 4 * 8);
	  break;
	case 5:
	  __A128 = __builtin_ia32_pslldqi128 (__A128, 5 * 8);
	  __N128 = __builtin_ia32_pslldqi128 (__N128, 5 * 8);
	  break;
	case 6:
	  __A128 = __builtin_ia32_pslldqi128 (__A128, 6 * 8);
	  __N128 = __builtin_ia32_pslldqi128 (__N128, 6 * 8);
	  break;
	case 7:
	  __A128 = __builtin_ia32_pslldqi128 (__A128, 7 * 8);
	  __N128 = __builtin_ia32_pslldqi128 (__N128, 7 * 8);
	  break;
	case 8:
	  __A128 = __builtin_ia32_pslldqi128 (__A128, 8 * 8);
	  __N128 = __builtin_ia32_pslldqi128 (__N128, 8 * 8);
	  break;
	default:
	  break;
	}
    }
  __builtin_ia32_maskmovdqu ((__v16qi)__A128, (__v16qi)__N128, __P);
#else
  __builtin_ia32_maskmovq ((__v8qi)__A, (__v8qi)__N, __P);
#endif
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_maskmovq (__m64 __A, __m64 __N, char *__P)
{
  _mm_maskmove_si64 (__A, __N, __P);
}
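
/* Usage sketch (illustrative only, buf is a hypothetical destination):
   store only the bytes of a register whose selector byte has its high
   bit set, leaving the other destination bytes untouched.  Here a
   comparison supplies the selector.

     __m64 mask = _mm_cmpgt_pi8 (data, _mm_setzero_si64 ());
     _mm_maskmove_si64 (data, mask, buf);
*/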

/* Compute the rounded averages of the unsigned 8-bit values in A and B.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_avg_pu8 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pavgb ((__v8qi)__A, (__v8qi)__B);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pavgb (__m64 __A, __m64 __B)
{
  return _mm_avg_pu8 (__A, __B);
}

/* Compute the rounded averages of the unsigned 16-bit values in A and B.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_avg_pu16 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pavgw ((__v4hi)__A, (__v4hi)__B);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pavgw (__m64 __A, __m64 __B)
{
  return _mm_avg_pu16 (__A, __B);
}

/* Compute the sum of the absolute differences of the unsigned 8-bit
   values in A and B.  Return the value in the lower 16-bit word; the
   upper words are cleared.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sad_pu8 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_psadbw ((__v8qi)__A, (__v8qi)__B);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psadbw (__m64 __A, __m64 __B)
{
  return _mm_sad_pu8 (__A, __B);
}

/* Stores the data in A to the address P without polluting the caches.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_pi (__m64 *__P, __m64 __A)
{
  __builtin_ia32_movntq ((unsigned long long *)__P, (unsigned long long)__A);
}

/* Likewise.  The address must be 16-byte aligned.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_ps (float *__P, __m128 __A)
{
  __builtin_ia32_movntps (__P, (__v4sf)__A);
}

/* Guarantees that every preceding store is globally visible before
   any subsequent store.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sfence (void)
{
  __builtin_ia32_sfence ();
}
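
/* Usage sketch (illustrative only): non-temporal stores bypass the cache,
   so a producer typically fences once after the loop before signaling a
   consumer that the data is ready.  dst is assumed 16-byte aligned.

     for (size_t i = 0; i < n; i += 4)
       _mm_stream_ps (&dst[i], _mm_load_ps (&src[i]));
     _mm_sfence ();
*/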

/* Transpose the 4x4 matrix composed of row[0-3].  */
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3)			\
do {									\
  __v4sf __r0 = (row0), __r1 = (row1), __r2 = (row2), __r3 = (row3);	\
  __v4sf __t0 = __builtin_ia32_unpcklps (__r0, __r1);			\
  __v4sf __t1 = __builtin_ia32_unpcklps (__r2, __r3);			\
  __v4sf __t2 = __builtin_ia32_unpckhps (__r0, __r1);			\
  __v4sf __t3 = __builtin_ia32_unpckhps (__r2, __r3);			\
  (row0) = __builtin_ia32_movlhps (__t0, __t1);				\
  (row1) = __builtin_ia32_movhlps (__t1, __t0);				\
  (row2) = __builtin_ia32_movlhps (__t2, __t3);				\
  (row3) = __builtin_ia32_movhlps (__t3, __t2);				\
} while (0)
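
/* Usage sketch (illustrative only, m is a hypothetical float[16]): the
   macro transposes in place, so afterwards r0 holds the first column of
   the original matrix.

     __m128 r0 = _mm_loadu_ps (m + 0), r1 = _mm_loadu_ps (m + 4),
	    r2 = _mm_loadu_ps (m + 8), r3 = _mm_loadu_ps (m + 12);
     _MM_TRANSPOSE4_PS (r0, r1, r2, r3);
*/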

/* For backward source compatibility.  */
# include <emmintrin.h>

#ifdef __DISABLE_SSE__
#undef __DISABLE_SSE__
#pragma GCC pop_options
#endif /* __DISABLE_SSE__ */

/* The execution of the next instruction is delayed by an implementation
   specific amount of time.  The instruction does not modify the
   architectural state.  This is after the pop_options pragma because
   it does not require SSE support in the processor--the encoding is a
   nop on processors that do not support it.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_pause (void)
{
  __builtin_ia32_pause ();
}

#endif /* _XMMINTRIN_H_INCLUDED */