1  /* Copyright (C) 2002-2023 Free Software Foundation, Inc.
       2  
       3     This file is part of GCC.
       4  
       5     GCC is free software; you can redistribute it and/or modify
       6     it under the terms of the GNU General Public License as published by
       7     the Free Software Foundation; either version 3, or (at your option)
       8     any later version.
       9  
      10     GCC is distributed in the hope that it will be useful,
      11     but WITHOUT ANY WARRANTY; without even the implied warranty of
      12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      13     GNU General Public License for more details.
      14  
      15     Under Section 7 of GPL version 3, you are granted additional
      16     permissions described in the GCC Runtime Library Exception, version
      17     3.1, as published by the Free Software Foundation.
      18  
      19     You should have received a copy of the GNU General Public License and
      20     a copy of the GCC Runtime Library Exception along with this program;
      21     see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
      22     <http://www.gnu.org/licenses/>.  */
      23  
      24  /* Implemented from the specification included in the Intel C++ Compiler
      25     User Guide and Reference, version 9.0.  */
      26  
      27  #ifndef _MMINTRIN_H_INCLUDED
      28  #define _MMINTRIN_H_INCLUDED
      29  
      30  #if defined __x86_64__ && !defined __SSE__ || !defined __MMX__
      31  #pragma GCC push_options
      32  #ifdef __MMX_WITH_SSE__
      33  #pragma GCC target("sse2")
      34  #elif defined __x86_64__
      35  #pragma GCC target("sse,mmx")
      36  #else
      37  #pragma GCC target("mmx")
      38  #endif
      39  #define __DISABLE_MMX__
      40  #endif /* __MMX__ */
      41  
      42  /* The Intel API is flexible enough that we must allow aliasing with other
      43     vector types, and their scalar components.  */
      44  typedef int __m64 __attribute__ ((__vector_size__ (8), __may_alias__));
      45  typedef int __m32 __attribute__ ((__vector_size__ (4), __may_alias__));
      46  typedef short __m16 __attribute__ ((__vector_size__ (2), __may_alias__));
      47  
      48  /* Unaligned version of the same type  */
      49  typedef int __m64_u __attribute__ ((__vector_size__ (8), __may_alias__, __aligned__ (1)));
      50  typedef int __m32_u __attribute__ ((__vector_size__ (4), \
      51  				    __may_alias__, __aligned__ (1)));
      52  typedef short __m16_u __attribute__ ((__vector_size__ (2), \
      53  				      __may_alias__, __aligned__ (1)));
      54  
      55  /* Internal data types for implementing the intrinsics.  */
      56  typedef int __v2si __attribute__ ((__vector_size__ (8)));
      57  typedef short __v4hi __attribute__ ((__vector_size__ (8)));
      58  typedef char __v8qi __attribute__ ((__vector_size__ (8)));
      59  typedef long long __v1di __attribute__ ((__vector_size__ (8)));
      60  typedef float __v2sf __attribute__ ((__vector_size__ (8)));
      61  
      62  /* Empty the multimedia state.  */
      63  extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
      64  _mm_empty (void)
      65  {
      66    __builtin_ia32_emms ();
      67  }
      68  
      69  extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
      70  _m_empty (void)
      71  {
      72    _mm_empty ();
      73  }
      74  
      75  /* Convert I to a __m64 object.  The integer is zero-extended to 64-bits.  */
      76  extern __inline __m64  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
      77  _mm_cvtsi32_si64 (int __i)
      78  {
      79    return (__m64) __builtin_ia32_vec_init_v2si (__i, 0);
      80  }
      81  
      82  extern __inline __m64  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
      83  _m_from_int (int __i)
      84  {
      85    return _mm_cvtsi32_si64 (__i);
      86  }
      87  
      88  #ifdef __x86_64__
      89  /* Convert I to a __m64 object.  */
      90  
      91  /* Intel intrinsic.  */
      92  extern __inline __m64  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
      93  _m_from_int64 (long long __i)
      94  {
      95    return (__m64) __i;
      96  }
      97  
      98  extern __inline __m64  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
      99  _mm_cvtsi64_m64 (long long __i)
     100  {
     101    return (__m64) __i;
     102  }
     103  
     104  /* Microsoft intrinsic.  */
     105  extern __inline __m64  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     106  _mm_cvtsi64x_si64 (long long __i)
     107  {
     108    return (__m64) __i;
     109  }
     110  
     111  extern __inline __m64  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     112  _mm_set_pi64x (long long __i)
     113  {
     114    return (__m64) __i;
     115  }
     116  #endif
     117  
     118  /* Convert the lower 32 bits of the __m64 object into an integer.  */
     119  extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     120  _mm_cvtsi64_si32 (__m64 __i)
     121  {
     122    return __builtin_ia32_vec_ext_v2si ((__v2si)__i, 0);
     123  }
     124  
     125  extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     126  _m_to_int (__m64 __i)
     127  {
     128    return _mm_cvtsi64_si32 (__i);
     129  }
     130  
     131  #ifdef __x86_64__
     132  /* Convert the __m64 object to a 64bit integer.  */
     133  
     134  /* Intel intrinsic.  */
     135  extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     136  _m_to_int64 (__m64 __i)
     137  {
     138    return (long long)__i;
     139  }
     140  
     141  extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     142  _mm_cvtm64_si64 (__m64 __i)
     143  {
     144    return (long long)__i;
     145  }
     146  
     147  /* Microsoft intrinsic.  */
     148  extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     149  _mm_cvtsi64_si64x (__m64 __i)
     150  {
     151    return (long long)__i;
     152  }
     153  #endif
     154  
     155  /* Pack the four 16-bit values from M1 into the lower four 8-bit values of
     156     the result, and the four 16-bit values from M2 into the upper four 8-bit
     157     values of the result, all with signed saturation.  */
     158  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     159  _mm_packs_pi16 (__m64 __m1, __m64 __m2)
     160  {
     161    return (__m64) __builtin_ia32_packsswb ((__v4hi)__m1, (__v4hi)__m2);
     162  }
     163  
     164  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     165  _m_packsswb (__m64 __m1, __m64 __m2)
     166  {
     167    return _mm_packs_pi16 (__m1, __m2);
     168  }
     169  
     170  /* Pack the two 32-bit values from M1 in to the lower two 16-bit values of
     171     the result, and the two 32-bit values from M2 into the upper two 16-bit
     172     values of the result, all with signed saturation.  */
     173  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     174  _mm_packs_pi32 (__m64 __m1, __m64 __m2)
     175  {
     176    return (__m64) __builtin_ia32_packssdw ((__v2si)__m1, (__v2si)__m2);
     177  }
     178  
     179  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     180  _m_packssdw (__m64 __m1, __m64 __m2)
     181  {
     182    return _mm_packs_pi32 (__m1, __m2);
     183  }
     184  
     185  /* Pack the four 16-bit values from M1 into the lower four 8-bit values of
     186     the result, and the four 16-bit values from M2 into the upper four 8-bit
     187     values of the result, all with unsigned saturation.  */
     188  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     189  _mm_packs_pu16 (__m64 __m1, __m64 __m2)
     190  {
     191    return (__m64) __builtin_ia32_packuswb ((__v4hi)__m1, (__v4hi)__m2);
     192  }
     193  
     194  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     195  _m_packuswb (__m64 __m1, __m64 __m2)
     196  {
     197    return _mm_packs_pu16 (__m1, __m2);
     198  }
     199  
     200  /* Interleave the four 8-bit values from the high half of M1 with the four
     201     8-bit values from the high half of M2.  */
     202  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     203  _mm_unpackhi_pi8 (__m64 __m1, __m64 __m2)
     204  {
     205    return (__m64) __builtin_ia32_punpckhbw ((__v8qi)__m1, (__v8qi)__m2);
     206  }
     207  
     208  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     209  _m_punpckhbw (__m64 __m1, __m64 __m2)
     210  {
     211    return _mm_unpackhi_pi8 (__m1, __m2);
     212  }
     213  
     214  /* Interleave the two 16-bit values from the high half of M1 with the two
     215     16-bit values from the high half of M2.  */
     216  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     217  _mm_unpackhi_pi16 (__m64 __m1, __m64 __m2)
     218  {
     219    return (__m64) __builtin_ia32_punpckhwd ((__v4hi)__m1, (__v4hi)__m2);
     220  }
     221  
     222  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     223  _m_punpckhwd (__m64 __m1, __m64 __m2)
     224  {
     225    return _mm_unpackhi_pi16 (__m1, __m2);
     226  }
     227  
     228  /* Interleave the 32-bit value from the high half of M1 with the 32-bit
     229     value from the high half of M2.  */
     230  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     231  _mm_unpackhi_pi32 (__m64 __m1, __m64 __m2)
     232  {
     233    return (__m64) __builtin_ia32_punpckhdq ((__v2si)__m1, (__v2si)__m2);
     234  }
     235  
     236  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     237  _m_punpckhdq (__m64 __m1, __m64 __m2)
     238  {
     239    return _mm_unpackhi_pi32 (__m1, __m2);
     240  }
     241  
     242  /* Interleave the four 8-bit values from the low half of M1 with the four
     243     8-bit values from the low half of M2.  */
     244  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     245  _mm_unpacklo_pi8 (__m64 __m1, __m64 __m2)
     246  {
     247    return (__m64) __builtin_ia32_punpcklbw ((__v8qi)__m1, (__v8qi)__m2);
     248  }
     249  
     250  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     251  _m_punpcklbw (__m64 __m1, __m64 __m2)
     252  {
     253    return _mm_unpacklo_pi8 (__m1, __m2);
     254  }
     255  
     256  /* Interleave the two 16-bit values from the low half of M1 with the two
     257     16-bit values from the low half of M2.  */
     258  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     259  _mm_unpacklo_pi16 (__m64 __m1, __m64 __m2)
     260  {
     261    return (__m64) __builtin_ia32_punpcklwd ((__v4hi)__m1, (__v4hi)__m2);
     262  }
     263  
     264  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     265  _m_punpcklwd (__m64 __m1, __m64 __m2)
     266  {
     267    return _mm_unpacklo_pi16 (__m1, __m2);
     268  }
     269  
     270  /* Interleave the 32-bit value from the low half of M1 with the 32-bit
     271     value from the low half of M2.  */
     272  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     273  _mm_unpacklo_pi32 (__m64 __m1, __m64 __m2)
     274  {
     275    return (__m64) __builtin_ia32_punpckldq ((__v2si)__m1, (__v2si)__m2);
     276  }
     277  
     278  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     279  _m_punpckldq (__m64 __m1, __m64 __m2)
     280  {
     281    return _mm_unpacklo_pi32 (__m1, __m2);
     282  }
     283  
     284  /* Add the 8-bit values in M1 to the 8-bit values in M2.  */
     285  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     286  _mm_add_pi8 (__m64 __m1, __m64 __m2)
     287  {
     288    return (__m64) __builtin_ia32_paddb ((__v8qi)__m1, (__v8qi)__m2);
     289  }
     290  
     291  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     292  _m_paddb (__m64 __m1, __m64 __m2)
     293  {
     294    return _mm_add_pi8 (__m1, __m2);
     295  }
     296  
     297  /* Add the 16-bit values in M1 to the 16-bit values in M2.  */
     298  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     299  _mm_add_pi16 (__m64 __m1, __m64 __m2)
     300  {
     301    return (__m64) __builtin_ia32_paddw ((__v4hi)__m1, (__v4hi)__m2);
     302  }
     303  
     304  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     305  _m_paddw (__m64 __m1, __m64 __m2)
     306  {
     307    return _mm_add_pi16 (__m1, __m2);
     308  }
     309  
     310  /* Add the 32-bit values in M1 to the 32-bit values in M2.  */
     311  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     312  _mm_add_pi32 (__m64 __m1, __m64 __m2)
     313  {
     314    return (__m64) __builtin_ia32_paddd ((__v2si)__m1, (__v2si)__m2);
     315  }
     316  
     317  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     318  _m_paddd (__m64 __m1, __m64 __m2)
     319  {
     320    return _mm_add_pi32 (__m1, __m2);
     321  }
     322  
     323  /* Add the 64-bit values in M1 to the 64-bit values in M2.  */
     324  #ifndef __SSE2__
     325  #pragma GCC push_options
     326  #ifdef __MMX_WITH_SSE__
     327  #pragma GCC target("sse2")
     328  #else
     329  #pragma GCC target("sse2,mmx")
     330  #endif
     331  #define __DISABLE_SSE2__
     332  #endif /* __SSE2__ */
     333  
     334  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     335  _mm_add_si64 (__m64 __m1, __m64 __m2)
     336  {
     337    return (__m64) __builtin_ia32_paddq ((__v1di)__m1, (__v1di)__m2);
     338  }
     339  #ifdef __DISABLE_SSE2__
     340  #undef __DISABLE_SSE2__
     341  #pragma GCC pop_options
     342  #endif /* __DISABLE_SSE2__ */
     343  
     344  /* Add the 8-bit values in M1 to the 8-bit values in M2 using signed
     345     saturated arithmetic.  */
     346  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     347  _mm_adds_pi8 (__m64 __m1, __m64 __m2)
     348  {
     349    return (__m64) __builtin_ia32_paddsb ((__v8qi)__m1, (__v8qi)__m2);
     350  }
     351  
     352  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     353  _m_paddsb (__m64 __m1, __m64 __m2)
     354  {
     355    return _mm_adds_pi8 (__m1, __m2);
     356  }
     357  
     358  /* Add the 16-bit values in M1 to the 16-bit values in M2 using signed
     359     saturated arithmetic.  */
     360  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     361  _mm_adds_pi16 (__m64 __m1, __m64 __m2)
     362  {
     363    return (__m64) __builtin_ia32_paddsw ((__v4hi)__m1, (__v4hi)__m2);
     364  }
     365  
     366  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     367  _m_paddsw (__m64 __m1, __m64 __m2)
     368  {
     369    return _mm_adds_pi16 (__m1, __m2);
     370  }
     371  
     372  /* Add the 8-bit values in M1 to the 8-bit values in M2 using unsigned
     373     saturated arithmetic.  */
     374  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     375  _mm_adds_pu8 (__m64 __m1, __m64 __m2)
     376  {
     377    return (__m64) __builtin_ia32_paddusb ((__v8qi)__m1, (__v8qi)__m2);
     378  }
     379  
     380  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     381  _m_paddusb (__m64 __m1, __m64 __m2)
     382  {
     383    return _mm_adds_pu8 (__m1, __m2);
     384  }
     385  
     386  /* Add the 16-bit values in M1 to the 16-bit values in M2 using unsigned
     387     saturated arithmetic.  */
     388  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     389  _mm_adds_pu16 (__m64 __m1, __m64 __m2)
     390  {
     391    return (__m64) __builtin_ia32_paddusw ((__v4hi)__m1, (__v4hi)__m2);
     392  }
     393  
     394  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     395  _m_paddusw (__m64 __m1, __m64 __m2)
     396  {
     397    return _mm_adds_pu16 (__m1, __m2);
     398  }
     399  
     400  /* Subtract the 8-bit values in M2 from the 8-bit values in M1.  */
     401  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     402  _mm_sub_pi8 (__m64 __m1, __m64 __m2)
     403  {
     404    return (__m64) __builtin_ia32_psubb ((__v8qi)__m1, (__v8qi)__m2);
     405  }
     406  
     407  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     408  _m_psubb (__m64 __m1, __m64 __m2)
     409  {
     410    return _mm_sub_pi8 (__m1, __m2);
     411  }
     412  
     413  /* Subtract the 16-bit values in M2 from the 16-bit values in M1.  */
     414  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     415  _mm_sub_pi16 (__m64 __m1, __m64 __m2)
     416  {
     417    return (__m64) __builtin_ia32_psubw ((__v4hi)__m1, (__v4hi)__m2);
     418  }
     419  
     420  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     421  _m_psubw (__m64 __m1, __m64 __m2)
     422  {
     423    return _mm_sub_pi16 (__m1, __m2);
     424  }
     425  
     426  /* Subtract the 32-bit values in M2 from the 32-bit values in M1.  */
     427  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     428  _mm_sub_pi32 (__m64 __m1, __m64 __m2)
     429  {
     430    return (__m64) __builtin_ia32_psubd ((__v2si)__m1, (__v2si)__m2);
     431  }
     432  
     433  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     434  _m_psubd (__m64 __m1, __m64 __m2)
     435  {
     436    return _mm_sub_pi32 (__m1, __m2);
     437  }
     438  
     439  /* Add the 64-bit values in M1 to the 64-bit values in M2.  */
     440  #ifndef __SSE2__
     441  #pragma GCC push_options
     442  #ifdef __MMX_WITH_SSE__
     443  #pragma GCC target("sse2")
     444  #else
     445  #pragma GCC target("sse2,mmx")
     446  #endif
     447  #define __DISABLE_SSE2__
     448  #endif /* __SSE2__ */
     449  
     450  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     451  _mm_sub_si64 (__m64 __m1, __m64 __m2)
     452  {
     453    return (__m64) __builtin_ia32_psubq ((__v1di)__m1, (__v1di)__m2);
     454  }
     455  #ifdef __DISABLE_SSE2__
     456  #undef __DISABLE_SSE2__
     457  #pragma GCC pop_options
     458  #endif /* __DISABLE_SSE2__ */
     459  
     460  /* Subtract the 8-bit values in M2 from the 8-bit values in M1 using signed
     461     saturating arithmetic.  */
     462  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     463  _mm_subs_pi8 (__m64 __m1, __m64 __m2)
     464  {
     465    return (__m64) __builtin_ia32_psubsb ((__v8qi)__m1, (__v8qi)__m2);
     466  }
     467  
     468  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     469  _m_psubsb (__m64 __m1, __m64 __m2)
     470  {
     471    return _mm_subs_pi8 (__m1, __m2);
     472  }
     473  
     474  /* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
     475     signed saturating arithmetic.  */
     476  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     477  _mm_subs_pi16 (__m64 __m1, __m64 __m2)
     478  {
     479    return (__m64) __builtin_ia32_psubsw ((__v4hi)__m1, (__v4hi)__m2);
     480  }
     481  
     482  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     483  _m_psubsw (__m64 __m1, __m64 __m2)
     484  {
     485    return _mm_subs_pi16 (__m1, __m2);
     486  }
     487  
     488  /* Subtract the 8-bit values in M2 from the 8-bit values in M1 using
     489     unsigned saturating arithmetic.  */
     490  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     491  _mm_subs_pu8 (__m64 __m1, __m64 __m2)
     492  {
     493    return (__m64) __builtin_ia32_psubusb ((__v8qi)__m1, (__v8qi)__m2);
     494  }
     495  
     496  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     497  _m_psubusb (__m64 __m1, __m64 __m2)
     498  {
     499    return _mm_subs_pu8 (__m1, __m2);
     500  }
     501  
     502  /* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
     503     unsigned saturating arithmetic.  */
     504  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     505  _mm_subs_pu16 (__m64 __m1, __m64 __m2)
     506  {
     507    return (__m64) __builtin_ia32_psubusw ((__v4hi)__m1, (__v4hi)__m2);
     508  }
     509  
     510  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     511  _m_psubusw (__m64 __m1, __m64 __m2)
     512  {
     513    return _mm_subs_pu16 (__m1, __m2);
     514  }
     515  
     516  /* Multiply four 16-bit values in M1 by four 16-bit values in M2 producing
     517     four 32-bit intermediate results, which are then summed by pairs to
     518     produce two 32-bit results.  */
     519  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     520  _mm_madd_pi16 (__m64 __m1, __m64 __m2)
     521  {
     522    return (__m64) __builtin_ia32_pmaddwd ((__v4hi)__m1, (__v4hi)__m2);
     523  }
     524  
     525  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     526  _m_pmaddwd (__m64 __m1, __m64 __m2)
     527  {
     528    return _mm_madd_pi16 (__m1, __m2);
     529  }
     530  
     531  /* Multiply four signed 16-bit values in M1 by four signed 16-bit values in
     532     M2 and produce the high 16 bits of the 32-bit results.  */
     533  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     534  _mm_mulhi_pi16 (__m64 __m1, __m64 __m2)
     535  {
     536    return (__m64) __builtin_ia32_pmulhw ((__v4hi)__m1, (__v4hi)__m2);
     537  }
     538  
     539  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     540  _m_pmulhw (__m64 __m1, __m64 __m2)
     541  {
     542    return _mm_mulhi_pi16 (__m1, __m2);
     543  }
     544  
     545  /* Multiply four 16-bit values in M1 by four 16-bit values in M2 and produce
     546     the low 16 bits of the results.  */
     547  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     548  _mm_mullo_pi16 (__m64 __m1, __m64 __m2)
     549  {
     550    return (__m64) __builtin_ia32_pmullw ((__v4hi)__m1, (__v4hi)__m2);
     551  }
     552  
     553  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     554  _m_pmullw (__m64 __m1, __m64 __m2)
     555  {
     556    return _mm_mullo_pi16 (__m1, __m2);
     557  }
     558  
     559  /* Shift four 16-bit values in M left by COUNT.  */
     560  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     561  _mm_sll_pi16 (__m64 __m, __m64 __count)
     562  {
     563    return (__m64) __builtin_ia32_psllw ((__v4hi)__m, (__v4hi)__count);
     564  }
     565  
     566  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     567  _m_psllw (__m64 __m, __m64 __count)
     568  {
     569    return _mm_sll_pi16 (__m, __count);
     570  }
     571  
     572  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     573  _mm_slli_pi16 (__m64 __m, int __count)
     574  {
     575    return (__m64) __builtin_ia32_psllwi ((__v4hi)__m, __count);
     576  }
     577  
     578  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     579  _m_psllwi (__m64 __m, int __count)
     580  {
     581    return _mm_slli_pi16 (__m, __count);
     582  }
     583  
     584  /* Shift two 32-bit values in M left by COUNT.  */
     585  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     586  _mm_sll_pi32 (__m64 __m, __m64 __count)
     587  {
     588    return (__m64) __builtin_ia32_pslld ((__v2si)__m, (__v2si)__count);
     589  }
     590  
     591  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     592  _m_pslld (__m64 __m, __m64 __count)
     593  {
     594    return _mm_sll_pi32 (__m, __count);
     595  }
     596  
     597  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     598  _mm_slli_pi32 (__m64 __m, int __count)
     599  {
     600    return (__m64) __builtin_ia32_pslldi ((__v2si)__m, __count);
     601  }
     602  
     603  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     604  _m_pslldi (__m64 __m, int __count)
     605  {
     606    return _mm_slli_pi32 (__m, __count);
     607  }
     608  
     609  /* Shift the 64-bit value in M left by COUNT.  */
     610  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     611  _mm_sll_si64 (__m64 __m, __m64 __count)
     612  {
     613    return (__m64) __builtin_ia32_psllq ((__v1di)__m, (__v1di)__count);
     614  }
     615  
     616  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     617  _m_psllq (__m64 __m, __m64 __count)
     618  {
     619    return _mm_sll_si64 (__m, __count);
     620  }
     621  
     622  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     623  _mm_slli_si64 (__m64 __m, int __count)
     624  {
     625    return (__m64) __builtin_ia32_psllqi ((__v1di)__m, __count);
     626  }
     627  
     628  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     629  _m_psllqi (__m64 __m, int __count)
     630  {
     631    return _mm_slli_si64 (__m, __count);
     632  }
     633  
     634  /* Shift four 16-bit values in M right by COUNT; shift in the sign bit.  */
     635  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     636  _mm_sra_pi16 (__m64 __m, __m64 __count)
     637  {
     638    return (__m64) __builtin_ia32_psraw ((__v4hi)__m, (__v4hi)__count);
     639  }
     640  
     641  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     642  _m_psraw (__m64 __m, __m64 __count)
     643  {
     644    return _mm_sra_pi16 (__m, __count);
     645  }
     646  
     647  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     648  _mm_srai_pi16 (__m64 __m, int __count)
     649  {
     650    return (__m64) __builtin_ia32_psrawi ((__v4hi)__m, __count);
     651  }
     652  
     653  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     654  _m_psrawi (__m64 __m, int __count)
     655  {
     656    return _mm_srai_pi16 (__m, __count);
     657  }
     658  
     659  /* Shift two 32-bit values in M right by COUNT; shift in the sign bit.  */
     660  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     661  _mm_sra_pi32 (__m64 __m, __m64 __count)
     662  {
     663    return (__m64) __builtin_ia32_psrad ((__v2si)__m, (__v2si)__count);
     664  }
     665  
     666  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     667  _m_psrad (__m64 __m, __m64 __count)
     668  {
     669    return _mm_sra_pi32 (__m, __count);
     670  }
     671  
     672  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     673  _mm_srai_pi32 (__m64 __m, int __count)
     674  {
     675    return (__m64) __builtin_ia32_psradi ((__v2si)__m, __count);
     676  }
     677  
     678  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     679  _m_psradi (__m64 __m, int __count)
     680  {
     681    return _mm_srai_pi32 (__m, __count);
     682  }
     683  
     684  /* Shift four 16-bit values in M right by COUNT; shift in zeros.  */
     685  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     686  _mm_srl_pi16 (__m64 __m, __m64 __count)
     687  {
     688    return (__m64) __builtin_ia32_psrlw ((__v4hi)__m, (__v4hi)__count);
     689  }
     690  
     691  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     692  _m_psrlw (__m64 __m, __m64 __count)
     693  {
     694    return _mm_srl_pi16 (__m, __count);
     695  }
     696  
     697  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     698  _mm_srli_pi16 (__m64 __m, int __count)
     699  {
     700    return (__m64) __builtin_ia32_psrlwi ((__v4hi)__m, __count);
     701  }
     702  
     703  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     704  _m_psrlwi (__m64 __m, int __count)
     705  {
     706    return _mm_srli_pi16 (__m, __count);
     707  }
     708  
     709  /* Shift two 32-bit values in M right by COUNT; shift in zeros.  */
     710  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     711  _mm_srl_pi32 (__m64 __m, __m64 __count)
     712  {
     713    return (__m64) __builtin_ia32_psrld ((__v2si)__m, (__v2si)__count);
     714  }
     715  
     716  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     717  _m_psrld (__m64 __m, __m64 __count)
     718  {
     719    return _mm_srl_pi32 (__m, __count);
     720  }
     721  
     722  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     723  _mm_srli_pi32 (__m64 __m, int __count)
     724  {
     725    return (__m64) __builtin_ia32_psrldi ((__v2si)__m, __count);
     726  }
     727  
     728  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     729  _m_psrldi (__m64 __m, int __count)
     730  {
     731    return _mm_srli_pi32 (__m, __count);
     732  }
     733  
     734  /* Shift the 64-bit value in M left by COUNT; shift in zeros.  */
     735  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     736  _mm_srl_si64 (__m64 __m, __m64 __count)
     737  {
     738    return (__m64) __builtin_ia32_psrlq ((__v1di)__m, (__v1di)__count);
     739  }
     740  
     741  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     742  _m_psrlq (__m64 __m, __m64 __count)
     743  {
     744    return _mm_srl_si64 (__m, __count);
     745  }
     746  
     747  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     748  _mm_srli_si64 (__m64 __m, int __count)
     749  {
     750    return (__m64) __builtin_ia32_psrlqi ((__v1di)__m, __count);
     751  }
     752  
     753  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     754  _m_psrlqi (__m64 __m, int __count)
     755  {
     756    return _mm_srli_si64 (__m, __count);
     757  }
     758  
     759  /* Bit-wise AND the 64-bit values in M1 and M2.  */
     760  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     761  _mm_and_si64 (__m64 __m1, __m64 __m2)
     762  {
     763    return __builtin_ia32_pand (__m1, __m2);
     764  }
     765  
     766  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     767  _m_pand (__m64 __m1, __m64 __m2)
     768  {
     769    return _mm_and_si64 (__m1, __m2);
     770  }
     771  
     772  /* Bit-wise complement the 64-bit value in M1 and bit-wise AND it with the
     773     64-bit value in M2.  */
     774  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     775  _mm_andnot_si64 (__m64 __m1, __m64 __m2)
     776  {
     777    return __builtin_ia32_pandn (__m1, __m2);
     778  }
     779  
     780  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     781  _m_pandn (__m64 __m1, __m64 __m2)
     782  {
     783    return _mm_andnot_si64 (__m1, __m2);
     784  }
     785  
     786  /* Bit-wise inclusive OR the 64-bit values in M1 and M2.  */
     787  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     788  _mm_or_si64 (__m64 __m1, __m64 __m2)
     789  {
     790    return __builtin_ia32_por (__m1, __m2);
     791  }
     792  
     793  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     794  _m_por (__m64 __m1, __m64 __m2)
     795  {
     796    return _mm_or_si64 (__m1, __m2);
     797  }
     798  
     799  /* Bit-wise exclusive OR the 64-bit values in M1 and M2.  */
     800  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     801  _mm_xor_si64 (__m64 __m1, __m64 __m2)
     802  {
     803    return __builtin_ia32_pxor (__m1, __m2);
     804  }
     805  
     806  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     807  _m_pxor (__m64 __m1, __m64 __m2)
     808  {
     809    return _mm_xor_si64 (__m1, __m2);
     810  }
     811  
     812  /* Compare eight 8-bit values.  The result of the comparison is 0xFF if the
     813     test is true and zero if false.  */
     814  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     815  _mm_cmpeq_pi8 (__m64 __m1, __m64 __m2)
     816  {
     817    return (__m64) __builtin_ia32_pcmpeqb ((__v8qi)__m1, (__v8qi)__m2);
     818  }
     819  
     820  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     821  _m_pcmpeqb (__m64 __m1, __m64 __m2)
     822  {
     823    return _mm_cmpeq_pi8 (__m1, __m2);
     824  }
     825  
     826  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     827  _mm_cmpgt_pi8 (__m64 __m1, __m64 __m2)
     828  {
     829    return (__m64) __builtin_ia32_pcmpgtb ((__v8qi)__m1, (__v8qi)__m2);
     830  }
     831  
     832  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     833  _m_pcmpgtb (__m64 __m1, __m64 __m2)
     834  {
     835    return _mm_cmpgt_pi8 (__m1, __m2);
     836  }
     837  
     838  /* Compare four 16-bit values.  The result of the comparison is 0xFFFF if
     839     the test is true and zero if false.  */
     840  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     841  _mm_cmpeq_pi16 (__m64 __m1, __m64 __m2)
     842  {
     843    return (__m64) __builtin_ia32_pcmpeqw ((__v4hi)__m1, (__v4hi)__m2);
     844  }
     845  
     846  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     847  _m_pcmpeqw (__m64 __m1, __m64 __m2)
     848  {
     849    return _mm_cmpeq_pi16 (__m1, __m2);
     850  }
     851  
     852  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     853  _mm_cmpgt_pi16 (__m64 __m1, __m64 __m2)
     854  {
     855    return (__m64) __builtin_ia32_pcmpgtw ((__v4hi)__m1, (__v4hi)__m2);
     856  }
     857  
     858  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     859  _m_pcmpgtw (__m64 __m1, __m64 __m2)
     860  {
     861    return _mm_cmpgt_pi16 (__m1, __m2);
     862  }
     863  
     864  /* Compare two 32-bit values.  The result of the comparison is 0xFFFFFFFF if
     865     the test is true and zero if false.  */
     866  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     867  _mm_cmpeq_pi32 (__m64 __m1, __m64 __m2)
     868  {
     869    return (__m64) __builtin_ia32_pcmpeqd ((__v2si)__m1, (__v2si)__m2);
     870  }
     871  
     872  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     873  _m_pcmpeqd (__m64 __m1, __m64 __m2)
     874  {
     875    return _mm_cmpeq_pi32 (__m1, __m2);
     876  }
     877  
     878  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     879  _mm_cmpgt_pi32 (__m64 __m1, __m64 __m2)
     880  {
     881    return (__m64) __builtin_ia32_pcmpgtd ((__v2si)__m1, (__v2si)__m2);
     882  }
     883  
     884  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     885  _m_pcmpgtd (__m64 __m1, __m64 __m2)
     886  {
     887    return _mm_cmpgt_pi32 (__m1, __m2);
     888  }
     889  
     890  /* Creates a 64-bit zero.  */
     891  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     892  _mm_setzero_si64 (void)
     893  {
     894    return (__m64)0LL;
     895  }
     896  
     897  /* Creates a vector of two 32-bit values; I0 is least significant.  */
     898  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     899  _mm_set_pi32 (int __i1, int __i0)
     900  {
     901    return (__m64) __builtin_ia32_vec_init_v2si (__i0, __i1);
     902  }
     903  
     904  /* Creates a vector of four 16-bit values; W0 is least significant.  */
     905  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     906  _mm_set_pi16 (short __w3, short __w2, short __w1, short __w0)
     907  {
     908    return (__m64) __builtin_ia32_vec_init_v4hi (__w0, __w1, __w2, __w3);
     909  }
     910  
     911  /* Creates a vector of eight 8-bit values; B0 is least significant.  */
     912  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     913  _mm_set_pi8 (char __b7, char __b6, char __b5, char __b4,
     914  	     char __b3, char __b2, char __b1, char __b0)
     915  {
     916    return (__m64) __builtin_ia32_vec_init_v8qi (__b0, __b1, __b2, __b3,
     917  					       __b4, __b5, __b6, __b7);
     918  }
     919  
     920  /* Similar, but with the arguments in reverse order.  */
     921  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     922  _mm_setr_pi32 (int __i0, int __i1)
     923  {
     924    return _mm_set_pi32 (__i1, __i0);
     925  }
     926  
     927  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     928  _mm_setr_pi16 (short __w0, short __w1, short __w2, short __w3)
     929  {
     930    return _mm_set_pi16 (__w3, __w2, __w1, __w0);
     931  }
     932  
     933  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     934  _mm_setr_pi8 (char __b0, char __b1, char __b2, char __b3,
     935  	      char __b4, char __b5, char __b6, char __b7)
     936  {
     937    return _mm_set_pi8 (__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
     938  }
     939  
     940  /* Creates a vector of two 32-bit values, both elements containing I.  */
     941  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     942  _mm_set1_pi32 (int __i)
     943  {
     944    return _mm_set_pi32 (__i, __i);
     945  }
     946  
     947  /* Creates a vector of four 16-bit values, all elements containing W.  */
     948  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     949  _mm_set1_pi16 (short __w)
     950  {
     951    return _mm_set_pi16 (__w, __w, __w, __w);
     952  }
     953  
     954  /* Creates a vector of eight 8-bit values, all elements containing B.  */
     955  extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     956  _mm_set1_pi8 (char __b)
     957  {
     958    return _mm_set_pi8 (__b, __b, __b, __b, __b, __b, __b, __b);
     959  }
     960  #ifdef __DISABLE_MMX__
     961  #undef __DISABLE_MMX__
     962  #pragma GCC pop_options
     963  #endif /* __DISABLE_MMX__ */
     964  
     965  #endif /* _MMINTRIN_H_INCLUDED */