/* Copyright (C) 2018-2023 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0.

   NOTE: This is NOT a complete implementation of the SSE4 intrinsics!  */

#ifndef NO_WARN_X86_INTRINSICS
/* This header is distributed to simplify porting x86_64 code that
   makes explicit use of Intel intrinsics to powerpc64le.
   It is the user's responsibility to determine if the results are
   acceptable and make additional changes as necessary.
   Note that much code that uses Intel intrinsics can be rewritten in
   standard C or GNU C extensions, which are more portable and better
   optimized across multiple targets.  */
#error "Please read comment above.  Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif

#ifndef SMMINTRIN_H_
#define SMMINTRIN_H_

#include <altivec.h>
#include <tmmintrin.h>

/* Rounding mode macros. */
#define _MM_FROUND_TO_NEAREST_INT       0x00
#define _MM_FROUND_TO_ZERO              0x01
#define _MM_FROUND_TO_POS_INF           0x02
#define _MM_FROUND_TO_NEG_INF           0x03
#define _MM_FROUND_CUR_DIRECTION        0x04

#define _MM_FROUND_NINT \
  (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_FLOOR \
  (_MM_FROUND_TO_NEG_INF | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_CEIL \
  (_MM_FROUND_TO_POS_INF | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_TRUNC \
  (_MM_FROUND_TO_ZERO | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_RINT \
  (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_NEARBYINT \
  (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC)

#define _MM_FROUND_RAISE_EXC            0x00
#define _MM_FROUND_NO_EXC               0x08
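
/* Since _MM_FROUND_RAISE_EXC is zero, the composite macros above reduce to
   the plain rounding-mode values; OR-ing in _MM_FROUND_NO_EXC (as
   _MM_FROUND_NEARBYINT does) asks the rounding functions below to suppress
   floating-point exceptions for the duration of the operation.  */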

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_round_pd (__m128d __A, int __rounding)
{
  __v2df __r;
  union {
    double __fr;
    long long __fpscr;
  } __enables_save, __fpscr_save;

  if (__rounding & _MM_FROUND_NO_EXC)
    {
      /* Save enabled exceptions, disable all exceptions,
         and preserve the rounding mode.  */
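      /* In the fallback path below, the five exception-enable bits occupy
         mask 0xf8 of the FPSCR's low byte, and the mtfsf field mask
         0b00000011 writes back only the two low 4-bit fields (the enable
         bits and the rounding mode).  */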
#ifdef _ARCH_PWR9
      __asm__ ("mffsce %0" : "=f" (__fpscr_save.__fr));
      __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
#else
      __fpscr_save.__fr = __builtin_mffs ();
      __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
      __fpscr_save.__fpscr &= ~0xf8;
      __builtin_mtfsf (0b00000011, __fpscr_save.__fr);
#endif
      /* Insert an artificial "read/write" reference to the variable
         read below, to ensure the compiler does not schedule
         a read/use of the variable before the FPSCR is modified, above.
         This can be removed if and when GCC PR102783 is fixed.
       */
      __asm__ ("" : "+wa" (__A));
    }

  switch (__rounding)
    {
      case _MM_FROUND_TO_NEAREST_INT:
        __fpscr_save.__fr = __builtin_mffsl ();
        __attribute__ ((fallthrough));
      case _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC:
        __builtin_set_fpscr_rn (0b00);
        /* Insert an artificial "read/write" reference to the variable
           read below, to ensure the compiler does not schedule
           a read/use of the variable before the FPSCR is modified, above.
           This can be removed if and when GCC PR102783 is fixed.
         */
        __asm__ ("" : "+wa" (__A));

        __r = vec_rint ((__v2df) __A);

        /* Insert an artificial "read" reference to the variable written
           above, to ensure the compiler does not schedule the computation
           of the value after the manipulation of the FPSCR, below.
           This can be removed if and when GCC PR102783 is fixed.
         */
        __asm__ ("" : : "wa" (__r));
        __builtin_set_fpscr_rn (__fpscr_save.__fpscr);
        break;
      case _MM_FROUND_TO_NEG_INF:
      case _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC:
        __r = vec_floor ((__v2df) __A);
        break;
      case _MM_FROUND_TO_POS_INF:
      case _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC:
        __r = vec_ceil ((__v2df) __A);
        break;
      case _MM_FROUND_TO_ZERO:
      case _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC:
        __r = vec_trunc ((__v2df) __A);
        break;
      case _MM_FROUND_CUR_DIRECTION:
        __r = vec_rint ((__v2df) __A);
        break;
    }
  if (__rounding & _MM_FROUND_NO_EXC)
    {
      /* Insert an artificial "read" reference to the variable written
         above, to ensure the compiler does not schedule the computation
         of the value after the manipulation of the FPSCR, below.
         This can be removed if and when GCC PR102783 is fixed.
       */
      __asm__ ("" : : "wa" (__r));
      /* Restore enabled exceptions.  */
      __fpscr_save.__fr = __builtin_mffsl ();
      __fpscr_save.__fpscr |= __enables_save.__fpscr;
      __builtin_mtfsf (0b00000011, __fpscr_save.__fr);
    }
  return (__m128d) __r;
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_round_sd (__m128d __A, __m128d __B, int __rounding)
{
  __B = _mm_round_pd (__B, __rounding);
  __v2df __r = { ((__v2df) __B)[0], ((__v2df) __A)[1] };
  return (__m128d) __r;
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_round_ps (__m128 __A, int __rounding)
{
  __v4sf __r;
  union {
    double __fr;
    long long __fpscr;
  } __enables_save, __fpscr_save;

  if (__rounding & _MM_FROUND_NO_EXC)
    {
      /* Save enabled exceptions, disable all exceptions,
         and preserve the rounding mode.  */
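      /* This mirrors the FPSCR handling in _mm_round_pd above.  */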
#ifdef _ARCH_PWR9
      __asm__ ("mffsce %0" : "=f" (__fpscr_save.__fr));
      __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
#else
      __fpscr_save.__fr = __builtin_mffs ();
      __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
      __fpscr_save.__fpscr &= ~0xf8;
      __builtin_mtfsf (0b00000011, __fpscr_save.__fr);
#endif
      /* Insert an artificial "read/write" reference to the variable
         read below, to ensure the compiler does not schedule
         a read/use of the variable before the FPSCR is modified, above.
         This can be removed if and when GCC PR102783 is fixed.
       */
      __asm__ ("" : "+wa" (__A));
    }

  switch (__rounding)
    {
      case _MM_FROUND_TO_NEAREST_INT:
        __fpscr_save.__fr = __builtin_mffsl ();
        __attribute__ ((fallthrough));
      case _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC:
        __builtin_set_fpscr_rn (0b00);
        /* Insert an artificial "read/write" reference to the variable
           read below, to ensure the compiler does not schedule
           a read/use of the variable before the FPSCR is modified, above.
           This can be removed if and when GCC PR102783 is fixed.
         */
        __asm__ ("" : "+wa" (__A));

        __r = vec_rint ((__v4sf) __A);

        /* Insert an artificial "read" reference to the variable written
           above, to ensure the compiler does not schedule the computation
           of the value after the manipulation of the FPSCR, below.
           This can be removed if and when GCC PR102783 is fixed.
         */
        __asm__ ("" : : "wa" (__r));
        __builtin_set_fpscr_rn (__fpscr_save.__fpscr);
        break;
      case _MM_FROUND_TO_NEG_INF:
      case _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC:
        __r = vec_floor ((__v4sf) __A);
        break;
      case _MM_FROUND_TO_POS_INF:
      case _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC:
        __r = vec_ceil ((__v4sf) __A);
        break;
      case _MM_FROUND_TO_ZERO:
      case _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC:
        __r = vec_trunc ((__v4sf) __A);
        break;
      case _MM_FROUND_CUR_DIRECTION:
        __r = vec_rint ((__v4sf) __A);
        break;
    }
  if (__rounding & _MM_FROUND_NO_EXC)
    {
      /* Insert an artificial "read" reference to the variable written
         above, to ensure the compiler does not schedule the computation
         of the value after the manipulation of the FPSCR, below.
         This can be removed if and when GCC PR102783 is fixed.
       */
      __asm__ ("" : : "wa" (__r));
      /* Restore enabled exceptions.  */
      __fpscr_save.__fr = __builtin_mffsl ();
      __fpscr_save.__fpscr |= __enables_save.__fpscr;
      __builtin_mtfsf (0b00000011, __fpscr_save.__fr);
    }
  return (__m128) __r;
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_round_ss (__m128 __A, __m128 __B, int __rounding)
{
  __B = _mm_round_ps (__B, __rounding);
  __v4sf __r = (__v4sf) __A;
  __r[0] = ((__v4sf) __B)[0];
  return (__m128) __r;
}

#define _mm_ceil_pd(V)     _mm_round_pd ((V), _MM_FROUND_CEIL)
#define _mm_ceil_sd(D, V)  _mm_round_sd ((D), (V), _MM_FROUND_CEIL)

#define _mm_floor_pd(V)    _mm_round_pd ((V), _MM_FROUND_FLOOR)
#define _mm_floor_sd(D, V) _mm_round_sd ((D), (V), _MM_FROUND_FLOOR)

#define _mm_ceil_ps(V)     _mm_round_ps ((V), _MM_FROUND_CEIL)
#define _mm_ceil_ss(D, V)  _mm_round_ss ((D), (V), _MM_FROUND_CEIL)

#define _mm_floor_ps(V)    _mm_round_ps ((V), _MM_FROUND_FLOOR)
#define _mm_floor_ss(D, V) _mm_round_ss ((D), (V), _MM_FROUND_FLOOR)

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_insert_epi8 (__m128i const __A, int const __D, int const __N)
{
  __v16qi __result = (__v16qi)__A;

  __result [__N & 0xf] = __D;

  return (__m128i) __result;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_insert_epi32 (__m128i const __A, int const __D, int const __N)
{
  __v4si __result = (__v4si)__A;

  __result [__N & 3] = __D;

  return (__m128i) __result;
}

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_insert_epi64 (__m128i const __A, long long const __D, int const __N)
{
  __v2di __result = (__v2di)__A;

  __result [__N & 1] = __D;

  return (__m128i) __result;
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_extract_epi8 (__m128i __X, const int __N)
{
  return (unsigned char) ((__v16qi)__X)[__N & 15];
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_extract_epi32 (__m128i __X, const int __N)
{
  return ((__v4si)__X)[__N & 3];
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_extract_epi64 (__m128i __X, const int __N)
{
  return ((__v2di)__X)[__N & 1];
}

extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_extract_ps (__m128 __X, const int __N)
{
  return ((__v4si)__X)[__N & 3];
}

#ifdef _ARCH_PWR8
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_blend_epi16 (__m128i __A, __m128i __B, const int __imm8)
{
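  /* Build the element-select mask from the immediate: vec_splats copies
     __imm8 into every byte, vec_gb (gather bits by bytes) then expands
     each bit of the immediate into a 0x00 or 0xff byte, and vec_unpackh
     widens those bytes into per-halfword select masks for vec_sel.  */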
  __v16qi __charmask = vec_splats ((signed char) __imm8);
  __charmask = vec_gb (__charmask);
  __v8hu __shortmask = (__v8hu) vec_unpackh (__charmask);
  #ifdef __BIG_ENDIAN__
  __shortmask = vec_reve (__shortmask);
  #endif
  return (__m128i) vec_sel ((__v8hu) __A, (__v8hu) __B, __shortmask);
}
#endif

extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_blendv_epi8 (__m128i __A, __m128i __B, __m128i __mask)
{
#ifdef _ARCH_PWR10
  return (__m128i) vec_blendv ((__v16qi) __A, (__v16qi) __B, (__v16qu) __mask);
#else
  const __v16qu __seven = vec_splats ((unsigned char) 0x07);
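  /* As with pblendvb, only the most-significant bit of each mask byte
     matters: shifting right algebraically by 7 replicates that bit across
     the byte, giving the all-ones/all-zeros selector vec_sel expects.  */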
  __v16qu __lmask = vec_sra ((__v16qu) __mask, __seven);
  return (__m128i) vec_sel ((__v16qi) __A, (__v16qi) __B, __lmask);
#endif
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_blend_ps (__m128 __A, __m128 __B, const int __imm8)
{
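  /* One vec_perm control vector per possible immediate: byte indices
     0-15 select from __A and 16-31 from __B, so entry __imm8 replaces
     element k of __A with element k of __B exactly when bit k of the
     immediate is set.  */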
  __v16qu __pcv[] =
    {
      {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
      { 16, 17, 18, 19,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
      {  0,  1,  2,  3, 20, 21, 22, 23,  8,  9, 10, 11, 12, 13, 14, 15 },
      { 16, 17, 18, 19, 20, 21, 22, 23,  8,  9, 10, 11, 12, 13, 14, 15 },
      {  0,  1,  2,  3,  4,  5,  6,  7, 24, 25, 26, 27, 12, 13, 14, 15 },
      { 16, 17, 18, 19,  4,  5,  6,  7, 24, 25, 26, 27, 12, 13, 14, 15 },
      {  0,  1,  2,  3, 20, 21, 22, 23, 24, 25, 26, 27, 12, 13, 14, 15 },
      { 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 12, 13, 14, 15 },
      {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 28, 29, 30, 31 },
      { 16, 17, 18, 19,  4,  5,  6,  7,  8,  9, 10, 11, 28, 29, 30, 31 },
      {  0,  1,  2,  3, 20, 21, 22, 23,  8,  9, 10, 11, 28, 29, 30, 31 },
      { 16, 17, 18, 19, 20, 21, 22, 23,  8,  9, 10, 11, 28, 29, 30, 31 },
      {  0,  1,  2,  3,  4,  5,  6,  7, 24, 25, 26, 27, 28, 29, 30, 31 },
      { 16, 17, 18, 19,  4,  5,  6,  7, 24, 25, 26, 27, 28, 29, 30, 31 },
      {  0,  1,  2,  3, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 },
      { 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 },
    };
  __v16qu __r = vec_perm ((__v16qu) __A, (__v16qu)__B, __pcv[__imm8]);
  return (__m128) __r;
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_blendv_ps (__m128 __A, __m128 __B, __m128 __mask)
{
#ifdef _ARCH_PWR10
  return (__m128) vec_blendv ((__v4sf) __A, (__v4sf) __B, (__v4su) __mask);
#else
  const __v4si __zero = {0};
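  /* blendvps selects on the sign bit of each mask element; comparing the
     mask against zero as a signed integer is true exactly when that sign
     bit is set.  */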
  const __vector __bool int __boolmask = vec_cmplt ((__v4si) __mask, __zero);
  return (__m128) vec_sel ((__v4su) __A, (__v4su) __B, (__v4su) __boolmask);
#endif
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_blend_pd (__m128d __A, __m128d __B, const int __imm8)
{
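  /* As in _mm_blend_ps, each entry is a vec_perm control vector: byte
     indices 0-15 take the doubleword from __A, 16-31 take it from __B,
     indexed by the two-bit immediate.  */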
  __v16qu __pcv[] =
    {
      {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
      { 16, 17, 18, 19, 20, 21, 22, 23,  8,  9, 10, 11, 12, 13, 14, 15 },
      {  0,  1,  2,  3,  4,  5,  6,  7, 24, 25, 26, 27, 28, 29, 30, 31 },
      { 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 }
    };
  __v16qu __r = vec_perm ((__v16qu) __A, (__v16qu)__B, __pcv[__imm8]);
  return (__m128d) __r;
}

#ifdef _ARCH_PWR8
extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_blendv_pd (__m128d __A, __m128d __B, __m128d __mask)
{
#ifdef _ARCH_PWR10
  return (__m128d) vec_blendv ((__v2df) __A, (__v2df) __B, (__v2du) __mask);
#else
  const __v2di __zero = {0};
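  /* blendvpd likewise keys off the sign bit of each mask element; the
     signed compare against zero reproduces that test.  */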
  const __vector __bool long long __boolmask = vec_cmplt ((__v2di) __mask, __zero);
  return (__m128d) vec_sel ((__v2du) __A, (__v2du) __B, (__v2du) __boolmask);
#endif
}
#endif

extern __inline int
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_testz_si128 (__m128i __A, __m128i __B)
{
  /* Note: This implementation does NOT set "zero" or "carry" flags.  */
  const __v16qu __zero = {0};
  return vec_all_eq (vec_and ((__v16qu) __A, (__v16qu) __B), __zero);
}

extern __inline int
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_testc_si128 (__m128i __A, __m128i __B)
{
  /* Note: This implementation does NOT set "zero" or "carry" flags.  */
  const __v16qu __zero = {0};
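  /* vec_nor (__A, __A) computes ~__A, so this returns 1 exactly when
     (~__A & __B) is all zeros, i.e. every bit set in __B is also set in
     __A (the x86 "carry" condition for ptest).  */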
  const __v16qu __notA = vec_nor ((__v16qu) __A, (__v16qu) __A);
  return vec_all_eq (vec_and ((__v16qu) __notA, (__v16qu) __B), __zero);
}

extern __inline int
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_testnzc_si128 (__m128i __A, __m128i __B)
{
  /* Note: This implementation does NOT set "zero" or "carry" flags.  */
  return _mm_testz_si128 (__A, __B) == 0 && _mm_testc_si128 (__A, __B) == 0;
}

#define _mm_test_all_zeros(M, V) _mm_testz_si128 ((M), (V))

#define _mm_test_all_ones(V) \
  _mm_testc_si128 ((V), _mm_cmpeq_epi32 ((V), (V)))

#define _mm_test_mix_ones_zeros(M, V) _mm_testnzc_si128 ((M), (V))

#ifdef _ARCH_PWR8
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_epi64 (__m128i __X, __m128i __Y)
{
  return (__m128i) vec_cmpeq ((__v2di) __X, (__v2di) __Y);
}
#endif

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_epi8 (__m128i __X, __m128i __Y)
{
  return (__m128i) vec_min ((__v16qi)__X, (__v16qi)__Y);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_epu16 (__m128i __X, __m128i __Y)
{
  return (__m128i) vec_min ((__v8hu)__X, (__v8hu)__Y);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_epi32 (__m128i __X, __m128i __Y)
{
  return (__m128i) vec_min ((__v4si)__X, (__v4si)__Y);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_epu32 (__m128i __X, __m128i __Y)
{
  return (__m128i) vec_min ((__v4su)__X, (__v4su)__Y);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_epi8 (__m128i __X, __m128i __Y)
{
  return (__m128i) vec_max ((__v16qi)__X, (__v16qi)__Y);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_epu16 (__m128i __X, __m128i __Y)
{
  return (__m128i) vec_max ((__v8hu)__X, (__v8hu)__Y);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_epi32 (__m128i __X, __m128i __Y)
{
  return (__m128i) vec_max ((__v4si)__X, (__v4si)__Y);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_epu32 (__m128i __X, __m128i __Y)
{
  return (__m128i) vec_max ((__v4su)__X, (__v4su)__Y);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mullo_epi32 (__m128i __X, __m128i __Y)
{
  return (__m128i) vec_mul ((__v4su) __X, (__v4su) __Y);
}

#ifdef _ARCH_PWR8
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_epi32 (__m128i __X, __m128i __Y)
{
  return (__m128i) vec_mule ((__v4si) __X, (__v4si) __Y);
}
#endif

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepi8_epi16 (__m128i __A)
{
  return (__m128i) vec_unpackh ((__v16qi) __A);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepi8_epi32 (__m128i __A)
{
  __A = (__m128i) vec_unpackh ((__v16qi) __A);
  return (__m128i) vec_unpackh ((__v8hi) __A);
}

#ifdef _ARCH_PWR8
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepi8_epi64 (__m128i __A)
{
  __A = (__m128i) vec_unpackh ((__v16qi) __A);
  __A = (__m128i) vec_unpackh ((__v8hi) __A);
  return (__m128i) vec_unpackh ((__v4si) __A);
}
#endif

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepi16_epi32 (__m128i __A)
{
  return (__m128i) vec_unpackh ((__v8hi) __A);
}

#ifdef _ARCH_PWR8
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepi16_epi64 (__m128i __A)
{
  __A = (__m128i) vec_unpackh ((__v8hi) __A);
  return (__m128i) vec_unpackh ((__v4si) __A);
}
#endif

#ifdef _ARCH_PWR8
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepi32_epi64 (__m128i __A)
{
  return (__m128i) vec_unpackh ((__v4si) __A);
}
#endif

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepu8_epi16 (__m128i __A)
{
  const __v16qu __zero = {0};
#ifdef __LITTLE_ENDIAN__
  __A = (__m128i) vec_mergeh ((__v16qu) __A, __zero);
#else /* __BIG_ENDIAN__.  */
  __A = (__m128i) vec_mergeh (__zero, (__v16qu) __A);
#endif /* __BIG_ENDIAN__.  */
  return __A;
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepu8_epi32 (__m128i __A)
{
  const __v16qu __zero = {0};
#ifdef __LITTLE_ENDIAN__
  __A = (__m128i) vec_mergeh ((__v16qu) __A, __zero);
  __A = (__m128i) vec_mergeh ((__v8hu) __A, (__v8hu) __zero);
#else /* __BIG_ENDIAN__.  */
  __A = (__m128i) vec_mergeh (__zero, (__v16qu) __A);
  __A = (__m128i) vec_mergeh ((__v8hu) __zero, (__v8hu) __A);
#endif /* __BIG_ENDIAN__.  */
  return __A;
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepu8_epi64 (__m128i __A)
{
  const __v16qu __zero = {0};
#ifdef __LITTLE_ENDIAN__
  __A = (__m128i) vec_mergeh ((__v16qu) __A, __zero);
  __A = (__m128i) vec_mergeh ((__v8hu) __A, (__v8hu) __zero);
  __A = (__m128i) vec_mergeh ((__v4su) __A, (__v4su) __zero);
#else /* __BIG_ENDIAN__.  */
  __A = (__m128i) vec_mergeh (__zero, (__v16qu) __A);
  __A = (__m128i) vec_mergeh ((__v8hu) __zero, (__v8hu) __A);
  __A = (__m128i) vec_mergeh ((__v4su) __zero, (__v4su) __A);
#endif /* __BIG_ENDIAN__.  */
  return __A;
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepu16_epi32 (__m128i __A)
{
  const __v8hu __zero = {0};
#ifdef __LITTLE_ENDIAN__
  __A = (__m128i) vec_mergeh ((__v8hu) __A, __zero);
#else /* __BIG_ENDIAN__.  */
  __A = (__m128i) vec_mergeh (__zero, (__v8hu) __A);
#endif /* __BIG_ENDIAN__.  */
  return __A;
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepu16_epi64 (__m128i __A)
{
  const __v8hu __zero = {0};
#ifdef __LITTLE_ENDIAN__
  __A = (__m128i) vec_mergeh ((__v8hu) __A, __zero);
  __A = (__m128i) vec_mergeh ((__v4su) __A, (__v4su) __zero);
#else /* __BIG_ENDIAN__.  */
  __A = (__m128i) vec_mergeh (__zero, (__v8hu) __A);
  __A = (__m128i) vec_mergeh ((__v4su) __zero, (__v4su) __A);
#endif /* __BIG_ENDIAN__.  */
  return __A;
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepu32_epi64 (__m128i __A)
{
  const __v4su __zero = {0};
#ifdef __LITTLE_ENDIAN__
  __A = (__m128i) vec_mergeh ((__v4su) __A, __zero);
#else /* __BIG_ENDIAN__.  */
  __A = (__m128i) vec_mergeh (__zero, (__v4su) __A);
#endif /* __BIG_ENDIAN__.  */
  return __A;
}

/* Return horizontal packed word minimum and its index in bits [15:0]
   and bits [18:16] respectively.  */
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_minpos_epu16 (__m128i __A)
{
  union __u
    {
      __m128i __m;
      __v8hu __uh;
    };
  union __u __u = { .__m = __A }, __r = { .__m = {0} };
  unsigned short __ridx = 0;
  unsigned short __rmin = __u.__uh[__ridx];
  unsigned long __i;
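  /* Scalar scan of the eight halfwords, keeping the smallest value and
     the index of its first occurrence (lowest index wins on ties, as
     with phminposuw).  */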
  for (__i = 1; __i < 8; __i++)
    {
      if (__u.__uh[__i] < __rmin)
        {
          __rmin = __u.__uh[__i];
          __ridx = __i;
        }
    }
  __r.__uh[0] = __rmin;
  __r.__uh[1] = __ridx;
  return __r.__m;
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_packus_epi32 (__m128i __X, __m128i __Y)
{
  return (__m128i) vec_packsu ((__v4si) __X, (__v4si) __Y);
}

#ifdef _ARCH_PWR8
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_epi64 (__m128i __X, __m128i __Y)
{
  return (__m128i) vec_cmpgt ((__v2di) __X, (__v2di) __Y);
}
#endif

#endif