(root)/
gcc-13.2.0/
gcc/
config/
i386/
avx512bf16vlintrin.h
       1  /* Copyright (C) 2019-2023 Free Software Foundation, Inc.
       2  
       3     This file is part of GCC.
       4  
       5     GCC is free software; you can redistribute it and/or modify
       6     it under the terms of the GNU General Public License as published by
       7     the Free Software Foundation; either version 3, or (at your option)
       8     any later version.
       9  
      10     GCC is distributed in the hope that it will be useful,
      11     but WITHOUT ANY WARRANTY; without even the implied warranty of
      12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      13     GNU General Public License for more details.
      14  
      15     Under Section 7 of GPL version 3, you are granted additional
      16     permissions described in the GCC Runtime Library Exception, version
      17     3.1, as published by the Free Software Foundation.
      18  
      19     You should have received a copy of the GNU General Public License and
      20     a copy of the GCC Runtime Library Exception along with this program;
      21     see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
      22     <http://www.gnu.org/licenses/>.  */
      23  
      24  #ifndef _IMMINTRIN_H_INCLUDED
      25  #error "Never use <avx512bf16vlintrin.h> directly; include <immintrin.h> instead."
      26  #endif
      27  
      28  #ifndef _AVX512BF16VLINTRIN_H_INCLUDED
      29  #define _AVX512BF16VLINTRIN_H_INCLUDED
      30  
      31  #if !defined(__AVX512VL__) || !defined(__AVX512BF16__)
      32  #pragma GCC push_options
      33  #pragma GCC target("avx512bf16,avx512vl")
      34  #define __DISABLE_AVX512BF16VL__
#endif /* !__AVX512VL__ || !__AVX512BF16__ */
      36  
/* Internal data types for implementing the intrinsics.  */
typedef __bf16 __v16bf __attribute__ ((__vector_size__ (32)));
typedef __bf16 __v8bf __attribute__ ((__vector_size__ (16)));

/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  */
typedef __bf16 __m256bh __attribute__ ((__vector_size__ (32), __may_alias__));
typedef __bf16 __m128bh __attribute__ ((__vector_size__ (16), __may_alias__));

/* Scalar BF16 (brain floating-point) element type used by the API.  */
typedef __bf16 __bfloat16;
      47  
      48  #define _mm256_cvtneps_pbh(A) \
      49    (__m128bh) __builtin_ia32_cvtneps2bf16_v8sf (A)
      50  #define _mm_cvtneps_pbh(A) \
      51    (__m128bh) __builtin_ia32_cvtneps2bf16_v4sf (A)
      52  
      53  /* vcvtne2ps2bf16 */
      54  
/* Convert the packed single-precision floats in __A and __B to packed
   BF16 values (vcvtne2ps2bf16): the low half of the result is the
   conversion of __B, the high half the conversion of __A.  */
extern __inline __m256bh
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtne2ps_pbh (__m256 __A, __m256 __B)
{
  return (__m256bh)__builtin_ia32_cvtne2ps2bf16_v16bf(__A, __B);
}
      61  
/* As _mm256_cvtne2ps_pbh (__C, __D), with merge masking: result element
   i is the converted value when bit i of mask __B is set, otherwise it
   is copied from __A.  */
extern __inline __m256bh
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_cvtne2ps_pbh (__m256bh __A, __mmask16 __B, __m256 __C, __m256 __D)
{
  return (__m256bh)__builtin_ia32_cvtne2ps2bf16_v16bf_mask(__C, __D, __A, __B);
}
      68  
/* As _mm256_cvtne2ps_pbh (__B, __C), with zero masking: result element
   i is the converted value when bit i of mask __A is set, otherwise
   zero.  */
extern __inline __m256bh
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_cvtne2ps_pbh (__mmask16 __A, __m256 __B, __m256 __C)
{
  return (__m256bh)__builtin_ia32_cvtne2ps2bf16_v16bf_maskz(__B, __C, __A);
}
      75  
/* 128-bit variant of _mm256_cvtne2ps_pbh: low half of the result holds
   the conversion of __B, high half the conversion of __A.  */
extern __inline __m128bh
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtne2ps_pbh (__m128 __A, __m128 __B)
{
  return (__m128bh)__builtin_ia32_cvtne2ps2bf16_v8bf(__A, __B);
}
      82  
/* As _mm_cvtne2ps_pbh (__C, __D), with merge masking: result element i
   is the converted value when bit i of mask __B is set, otherwise it is
   copied from __A.  */
extern __inline __m128bh
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_cvtne2ps_pbh (__m128bh __A, __mmask8 __B, __m128 __C, __m128 __D)
{
  return (__m128bh)__builtin_ia32_cvtne2ps2bf16_v8bf_mask(__C, __D, __A, __B);
}
      89  
/* As _mm_cvtne2ps_pbh (__B, __C), with zero masking: result element i
   is the converted value when bit i of mask __A is set, otherwise
   zero.  */
extern __inline __m128bh
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_cvtne2ps_pbh (__mmask8 __A, __m128 __B, __m128 __C)
{
  return (__m128bh)__builtin_ia32_cvtne2ps2bf16_v8bf_maskz(__B, __C, __A);
}
      96  
      97  /* vcvtneps2bf16 */
      98  
/* Convert the 8 single-precision floats in __C to BF16 (vcvtneps2bf16)
   with merge masking: result element i is the converted value when bit
   i of mask __B is set, otherwise it is copied from __A.  */
extern __inline __m128bh
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_cvtneps_pbh (__m128bh __A, __mmask8 __B, __m256 __C)
{
  return (__m128bh)__builtin_ia32_cvtneps2bf16_v8sf_mask(__C, __A, __B);
}
     105  
/* Convert the 8 single-precision floats in __B to BF16 with zero
   masking: result element i is the converted value when bit i of mask
   __A is set, otherwise zero.  */
extern __inline __m128bh
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_cvtneps_pbh (__mmask8 __A, __m256 __B)
{
  return (__m128bh)__builtin_ia32_cvtneps2bf16_v8sf_maskz(__B, __A);
}
     112  
/* Convert the 4 single-precision floats in __C to BF16 with merge
   masking: result element i is the converted value when bit i of mask
   __B is set, otherwise it is copied from __A.  */
extern __inline __m128bh
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_cvtneps_pbh (__m128bh __A, __mmask8 __B, __m128 __C)
{
  return (__m128bh)__builtin_ia32_cvtneps2bf16_v4sf_mask(__C, __A, __B);
}
     119  
/* Convert the 4 single-precision floats in __B to BF16 with zero
   masking: result element i is the converted value when bit i of mask
   __A is set, otherwise zero.  */
extern __inline __m128bh
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_cvtneps_pbh (__mmask8 __A, __m128 __B)
{
  return (__m128bh)__builtin_ia32_cvtneps2bf16_v4sf_maskz(__B, __A);
}
     126  
     127  /* vdpbf16ps */
     128  
/* BF16 dot product (vdpbf16ps): for each 32-bit lane, multiply the two
   adjacent BF16 elements of __B with the corresponding elements of __C
   and add both products to the single-precision accumulator element of
   __A.  */
extern __inline __m256
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_dpbf16_ps (__m256 __A, __m256bh __B, __m256bh __C)
{
  return (__m256)__builtin_ia32_dpbf16ps_v8sf(__A, __B, __C);
}
     135  
/* As _mm256_dpbf16_ps (__A, __C, __D), with merge masking: result lanes
   whose bit in mask __B is clear keep the accumulator value from
   __A.  */
extern __inline __m256
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_dpbf16_ps (__m256 __A, __mmask8 __B, __m256bh __C, __m256bh __D)
{
  return (__m256)__builtin_ia32_dpbf16ps_v8sf_mask(__A, __C, __D, __B);
}
     142  
/* As _mm256_dpbf16_ps (__B, __C, __D), with zero masking: result lanes
   whose bit in mask __A is clear are zeroed.  */
extern __inline __m256
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_dpbf16_ps (__mmask8 __A, __m256 __B, __m256bh __C, __m256bh __D)
{
  return (__m256)__builtin_ia32_dpbf16ps_v8sf_maskz(__B, __C, __D, __A);
}
     149  
/* 128-bit BF16 dot product (vdpbf16ps): for each 32-bit lane, multiply
   the two adjacent BF16 elements of __B with those of __C and add both
   products to the single-precision accumulator element of __A.  */
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_dpbf16_ps (__m128 __A, __m128bh __B, __m128bh __C)
{
  return (__m128)__builtin_ia32_dpbf16ps_v4sf(__A, __B, __C);
}
     156  
/* As _mm_dpbf16_ps (__A, __C, __D), with merge masking: result lanes
   whose bit in mask __B is clear keep the accumulator value from
   __A.  */
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_dpbf16_ps (__m128 __A, __mmask8 __B, __m128bh __C, __m128bh __D)
{
  return (__m128)__builtin_ia32_dpbf16ps_v4sf_mask(__A, __C, __D, __B);
}
     163  
/* As _mm_dpbf16_ps (__B, __C, __D), with zero masking: result lanes
   whose bit in mask __A is clear are zeroed.  */
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_dpbf16_ps (__mmask8 __A, __m128 __B, __m128bh __C, __m128bh __D)
{
  return (__m128)__builtin_ia32_dpbf16ps_v4sf_maskz(__B, __C, __D, __A);
}
     170  
     171  extern __inline __bf16
     172  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
     173  _mm_cvtness_sbh (float __A)
     174  {
     175    __v4sf __V = {__A, 0, 0, 0};
     176    __v8bf __R = __builtin_ia32_cvtneps2bf16_v4sf_mask ((__v4sf)__V,
     177  	       (__v8bf)_mm_undefined_si128 (), (__mmask8)-1);
     178    return __R[0];
     179  }
     180  
     181  extern __inline __m128
     182  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
     183  _mm_cvtpbh_ps (__m128bh __A)
     184  {
     185    return (__m128)_mm_castsi128_ps ((__m128i)_mm_slli_epi32 (
     186  	 (__m128i)_mm_cvtepi16_epi32 ((__m128i)__A), 16));
     187  }
     188  
     189  extern __inline __m256
     190  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
     191  _mm256_cvtpbh_ps (__m128bh __A)
     192  {
     193    return (__m256)_mm256_castsi256_ps ((__m256i)_mm256_slli_epi32 (
     194  	 (__m256i)_mm256_cvtepi16_epi32 ((__m128i)__A), 16));
     195  }
     196  
     197  extern __inline __m128
     198  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
     199  _mm_maskz_cvtpbh_ps (__mmask8 __U, __m128bh __A)
     200  {
     201    return (__m128)_mm_castsi128_ps ((__m128i)_mm_slli_epi32 (
     202  	 (__m128i)_mm_maskz_cvtepi16_epi32 (
     203  	 (__mmask8)__U, (__m128i)__A), 16));
     204  }
     205  
     206  extern __inline __m256
     207  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
     208  _mm256_maskz_cvtpbh_ps (__mmask8 __U, __m128bh __A)
     209  {
     210    return (__m256)_mm256_castsi256_ps ((__m256i)_mm256_slli_epi32 (
     211  	 (__m256i)_mm256_maskz_cvtepi16_epi32 (
     212  	 (__mmask8)__U, (__m128i)__A), 16));
     213  }
     214  
     215  extern __inline __m128
     216  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
     217  _mm_mask_cvtpbh_ps (__m128 __S, __mmask8 __U, __m128bh __A)
     218  {
     219    return (__m128)_mm_castsi128_ps ((__m128i)_mm_mask_slli_epi32 (
     220  	 (__m128i)__S, (__mmask8)__U, (__m128i)_mm_cvtepi16_epi32 (
     221  	 (__m128i)__A), 16));
     222  }
     223  
     224  extern __inline __m256
     225  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
     226  _mm256_mask_cvtpbh_ps (__m256 __S, __mmask8 __U, __m128bh __A)
     227  {
     228    return (__m256)_mm256_castsi256_ps ((__m256i)_mm256_mask_slli_epi32 (
     229  	 (__m256i)__S, (__mmask8)__U, (__m256i)_mm256_cvtepi16_epi32 (
     230  	 (__m128i)__A), 16));
     231  }
     232  
     233  #ifdef __DISABLE_AVX512BF16VL__
     234  #undef __DISABLE_AVX512BF16VL__
     235  #pragma GCC pop_options
     236  #endif /* __DISABLE_AVX512BF16VL__ */
     237  
     238  #endif /* _AVX512BF16VLINTRIN_H_INCLUDED */