/* Copyright (C) 2011-2023 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* This header is distributed to simplify porting x86_64 code that
   makes explicit use of Intel intrinsics to powerpc64le.
   It is the user's responsibility to determine if the results are
   acceptable and make additional changes as necessary.
   Note that much code that uses Intel intrinsics can be rewritten in
   standard C or GNU C extensions, which are more portable and better
   optimized across multiple targets.  */

#if !defined _X86GPRINTRIN_H_INCLUDED
# error "Never use <bmi2intrin.h> directly; include <x86gprintrin.h> instead."
#endif

#ifndef _BMI2INTRIN_H_INCLUDED
#define _BMI2INTRIN_H_INCLUDED

extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_bzhi_u32 (unsigned int __X, unsigned int __Y)
{
  return ((__X << (32 - __Y)) >> (32 - __Y));
}
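
/* Illustrative note (not part of the original header): _bzhi_u32 zeroes
   the bits of __X at and above bit position __Y, so for example
   _bzhi_u32 (0xFFFFFFFF, 8) yields 0x000000FF.  As written, the double
   shift assumes 0 < __Y <= 32; __Y == 0 would shift a 32-bit value by
   32, which is undefined behavior in C.  */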

extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mulx_u32 (unsigned int __X, unsigned int __Y, unsigned int *__P)
{
  unsigned long long __res = (unsigned long long) __X * __Y;
  *__P = (unsigned int) (__res >> 32);
  return (unsigned int) __res;
}
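
/* Illustrative example (not part of the original header): _mulx_u32
   computes the full 64-bit product of two 32-bit values, returning the
   low half and storing the high half through __P.  For instance,
   _mulx_u32 (0x80000000, 4, &__hi) returns 0 and sets __hi to 2,
   since 0x80000000ULL * 4 == 0x200000000.  */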

#ifdef  __PPC64__
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_bzhi_u64 (unsigned long long __X, unsigned long long __Y)
{
  return ((__X << (64 - __Y)) >> (64 - __Y));
}

/* __int128 requires base 64-bit.  */
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mulx_u64 (unsigned long long __X, unsigned long long __Y,
	   unsigned long long *__P)
{
  unsigned __int128 __res = (unsigned __int128) __X * __Y;
  *__P = (unsigned long long) (__res >> 64);
  return (unsigned long long) __res;
}

#ifdef  _ARCH_PWR7
/* popcount and bpermd require power7 minimum.  */
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_pdep_u64 (unsigned long long __X, unsigned long long __M)
{
  unsigned long __result = 0x0UL;
  const unsigned long __mask = 0x8000000000000000UL;
  unsigned long __m = __M;
  unsigned long __c, __t;
  unsigned long __p;

  /* The pop-count of the mask gives the number of bits from the
     source to process.  This is also needed to shift bits from the
     source into the correct position for the result.  */
  __p = 64 - __builtin_popcountl (__M);

  /* The loop iterates once for each '1' bit in the mask, clearing
     each mask bit as it is processed.  */
  while (__m != 0)
    {
      __c = __builtin_clzl (__m);
      __t = __X << (__p - __c);
      __m ^= (__mask >> __c);
      __result |= (__t & (__mask >> __c));
      __p++;
    }
  return __result;
}
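
/* Illustrative example (not part of the original header): _pdep_u64
   deposits the low-order bits of __X into the positions of the '1'
   bits in the mask.  For instance, _pdep_u64 (0x5, 0xF0) scatters the
   low four source bits 0b0101 into mask bits 4-7, yielding 0x50.  */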

extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_pext_u64 (unsigned long long __X, unsigned long long __M)
{
  unsigned long __p = 0x4040404040404040UL; // initial bit permute control
  const unsigned long __mask = 0x8000000000000000UL;
  unsigned long __m = __M;
  unsigned long __c;
  unsigned long __result;

  /* If the mask is constant and selects 8 bits or fewer, we can use
     the bpermd bit-permute instruction (POWER7 and later).  */
  if (__builtin_constant_p (__M) && (__builtin_popcountl (__M) <= 8))
    {
      /* Also, if the pext mask is constant, then the popcount is
	 constant; we can evaluate the following loop at compile
	 time and use a constant bit permute vector.  */
      long __i;
      for (__i = 0; __i < __builtin_popcountl (__M); __i++)
	{
	  __c = __builtin_clzl (__m);
	  __p = (__p << 8) | __c;
	  __m ^= (__mask >> __c);
	}
      __result = __builtin_bpermd (__p, __X);
    }
  else
    {
      __p = 64 - __builtin_popcountl (__M);
      __result = 0;
      /* We could use a for loop here, but that combined with
	 -funroll-loops can expand to a lot of code.  The while
	 loop avoids unrolling, and the compiler commons the xor
	 from clearing the mask bit with the (m != 0) test.  The
	 result is a more compact loop setup and body.  */
      while (__m != 0)
	{
	  unsigned long __t;
	  __c = __builtin_clzl (__m);
	  __t = (__X & (__mask >> __c)) >> (__p - __c);
	  __m ^= (__mask >> __c);
	  __result |= (__t);
	  __p++;
	}
    }
  return __result;
}
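
/* Illustrative example (not part of the original header): _pext_u64 is
   the inverse of _pdep_u64; it gathers the bits of __X selected by the
   '1' bits in the mask into the low-order result bits.  For instance,
   _pext_u64 (0x50, 0xF0) extracts mask bits 4-7 of the source and
   yields 0x5.  */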

/* These 32-bit implementations depend on the 64-bit pdep/pext above,
   which in turn depend on _ARCH_PWR7.  */
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_pdep_u32 (unsigned int __X, unsigned int __Y)
{
  return _pdep_u64 (__X, __Y);
}

extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_pext_u32 (unsigned int __X, unsigned int __Y)
{
  return _pext_u64 (__X, __Y);
}
#endif /* _ARCH_PWR7  */
#endif /* __PPC64__  */

#endif /* _BMI2INTRIN_H_INCLUDED */