(root)/
gcc-13.2.0/
gcc/
testsuite/
gcc.target/
i386/
pr104441-1a.c
       1  /* { dg-do compile } */
       2  /* { dg-options "-O3 -march=x86-64 -mtune=skylake -Wno-attributes" } */
       3  
       4  #include <x86intrin.h>
       5  #include <stdint.h>
       6  
       /* Build a 256-bit vector from two 32-bit reads of SRC.
          The low 128-bit half holds the 4 bytes at SRC + 0 * STRIDE in its
          32-bit element 0; the high 128-bit half holds the 4 bytes at
          SRC + 3 * STRIDE in its 32-bit element 1.  All other elements are
          zero.  As written, only these two row offsets are read.
          NOTE(review): despite the "4x4" name, offsets 1 * STRIDE and
          2 * STRIDE are never read -- presumably a consequence of test-case
          reduction; confirm before reusing this helper elsewhere.
          The function is forced inline and compiled for AVX2 via its
          attributes.  */
       7  __attribute__((always_inline, target("avx2")))
       8  static __m256i
       9  load8bit_4x4_avx2(const uint8_t *const src, const uint32_t stride)
      10  {
            /* src23 starts as all zeros so the insert below leaves every
               element except element 1 cleared.  */
      11    __m128i src01, src23 = _mm_setzero_si128();
            /* 32-bit read at offset 0 goes to element 0; the intrinsic zeroes
               the upper elements.  The read goes through an int32_t cast of a
               uint8_t pointer.  */
      12    src01 = _mm_cvtsi32_si128(*(int32_t*)(src + 0 * stride));
            /* 32-bit read at offset 3 * stride is inserted at element index 1.  */
      13    src23 = _mm_insert_epi32(src23, *(int32_t *)(src + 3 * stride), 1);
            /* "setr" argument order: first argument is the low half, second
               argument is the high half.  */
      14    return _mm256_setr_m128i(src01, src23);
      15  }
      16  
      /* Accumulate sums of absolute differences between SRC and REF.
         The loop runs while y < HEIGHT, stepping y by 4.  Each iteration
         loads one vector from SRC and one from REF with load8bit_4x4_avx2,
         adds their _mm256_sad_epu8 result into a 256-bit accumulator, and
         advances SRC by 4 * SRC_STRIDE and REF by 4 * REF_STRIDE bytes.
         After the loop the two 128-bit halves of the accumulator are added
         and the lowest 32-bit element is returned.  When HEIGHT is 0 the
         loop body never runs and the result is 0.
         The function is kept out of line (noinline, noipa) and compiled for
         AVX2 via its attributes.  */
      17  __attribute__ ((noinline, noipa, target("avx2")))
      18  uint32_t
      19  compute4x_m_sad_avx2_intrin(uint8_t *src, uint32_t src_stride,
      20  			    uint8_t *ref, uint32_t ref_stride,
      21  			    uint32_t height)
      22  {
      23    __m128i xmm0;
            /* 256-bit running total, starts at zero.  */
      24    __m256i ymm = _mm256_setzero_si256();
      25    uint32_t y;
      26  
      27    for (y = 0; y < height; y += 4) {
      28      const __m256i src0123 = load8bit_4x4_avx2(src, src_stride);
      29      const __m256i ref0123 = load8bit_4x4_avx2(ref, ref_stride);
              /* Per-iteration SAD result is added to the total with 32-bit
                 element-wise addition.  */
      30      ymm = _mm256_add_epi32(ymm, _mm256_sad_epu8(src0123, ref0123));
              /* stride << 2 == 4 * stride: step both pointers by four strides.  */
      31      src += src_stride << 2;
      32      ref += ref_stride << 2;
      33    }
      34  
            /* Fold the high 128-bit half onto the low half.  This extract /
               add / scalar-move sequence is what the dg-final scan-assembler
               pattern at the end of this file matches, requiring vzeroupper
               to come after it.  */
      35    xmm0 = _mm_add_epi32(_mm256_castsi256_si128(ymm),
      36  		       _mm256_extracti128_si256(ymm, 1));
      37  
            /* Only the lowest 32-bit element of the folded sum is returned.  */
      38    return (uint32_t)_mm_cvtsi128_si32(xmm0);
      39  }  
      40  
      41  /* Expect assembly like:
      42  
      43  	vextracti128	$0x1, %ymm3, %xmm3
      44  	vpaddd	%xmm3, %xmm0, %xmm0
      45  	vmovd	%xmm0, %eax
      46  	vzeroupper
      47  
      48  rather than:
      49  
      50  	vzeroupper
      51  	vextracti128	$0x1, %ymm3, %xmm3
      52  	vpaddd	%xmm3, %xmm0, %xmm0
      53  	vmovd	%xmm0, %eax
      54  
      55   */
      56  
      57  /* { dg-final { scan-assembler "\[ \t\]+vextracti128\[ \t\]+\[^\n\]+\n\[ \t\]+vpaddd\[ \t\]+\[^\n\]+\n\[ \t\]+vmovd\[ \t\]+\[^\n\]+\n\[ \t\]+vzeroupper" } } */