/* { dg-do compile } */
/* { dg-skip-if "" { powerpc*-*-darwin* } { "*" } { "" } } */
/* { dg-options "-O3 -mpower8-vector" } */
/* { dg-require-effective-target p8vector_hw } */

       6  #define NO_WARN_X86_INTRINSICS 1
       7  
       8  #include <mmintrin.h>
       9  #include <stddef.h>
      10  #include <stdint.h>
      11  
      12  #if 0
      13  extern const uint64_t ff_bone;
      14  #endif
      15  
      16  static inline void transpose4x4(uint8_t *dst, uint8_t *src, ptrdiff_t dst_stride, ptrdiff_t src_stride) {
      17    __m64 row0 = _mm_cvtsi32_si64(*(unsigned*)(src + (0 * src_stride)));
      18    __m64 row1 = _mm_cvtsi32_si64(*(unsigned*)(src + (1 * src_stride)));
      19    __m64 row2 = _mm_cvtsi32_si64(*(unsigned*)(src + (2 * src_stride)));
      20    __m64 row3 = _mm_cvtsi32_si64(*(unsigned*)(src + (3 * src_stride)));
      21    __m64 tmp0 = _mm_unpacklo_pi8(row0, row1);
      22    __m64 tmp1 = _mm_unpacklo_pi8(row2, row3);
      23    __m64 row01 = _mm_unpacklo_pi16(tmp0, tmp1);
      24    __m64 row23 = _mm_unpackhi_pi16(tmp0, tmp1);
      25    *((unsigned*)(dst + (0 * dst_stride))) = _mm_cvtsi64_si32(row01);
      26    *((unsigned*)(dst + (1 * dst_stride))) = _mm_cvtsi64_si32(_mm_unpackhi_pi32(row01, row01));
      27    *((unsigned*)(dst + (2 * dst_stride))) = _mm_cvtsi64_si32(row23);
      28    *((unsigned*)(dst + (3 * dst_stride))) = _mm_cvtsi64_si32(_mm_unpackhi_pi32(row23, row23));
      29  }
      30  
      31  #if 0
      32  static inline void h264_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha1, int beta1)
      33  {
      34      asm volatile(
      35          ""
      36          :: "r"(pix-2*stride), "r"(pix), "r"((long)stride),
      37             "m"(alpha1), "m"(beta1), "m"(ff_bone)
      38      );
      39  }
      40  #endif
      41  
      42  void h264_h_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha, int beta)
      43  {
      44    uint8_t trans[8*4] __attribute__ ((aligned (8)));
      45    transpose4x4(trans, pix-2, 8, stride);
      46    transpose4x4(trans+4, pix-2+4*stride, 8, stride);
      47  //    h264_loop_filter_chroma_intra_mmx2(trans+2*8, 8, alpha-1, beta-1);
      48    transpose4x4(pix-2, trans, stride, 8);
      49    transpose4x4(pix-2+4*stride, trans+4, stride, 8);
      50  }