1  /* { dg-do assemble } */
       2  /* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */
       3  /* { dg-add-options arm_v8_1m_mve_fp } */
       4  /* { dg-additional-options "-O3" } */
       5  
       6  #include <stdint.h>
       7  
       8  #define M00 100
       9  #define M10 216
      10  #define M01 1322
      11  #define M11 13
      12  
      13  #define N 128
      14  
      15  
      16  /* Integer tests.  */
      17  #define FUNC(SIGN, TYPE, BITS)						\
      18    void foo_##SIGN##BITS##x (TYPE##BITS##_t *__restrict__ pInput,	\
      19  			    TYPE##BITS##_t *__restrict__ pOutput)	\
      20    {									\
      21      unsigned int i;							\
      22      TYPE##BITS##_t  a, b;						\
      23      									\
      24      for (i = 0; i < N / BITS; i++)					\
      25        {									\
      26  	a = *pInput++;							\
      27  	b = *pInput++;							\
      28  									\
      29  	*pOutput++ = M00 * a + M01 * b;					\
      30  	*pOutput++ = M10 * a + M11 * b;					\
      31        }									\
      32    }
      33  
      34  FUNC(s, int, 8)
      35  FUNC(u, uint, 8)
      36  FUNC(s, int, 16)
      37  FUNC(u, uint, 16)
      38  FUNC(s, int, 32)
      39  FUNC(u, uint, 32)
      40  
      41  /* float test, keep the macro because it's similar to the above, but does not
      42     need the ##BITS##_t.  */
      43  #define FUNC_FLOAT(SIGN, TYPE, BITS)					\
      44    void foo_##SIGN##BITS##x (TYPE *__restrict__ pInput,			\
      45  			    TYPE *__restrict__ pOutput)			\
      46    {									\
      47      unsigned int i;							\
      48      TYPE a, b;								\
      49      									\
      50      for (i = 0; i < N / BITS; i++)					\
      51        {									\
      52  	a = *pInput++;							\
      53  	b = *pInput++;							\
      54  									\
      55  	*pOutput++ = M00 * a + M01 * b;					\
      56  	*pOutput++ = M10 * a + M11 * b;					\
      57        }									\
      58    }
      59  
      60  FUNC_FLOAT(f, float, 32)
      61  
      62  /* __fp16 test, needs explicit casts to avoid conversions to floating-point and
      63     failure to vectorize.  */
      64  __fp16 M00_fp16 = 100.0f16;
      65  __fp16 M10_fp16 = 216.0f16;
      66  __fp16 M01_fp16 = 1322.0f16;
      67  __fp16 M11_fp16 = 13.0f16;
      68  
      69  #define FUNC_FLOAT_FP16(SIGN, TYPE, BITS)				\
      70    void foo_##SIGN##BITS##x (TYPE *__restrict__ pInput,			\
      71  			    TYPE *__restrict__ pOutput)			\
      72    {									\
      73      unsigned int i;							\
      74      TYPE a, b;								\
      75      									\
      76      for (i = 0; i < N / BITS; i++)					\
      77        {									\
      78  	a = *pInput++;							\
      79  	b = *pInput++;							\
      80  									\
      81  	*pOutput++ = (__fp16)(M00_fp16 * a) + (__fp16)(M01_fp16 * b);	\
      82  	*pOutput++ = (__fp16)(M10_fp16 * a) + (__fp16)(M11_fp16 * b);	\
      83        }									\
      84    }
      85  
      86  FUNC_FLOAT_FP16(f, __fp16, 16)
      87  
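/* A "pair" below is one vld20 plus one vld21 (or one vst20 plus one vst21),
   so each expected count is twice the number of pairs.  The same counts are
   expected for the vst2X stores.  */
/* vld2X.8 is used for signed and unsigned chars: 2 pairs.  */
/* vld2X.16 is used for signed and unsigned shorts and __fp16: 3 pairs.  */
/* vld2X.32 is used for signed and unsigned ints and float: 3 pairs.  */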
      91  /* { dg-final { scan-assembler-times {vld2[01].8\t.q[0-9]+, q[0-9]+., } 4 } } */
      92  /* { dg-final { scan-assembler-times {vld2[01].16\t.q[0-9]+, q[0-9]+., } 6 } } */
      93  /* { dg-final { scan-assembler-times {vld2[01].32\t.q[0-9]+, q[0-9]+., } 6 } } */
      94  /* { dg-final { scan-assembler-times {vst2[01].8\t.q[0-9]+, q[0-9]+., } 4 } } */
      95  /* { dg-final { scan-assembler-times {vst2[01].16\t.q[0-9]+, q[0-9]+., } 6 } } */
      96  /* { dg-final { scan-assembler-times {vst2[01].32\t.q[0-9]+, q[0-9]+., } 6 } } */