1  /* { dg-do assemble } */
       2  /* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */
       3  /* { dg-add-options arm_v8_1m_mve_fp } */
       4  /* { dg-additional-options "-O3" } */
       5  
       6  #include <stdint.h>
       7  
       8  #define M00 100
       9  #define M10 216
      10  #define M20 23
      11  #define M30 237
      12  #define M01 1322
      13  #define M11 13
      14  #define M21 27271
      15  #define M31 2280
      16  #define M02 74
      17  #define M12 191
      18  #define M22 500
      19  #define M32 111
      20  #define M03 134
      21  #define M13 117
      22  #define M23 11
      23  #define M33 771
      24  
      25  #define N 128
      26  
      27  /* Integer tests.  */
      28  #define FUNC(SIGN, TYPE, BITS)						\
      29    void foo_##SIGN##BITS##x (TYPE##BITS##_t *__restrict__ pInput,	\
      30  			    TYPE##BITS##_t *__restrict__ pOutput)	\
      31    {									\
      32      unsigned int i;							\
      33      TYPE##BITS##_t  a, b, c, d;						\
      34      									\
      35      for (i = 0; i < N / BITS; i++)					\
      36        {									\
      37  	a = *pInput++;							\
      38  	b = *pInput++;							\
      39  	c = *pInput++;							\
      40  	d = *pInput++;							\
      41  									\
      42  	*pOutput++ = M00 * a + M01 * b + M02 * c + M03 * d;		\
      43  	*pOutput++ = M10 * a + M11 * b + M12 * c + M13 * d;		\
      44  	*pOutput++ = M20 * a + M21 * b + M22 * c + M23 * d;		\
      45  	*pOutput++ = M30 * a + M31 * b + M32 * c + M33 * d;		\
      46        }									\
      47    }
      48  
      49  FUNC(s, int, 8)
      50  FUNC(u, uint, 8)
      51  FUNC(s, int, 16)
      52  FUNC(u, uint, 16)
      53  FUNC(s, int, 32)
      54  FUNC(u, uint, 32)
      55  
      56  /* float test, keep the macro because it's similar to the above, but does not
      57     need the ##BITS##_t.  */
      58  #define FUNC_FLOAT(SIGN, TYPE, BITS)						\
      59    void foo_##SIGN##BITS##x (TYPE *__restrict__ pInput,			\
      60  			    TYPE *__restrict__ pOutput)			\
      61    {									\
      62      unsigned int i;							\
      63      TYPE a, b, c, d;							\
      64      									\
      65      for (i = 0; i < N / BITS; i++)					\
      66        {									\
      67  	a = *pInput++;							\
      68  	b = *pInput++;							\
      69  	c = *pInput++;							\
      70  	d = *pInput++;							\
      71  									\
      72  	*pOutput++ = M00 * a + M01 * b + M02 * c + M03 * d;		\
      73  	*pOutput++ = M10 * a + M11 * b + M12 * c + M13 * d;		\
      74  	*pOutput++ = M20 * a + M21 * b + M22 * c + M23 * d;		\
      75  	*pOutput++ = M30 * a + M31 * b + M32 * c + M33 * d;		\
      76        }									\
      77    }
      78  
      79  FUNC_FLOAT(f, float, 32)
      80  
/* __fp16 test, needs explicit casts to avoid conversions to floating-point and
   failure to vectorize.

   The coefficients are the same values as the M00..M33 macros, but held in
   __fp16 objects (Mrc_fp16: r is the output index within a group of four,
   c is the input index) so that the multiplications below have __fp16
   operands on both sides.  */
__fp16 M00_fp16 = 100.0f16;
__fp16 M10_fp16 = 216.0f16;
__fp16 M20_fp16 = 23.0f16;
__fp16 M30_fp16 = 237.0f16;
__fp16 M01_fp16 = 1322.0f16;
__fp16 M11_fp16 = 13.0f16;
__fp16 M21_fp16 = 27271.0f16;
__fp16 M31_fp16 = 2280.0f16;
__fp16 M02_fp16 = 74.0f16;
__fp16 M12_fp16 = 191.0f16;
__fp16 M22_fp16 = 500.0f16;
__fp16 M32_fp16 = 111.0f16;
__fp16 M03_fp16 = 134.0f16;
__fp16 M13_fp16 = 117.0f16;
__fp16 M23_fp16 = 11.0f16;
__fp16 M33_fp16 = 771.0f16;

/* Defines foo_<SIGN><BITS>x (pInput, pOutput).  The loop runs N / BITS
   times; each iteration loads four consecutive TYPE values (a, b, c, d)
   and stores four results.  Result r is

     Mr0_fp16 * a + Mr1_fp16 * b + Mr2_fp16 * c + Mr3_fp16 * d

   computed in steps: every product is cast back to __fp16, the first two
   and the last two products are summed into the TYPE temporaries ab and
   cd, and ab + cd is stored.  Do not fold these steps into a single
   expression: the casts and temporaries are the "explicit casts" the
   comment at the top of this test refers to.  */
#define FUNC_FLOAT_FP16(SIGN, TYPE, BITS)				\
  void foo_##SIGN##BITS##x (TYPE *__restrict__ pInput,			\
			    TYPE *__restrict__ pOutput)			\
  {									\
    unsigned int i;							\
    TYPE a, b, c, d;							\
    									\
    for (i = 0; i < N / BITS; i++)					\
      {									\
	a = *pInput++;							\
	b = *pInput++;							\
	c = *pInput++;							\
	d = *pInput++;							\
									\
	TYPE ab, cd;							\
	ab = (__fp16)(M00_fp16 * a) + (__fp16)(M01_fp16 * b);		\
	cd = (__fp16)(M02_fp16 * c) + (__fp16)(M03_fp16 * d);		\
	*pOutput++ = ab + cd;						\
	ab = (__fp16)(M10_fp16 * a) + (__fp16)(M11_fp16 * b);		\
	cd = (__fp16)(M12_fp16 * c) + (__fp16)(M13_fp16 * d);		\
	*pOutput++ = ab + cd;						\
	ab = (__fp16)(M20_fp16 * a) + (__fp16)(M21_fp16 * b);		\
	cd = (__fp16)(M22_fp16 * c) + (__fp16)(M23_fp16 * d);		\
	*pOutput++ = ab + cd;						\
	ab = (__fp16)(M30_fp16 * a) + (__fp16)(M31_fp16 * b);		\
	cd = (__fp16)(M32_fp16 * c) + (__fp16)(M33_fp16 * d);		\
	*pOutput++ = ab + cd;						\
      }									\
  }

/* Defines foo_f16x, operating on __fp16 data.  */
FUNC_FLOAT_FP16(f, __fp16, 16)
     131  
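/* Expected counts, for X in 0..3 (the same counts apply to vld4X and vst4X):  */
/* vld4X.8/vst4X.8 are used for signed and unsigned chars: 2 functions * 4 instructions.  */
/* vld4X.16/vst4X.16 are used for signed and unsigned shorts and __fp16: 3 functions * 4 instructions.  */
/* vld4X.32/vst4X.32 are used for signed and unsigned ints and float: 3 functions * 4 instructions.  */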
     135  /* { dg-final { scan-assembler-times {vld4[0123].8\t.q[0-9]+, q[0-9]+, q[0-9]+, q[0-9]+., } 8 } } */
     136  /* { dg-final { scan-assembler-times {vld4[0123].16\t.q[0-9]+, q[0-9]+, q[0-9]+, q[0-9]+., } 12 } } */
     137  /* { dg-final { scan-assembler-times {vld4[0123].32\t.q[0-9]+, q[0-9]+, q[0-9]+, q[0-9]+., } 12 } } */
     138  /* { dg-final { scan-assembler-times {vst4[0123].8\t.q[0-9]+, q[0-9]+, q[0-9]+, q[0-9]+., } 8 } } */
     139  /* { dg-final { scan-assembler-times {vst4[0123].16\t.q[0-9]+, q[0-9]+, q[0-9]+, q[0-9]+., } 12 } } */
     140  /* { dg-final { scan-assembler-times {vst4[0123].32\t.q[0-9]+, q[0-9]+, q[0-9]+, q[0-9]+., } 12 } } */