1  /* { dg-do compile } */
       2  /* { dg-options "-O2 -ftree-vectorize -msve-vector-bits=scalable -ffast-math" } */
       3  
       4  #include <stdint.h>
       5  
       /* VEC_PERM (TYPE) defines vec_slp_TYPE (a, b, n), marked noinline and
          noclone.  Four accumulators x0..x3 are seeded from b[0..3]; for each
          i in [0, n) accumulator k adds a[i * 4 + k]; the totals are then
          stored back to b[0..3].  Both pointers are restrict-qualified.
          The comment is kept outside the macro body so that the backslash
          line continuations are left untouched.  */
       6  #define VEC_PERM(TYPE)						\
       7  void __attribute__ ((noinline, noclone))			\
       8  vec_slp_##TYPE (TYPE *restrict a, TYPE *restrict b, int n)	\
       9  {								\
      10    TYPE x0 = b[0];						\
      11    TYPE x1 = b[1];						\
      12    TYPE x2 = b[2];						\
      13    TYPE x3 = b[3];						\
      14    for (int i = 0; i < n; ++i)					\
      15      {								\
      16        x0 += a[i * 4];						\
      17        x1 += a[i * 4 + 1];					\
      18        x2 += a[i * 4 + 2];					\
      19        x3 += a[i * 4 + 3];					\
      20      }								\
      21    b[0] = x0;							\
      22    b[1] = x1;							\
      23    b[2] = x2;							\
      24    b[3] = x3;							\
      25  }
      26  
      /* TEST_ALL (T) expands the macro T once for each element type under
         test: signed and unsigned 8-, 16- and 32-bit integers, _Float16 and
         float.  No 64-bit types are listed.  */
      27  #define TEST_ALL(T)				\
      28    T (int8_t)					\
      29    T (uint8_t)					\
      30    T (int16_t)					\
      31    T (uint16_t)					\
      32    T (int32_t)					\
      33    T (uint32_t)					\
      34    T (_Float16)					\
      35    T (float)
      36  
      /* Instantiate one vec_slp_<type> function per type listed in TEST_ALL
         (vec_slp_int8_t through vec_slp_float, eight functions in total).  */
      37  TEST_ALL (VEC_PERM)
      38  
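      39  /* We have two loads per loop, one for the initial vector and one for
      40     the loop body.  NOTE(review): the ld1 counts expected below equal
             one per instantiated function for each element size (2 byte,
             3 halfword, 3 word), so presumably only one of those two loads
             is matched as an ld1 -- confirm against the generated
             assembly.  */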
      41  /* { dg-final { scan-assembler-times {\tld1b\t} 2 } } */
      42  /* { dg-final { scan-assembler-times {\tld1h\t} 3 } } */
      43  /* { dg-final { scan-assembler-times {\tld1w\t} 3 } } */
      44  /* { dg-final { scan-assembler-not {\tld4b\t} } } */
      45  /* { dg-final { scan-assembler-not {\tld4h\t} } } */
      46  /* { dg-final { scan-assembler-not {\tld4w\t} } } */
      47  /* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.b} 8 } } */
      48  /* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.h} 8 } } */
      49  /* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.s} 8 } } */
      50  /* { dg-final { scan-assembler-times {\tfaddv\th[0-9]+, p[0-7], z[0-9]+\.h} 4 } } */
      51  /* { dg-final { scan-assembler-times {\tfaddv\ts[0-9]+, p[0-7], z[0-9]+\.s} 4 } } */
      52  
      53  /* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.b} 4 } } */
      54  /* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.h} 6 } } */
      55  /* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.s} 6 } } */
      56  
      57  /* { dg-final { scan-assembler-not {\tuqdec} } } */