1  /* { dg-do compile } */
       2  /* { dg-options "-O2 -ftree-vectorize --param aarch64-sve-compare-costs=0" } */
       3  
       4  #include <stdint.h>
       5  
       /* Define test_<TYPE>_<NAME>, a noinline/noclone function taking four
          __restrict-qualified TYPE pointers X, Y, Z and PRED plus an int
          count N.  For each I in [0, N) it stores Y[I] OP Z[I] to X[I] when
          PRED[I] != 1 and plain Y[I] otherwise.
          NOTE(review): the dg-final scans in this file check the assembly
          generated for these loops, so the loop body presumably needs to
          keep this exact form -- confirm before restructuring.  */
       6  #define TEST(TYPE, NAME, OP)				\
       7    void __attribute__ ((noinline, noclone))		\
       8    test_##TYPE##_##NAME (TYPE *__restrict x,		\
       9  			TYPE *__restrict y,		\
      10  			TYPE *__restrict z,		\
      11  			TYPE *__restrict pred, int n)	\
      12    {							\
      13      for (int i = 0; i < n; ++i)				\
      14        x[i] = pred[i] != 1 ? y[i] OP z[i] : y[i];	\
      15    }
      16  
      /* Instantiate the tests used for integer TYPEs: only the conditional
         division loop (test_<TYPE>_div) is generated.  */
      17  #define TEST_INT_TYPE(TYPE) \
      18    TEST (TYPE, div, /)
      19  
      /* Instantiate the tests used for floating-point TYPEs: conditional
         add, sub, mul and div loops (test_<TYPE>_add, ..., test_<TYPE>_div).  */
      20  #define TEST_FP_TYPE(TYPE) \
      21    TEST (TYPE, add, +) \
      22    TEST (TYPE, sub, -) \
      23    TEST (TYPE, mul, *) \
      24    TEST (TYPE, div, /)
      25  
      /* The full list of instantiations: the integer tests for the signed
         and unsigned 8-, 16-, 32- and 64-bit <stdint.h> types, followed by
         the floating-point tests for float and double.  */
      26  #define TEST_ALL \
      27    TEST_INT_TYPE (int8_t) \
      28    TEST_INT_TYPE (uint8_t) \
      29    TEST_INT_TYPE (int16_t) \
      30    TEST_INT_TYPE (uint16_t) \
      31    TEST_INT_TYPE (int32_t) \
      32    TEST_INT_TYPE (uint32_t) \
      33    TEST_INT_TYPE (int64_t) \
      34    TEST_INT_TYPE (uint64_t) \
      35    TEST_FP_TYPE (float) \
      36    TEST_FP_TYPE (double)
      37  
      /* Expand every instantiation listed in TEST_ALL at file scope, emitting
         one function definition per (type, operation) pair.  */
      38  TEST_ALL
      39  
      40  /* { dg-final { scan-assembler-not {\t.div\tz[0-9]+\.b} } } */
      41  /* { dg-final { scan-assembler-not {\t.div\tz[0-9]+\.h} } } */
      42  /* { dg-final { scan-assembler-times {\tsdiv\tz[0-9]+\.s, p[0-7]/m,} 7 } } */
      43  /* At present we don't vectorize the uint8_t or uint16_t loops because the
      44     division is done directly in the narrow type, rather than being widened
      45     to int first.  */
      46  /* { dg-final { scan-assembler-times {\tudiv\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
      47  /* { dg-final { scan-assembler-times {\tsdiv\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
      48  /* { dg-final { scan-assembler-times {\tudiv\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
      49  
      50  /* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
      51  /* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
      52  
      53  /* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
      54  /* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
      55  
      56  /* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
      57  /* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
      58  
      59  /* { dg-final { scan-assembler-times {\tfdiv\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
      60  /* { dg-final { scan-assembler-times {\tfdiv\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
      61  
      62  /* We fail to optimize away the SEL for the int8_t and int16_t loops,
      63     because the 32-bit result is converted before selection.  */
      64  /* { dg-final { scan-assembler-times {\tsel\t} 2 } } */