1  /* { dg-options "-O2 -msve-vector-bits=512" } */
       2  
/* GNU C vector types; vector_size is given in bytes.  With the
   -msve-vector-bits=512 option above, int32x16_t (64 bytes) is as wide
   as one SVE vector and int32x8_t (32 bytes) is half that width.  */
typedef int int32x16_t __attribute__((vector_size(64)));
typedef int int32x8_t __attribute__((vector_size(32)));
       5  
/* Return the first half of (X | 2) plus the constant vector
   { 1, 2, ..., 8 }.  The OR result is stored through the full-width
   union member and then read back through pair[0], so the addition
   applies only to that half of the ORed value.  */
int32x8_t
f1 (int32x16_t x)
{
  /* Overlay one full-width vector with two half-width vectors.  */
  union u { int32x16_t full; int32x8_t pair[2]; } u;
  u.full = x | 2;
  return u.pair[0] + (int32x8_t) { 1, 2, 3, 4, 5, 6, 7, 8 };
}
      13  
/* Return the second half of (X | 2) plus the constant vector
   { 1, 2, ..., 8 }.  The OR result is stored through the full-width
   union member and then read back through pair[1], so the addition
   applies only to that half of the ORed value.  */
int32x8_t
f2 (int32x16_t x)
{
  /* Overlay one full-width vector with two half-width vectors.  */
  union u { int32x16_t full; int32x8_t pair[2]; } u;
  u.full = x | 2;
  return u.pair[1] + (int32x8_t) { 1, 2, 3, 4, 5, 6, 7, 8 };
}
      21  
/* We could do something more efficient than spilling the int32x16_t and
   reloading the int32x8_t.  The important thing is that we don't do
   something like:
      25  
      26  	orr	z0.s, z0.s, #2
      27  	index	z1.d, #1, #1
      28  	add	z0.s, z0.s, z1.s
      29  	st1w	z0.d, p0, [x8]
      30  
      31     We're supposed to add z1 to one half of the ORR result instead.  */
      32  /* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.d} 2 } } */
      33  /* { dg-final { scan-assembler-times {\tindex\tz[0-9]+\.d} 2 } } */
      34  /* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.d} 2 } } */