/* { dg-do compile { target { lp64 } } } */
/* { dg-additional-options "-O3 -march=armv8.2-a+crypto -fno-schedule-insns -fno-schedule-insns2 -mcmodel=small" } */
/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */

#include <arm_neon.h>
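/* The tests below check the code generated when the same constant is
   needed in both scalar and vector form.  The expected assembly shares a
   single materialization of the constant (a literal-pool load in tests
   1-3, a mov/movk sequence in test 4) between the scalar and vector uses
   instead of building it twice.  */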

/*
**test1:
**	adrp	x[0-9]+, .LC[0-9]+
**	ldr	q[0-9]+, \[x[0-9]+, #:lo12:.LC[0-9]+\]
**	add	v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d
**	str	q[0-9]+, \[x[0-9]+\]
**	fmov	x[0-9]+, d[0-9]+
**	orr	x[0-9]+, x[0-9]+, x[0-9]+
**	ret
*/
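/* The 64-bit constant feeds both the scalar OR and the vector ADD: expect
   one literal-pool load, with the scalar value taken from lane 0 via fmov
   rather than loaded or built a second time.  */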

uint64_t
test1 (uint64_t a, uint64x2_t b, uint64x2_t* rt)
{
  uint64_t arr[2] = { 0x0942430810234076UL, 0x0942430810234076UL };
  uint64_t res = a | arr[0];
  uint64x2_t val = vld1q_u64 (arr);
  *rt = vaddq_u64 (val, b);
  return res;
}

/*
**test2:
**	adrp	x[0-9]+, .LC[0-9]+
**	ldr	q[0-9]+, \[x[0-9]+, #:lo12:.LC[0-9]+\]
**	add	v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d
**	str	q[0-9]+, \[x[0-9]+\]
**	fmov	x[0-9]+, d[0-9]+
**	orr	x[0-9]+, x[0-9]+, x[0-9]+
**	ret
*/
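/* Same as test1, but the constant is created with vdupq_n_u64 and the
   scalar copy comes from vgetq_lane_u64; the expected code is identical.  */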

uint64_t
test2 (uint64_t a, uint64x2_t b, uint64x2_t* rt)
{
  uint64x2_t val = vdupq_n_u64 (0x0424303242234076UL);
  uint64_t arr = vgetq_lane_u64 (val, 0);
  uint64_t res = a | arr;
  *rt = vaddq_u64 (val, b);
  return res;
}

/*
**test3:
**	adrp	x[0-9]+, .LC[0-9]+
**	ldr	q[0-9]+, \[x[0-9]+, #:lo12:.LC[0-9]+\]
**	add	v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
**	str	q[0-9]+, \[x[0-9]+\]
**	fmov	w[0-9]+, s[0-9]+
**	orr	w[0-9]+, w[0-9]+, w[0-9]+
**	ret
*/
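/* 32-bit variant of test1: the lane-0 extraction uses fmov w, s and the
   OR operates on w registers.  */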

uint32_t
test3 (uint32_t a, uint32x4_t b, uint32x4_t* rt)
{
  uint32_t arr[4] = { 0x094243, 0x094243, 0x094243, 0x094243 };
  uint32_t res = a | arr[0];
  uint32x4_t val = vld1q_u32 (arr);
  *rt = vaddq_u32 (val, b);
  return res;
}

/*
**test4:
**	ushr	v[0-9]+.16b, v[0-9]+.16b, 7
**	mov	x[0-9]+, 16512
**	movk	x[0-9]+, 0x1020, lsl 16
**	movk	x[0-9]+, 0x408, lsl 32
**	movk	x[0-9]+, 0x102, lsl 48
**	fmov	d[0-9]+, x[0-9]+
**	pmull	v[0-9]+.1q, v[0-9]+.1d, v[0-9]+.1d
**	dup	v[0-9]+.2d, v[0-9]+.d\[0\]
**	pmull2	v[0-9]+.1q, v[0-9]+.2d, v[0-9]+.2d
**	trn2	v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
**	umov	w[0-9]+, v[0-9]+.h\[3\]
**	ret
*/
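/* Movemask-style reduction: USHR extracts each byte's sign bit, then two
   carry-less multiplies against the 0x0102040810204080 mask pack the bits.
   The mask is built once with mov/movk (16512 is 0x4080) and transferred
   to the vector side with fmov, so no literal-pool load is expected.  */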

uint64_t
test4 (uint8x16_t input)
{
  uint8x16_t bool_input = vshrq_n_u8 (input, 7);
  poly64x2_t mask = vdupq_n_p64 (0x0102040810204080UL);
  poly64_t prodL = vmull_p64 ((poly64_t) vgetq_lane_p64 ((poly64x2_t) bool_input, 0),
			      vgetq_lane_p64 (mask, 0));
  poly64_t prodH = vmull_high_p64 ((poly64x2_t) bool_input, mask);
  uint8x8_t res = vtrn2_u8 ((uint8x8_t) prodL, (uint8x8_t) prodH);
  return vget_lane_u16 ((uint16x4_t) res, 3);
}