/* { dg-do compile { target { lp64 } } } */
/* { dg-additional-options "-O3 -march=armv8.2-a+crypto -fno-schedule-insns -fno-schedule-insns2 -mcmodel=small" } */
/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */

#include <arm_neon.h>
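/* The tests below check the code generated when the same constant is
   needed in both scalar and vector form.  The expected assembly shares a
   single materialization of the constant (a literal-pool load in tests
   1-3, a mov/movk sequence in test 4) between the scalar and vector uses
   instead of building it twice.  */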

/*
**test1:
**	adrp	x[0-9]+, .LC[0-9]+
**	ldr	q[0-9]+, \[x[0-9]+, #:lo12:.LC[0-9]+\]
**	add	v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d
**	str	q[0-9]+, \[x[0-9]+\]
**	fmov	x[0-9]+, d[0-9]+
**	orr	x[0-9]+, x[0-9]+, x[0-9]+
**	ret
*/
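/* The 64-bit constant feeds both the scalar OR and the vector ADD: expect
   one literal-pool load, with the scalar value taken from lane 0 via fmov
   rather than loaded or built a second time.  */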

uint64_t
test1 (uint64_t a, uint64x2_t b, uint64x2_t* rt)
{
  uint64_t arr[2] = { 0x0942430810234076UL, 0x0942430810234076UL };
  uint64_t res = a | arr[0];
  uint64x2_t val = vld1q_u64 (arr);
  *rt = vaddq_u64 (val, b);
  return res;
}

/*
**test2:
**	adrp	x[0-9]+, .LC[0-9]+
**	ldr	q[0-9]+, \[x[0-9]+, #:lo12:.LC[0-9]+\]
**	add	v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d
**	str	q[0-9]+, \[x[0-9]+\]
**	fmov	x[0-9]+, d[0-9]+
**	orr	x[0-9]+, x[0-9]+, x[0-9]+
**	ret
*/
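/* Same as test1, but the constant is created with vdupq_n_u64 and the
   scalar copy comes from vgetq_lane_u64; the expected code is identical.  */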

uint64_t
test2 (uint64_t a, uint64x2_t b, uint64x2_t* rt)
{
  uint64x2_t val = vdupq_n_u64 (0x0424303242234076UL);
  uint64_t arr = vgetq_lane_u64 (val, 0);
  uint64_t res = a | arr;
  *rt = vaddq_u64 (val, b);
  return res;
}

/*
**test3:
**	adrp	x[0-9]+, .LC[0-9]+
**	ldr	q[0-9]+, \[x[0-9]+, #:lo12:.LC[0-9]+\]
**	add	v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
**	str	q[0-9]+, \[x[0-9]+\]
**	fmov	w[0-9]+, s[0-9]+
**	orr	w[0-9]+, w[0-9]+, w[0-9]+
**	ret
*/
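/* 32-bit variant of test1: the lane-0 extraction uses fmov w, s and the
   OR operates on w registers.  */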

uint32_t
test3 (uint32_t a, uint32x4_t b, uint32x4_t* rt)
{
  uint32_t arr[4] = { 0x094243, 0x094243, 0x094243, 0x094243 };
  uint32_t res = a | arr[0];
  uint32x4_t val = vld1q_u32 (arr);
  *rt = vaddq_u32 (val, b);
  return res;
}

/*
**test4:
**	ushr	v[0-9]+.16b, v[0-9]+.16b, 7
**	mov	x[0-9]+, 16512
**	movk	x[0-9]+, 0x1020, lsl 16
**	movk	x[0-9]+, 0x408, lsl 32
**	movk	x[0-9]+, 0x102, lsl 48
**	fmov	d[0-9]+, x[0-9]+
**	pmull	v[0-9]+.1q, v[0-9]+.1d, v[0-9]+.1d
**	dup	v[0-9]+.2d, v[0-9]+.d\[0\]
**	pmull2	v[0-9]+.1q, v[0-9]+.2d, v[0-9]+.2d
**	trn2	v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
**	umov	w[0-9]+, v[0-9]+.h\[3\]
**	ret
*/
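/* Movemask-style reduction: USHR extracts each byte's sign bit, then two
   carry-less multiplies against the 0x0102040810204080 mask pack the bits.
   The mask is built once with mov/movk (16512 is 0x4080) and transferred
   to the vector side with fmov, so no literal-pool load is expected.  */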

uint64_t
test4 (uint8x16_t input)
{
  uint8x16_t bool_input = vshrq_n_u8 (input, 7);
  poly64x2_t mask = vdupq_n_p64 (0x0102040810204080UL);
  poly64_t prodL = vmull_p64 ((poly64_t) vgetq_lane_p64 ((poly64x2_t) bool_input, 0),
			      vgetq_lane_p64 (mask, 0));
  poly64_t prodH = vmull_high_p64 ((poly64x2_t) bool_input, mask);
  uint8x8_t res = vtrn2_u8 ((uint8x8_t) prodL, (uint8x8_t) prodH);
  return vget_lane_u16 ((uint16x4_t) res, 3);
}