1 /* { dg-options "-O" } */
2
3 #pragma GCC target "arch=armv8.2-a+dotprod"
4
5 #include <arm_neon.h>
6
/* Collapse the four 32-bit lanes of `a` into one scalar by adding them. */
static inline uint32_t horizontal_add_uint32x4(const uint32x4_t a) {
  const uint32_t lane_total = vaddvq_u32(a);

  return lane_total;
}
10
/*
 * Sum of absolute differences between a source block and the rounding
 * average of a reference block with a second prediction.
 *
 * src_ptr / ref_ptr are stepped by src_stride / ref_stride after every row,
 * while second_pred is consumed contiguously, 32 bytes per inner iteration.
 * Both loops are do-while loops, so at least one 32-byte chunk of one row is
 * always processed; the row loop only stops when the counter reaches exactly
 * zero.  Presumably callers pass w as a multiple of 32 and h >= 1 -- confirm
 * against the call sites.
 *
 * Returns the total of all per-byte absolute differences.
 */
static inline unsigned int sadwxh_avg_neon(const uint8_t *src_ptr,
                                           int src_stride,
                                           const uint8_t *ref_ptr,
                                           int ref_stride, int w, int h,
                                           const uint8_t *second_pred) {
  /* A dot product against all-ones adds each group of four bytes into the
     matching 32-bit accumulator lane. */
  const uint8x16_t ones = vdupq_n_u8(1);

  /* One accumulator per 16-byte half of every 32-byte chunk. */
  uint32x4_t acc_lo = vdupq_n_u32(0);
  uint32x4_t acc_hi = vdupq_n_u32(0);

  int rows_left = h;

  do {
    int col = 0;

    do {
      /* Lower 16 bytes of the chunk. */
      const uint8x16_t src_lo = vld1q_u8(src_ptr + col);
      const uint8x16_t ref_lo = vld1q_u8(ref_ptr + col);
      const uint8x16_t pred_lo = vld1q_u8(second_pred);
      const uint8x16_t absdiff_lo =
          vabdq_u8(src_lo, vrhaddq_u8(ref_lo, pred_lo));
      acc_lo = vdotq_u32(acc_lo, absdiff_lo, ones);

      /* Upper 16 bytes of the chunk. */
      const uint8x16_t src_hi = vld1q_u8(src_ptr + col + 16);
      const uint8x16_t ref_hi = vld1q_u8(ref_ptr + col + 16);
      const uint8x16_t pred_hi = vld1q_u8(second_pred + 16);
      const uint8x16_t absdiff_hi =
          vabdq_u8(src_hi, vrhaddq_u8(ref_hi, pred_hi));
      acc_hi = vdotq_u32(acc_hi, absdiff_hi, ones);

      second_pred += 32;
      col += 32;
    } while (col < w);

    ref_ptr += ref_stride;
    src_ptr += src_stride;
  } while (--rows_left != 0);

  /* Merge both accumulators, then reduce across lanes. */
  return horizontal_add_uint32x4(vaddq_u32(acc_lo, acc_hi));
}
50
/*
 * 32-column variant: forwards every argument to sadwxh_avg_neon with the
 * width fixed at 32 and returns its result unchanged.
 */
static inline unsigned int sad32xh_avg_neon(const uint8_t *src_ptr,
                                            int src_stride,
                                            const uint8_t *ref_ptr,
                                            int ref_stride, int h,
                                            const uint8_t *second_pred) {
  const int block_width = 32;

  return sadwxh_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride,
                         block_width, h, second_pred);
}
59
/*
 * Externally visible entry point for the 32x16 case: calls sad32xh_avg_neon
 * with the height fixed at 16 and returns its result.
 */
uint32_t vpx_sad32x16_avg_neon(const uint8_t *src, int src_stride,
                               const uint8_t *ref, int ref_stride,
                               const uint8_t *second_pred) {
  const int block_height = 16;

  return sad32xh_avg_neon(src, src_stride, ref, ref_stride, block_height,
                          second_pred);
}