/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
#include <arm_neon.h>

/* MSVC has no __builtin_prefetch; stub it out there so the prefetch hints
 * below compile on every toolchain without clobbering the GCC/Clang builtin.
 */
#ifdef _MSC_VER
#define __builtin_prefetch(x)
#endif
17 unsigned int vp8_variance16x16_neon(
18 const unsigned char *src_ptr,
20 const unsigned char *ref_ptr,
24 int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
25 uint32x2_t d0u32, d10u32;
26 int64x1_t d0s64, d1s64;
27 uint8x16_t q0u8, q1u8, q2u8, q3u8;
28 uint16x8_t q11u16, q12u16, q13u16, q14u16;
29 int32x4_t q8s32, q9s32, q10s32;
30 int64x2_t q0s64, q1s64, q5s64;
32 q8s32 = vdupq_n_s32(0);
33 q9s32 = vdupq_n_s32(0);
34 q10s32 = vdupq_n_s32(0);
36 for (i = 0; i < 8; i++) {
37 q0u8 = vld1q_u8(src_ptr);
38 src_ptr += source_stride;
39 q1u8 = vld1q_u8(src_ptr);
40 src_ptr += source_stride;
41 __builtin_prefetch(src_ptr);
43 q2u8 = vld1q_u8(ref_ptr);
44 ref_ptr += recon_stride;
45 q3u8 = vld1q_u8(ref_ptr);
46 ref_ptr += recon_stride;
47 __builtin_prefetch(ref_ptr);
49 q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
50 q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
51 q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
52 q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));
54 d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
55 d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
56 q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
57 q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
58 q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
60 d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
61 d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
62 q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
63 q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
64 q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
66 d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
67 d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
68 q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
69 q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
70 q10s32 = vmlal_s16(q10s32, d27s16, d27s16);
72 d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
73 d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
74 q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
75 q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
76 q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
79 q10s32 = vaddq_s32(q10s32, q9s32);
80 q0s64 = vpaddlq_s32(q8s32);
81 q1s64 = vpaddlq_s32(q10s32);
83 d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
84 d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
86 q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
87 vreinterpret_s32_s64(d0s64));
88 vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
90 d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 8);
91 d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
93 return vget_lane_u32(d0u32, 0);
96 unsigned int vp8_variance16x8_neon(
97 const unsigned char *src_ptr,
99 const unsigned char *ref_ptr,
103 int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
104 uint32x2_t d0u32, d10u32;
105 int64x1_t d0s64, d1s64;
106 uint8x16_t q0u8, q1u8, q2u8, q3u8;
107 uint16x8_t q11u16, q12u16, q13u16, q14u16;
108 int32x4_t q8s32, q9s32, q10s32;
109 int64x2_t q0s64, q1s64, q5s64;
111 q8s32 = vdupq_n_s32(0);
112 q9s32 = vdupq_n_s32(0);
113 q10s32 = vdupq_n_s32(0);
115 for (i = 0; i < 4; i++) { // variance16x8_neon_loop
116 q0u8 = vld1q_u8(src_ptr);
117 src_ptr += source_stride;
118 q1u8 = vld1q_u8(src_ptr);
119 src_ptr += source_stride;
120 __builtin_prefetch(src_ptr);
122 q2u8 = vld1q_u8(ref_ptr);
123 ref_ptr += recon_stride;
124 q3u8 = vld1q_u8(ref_ptr);
125 ref_ptr += recon_stride;
126 __builtin_prefetch(ref_ptr);
128 q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
129 q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
130 q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
131 q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));
133 d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
134 d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
135 q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
136 q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
137 q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
139 d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
140 d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
141 q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
142 q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
143 q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
145 d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
146 d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
147 q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
148 q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
149 q10s32 = vmlal_s16(q10s32, d27s16, d27s16);
151 d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
152 d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
153 q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
154 q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
155 q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
158 q10s32 = vaddq_s32(q10s32, q9s32);
159 q0s64 = vpaddlq_s32(q8s32);
160 q1s64 = vpaddlq_s32(q10s32);
162 d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
163 d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
165 q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
166 vreinterpret_s32_s64(d0s64));
167 vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
169 d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7);
170 d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
172 return vget_lane_u32(d0u32, 0);
175 unsigned int vp8_variance8x16_neon(
176 const unsigned char *src_ptr,
178 const unsigned char *ref_ptr,
182 uint8x8_t d0u8, d2u8, d4u8, d6u8;
183 int16x4_t d22s16, d23s16, d24s16, d25s16;
184 uint32x2_t d0u32, d10u32;
185 int64x1_t d0s64, d1s64;
186 uint16x8_t q11u16, q12u16;
187 int32x4_t q8s32, q9s32, q10s32;
188 int64x2_t q0s64, q1s64, q5s64;
190 q8s32 = vdupq_n_s32(0);
191 q9s32 = vdupq_n_s32(0);
192 q10s32 = vdupq_n_s32(0);
194 for (i = 0; i < 8; i++) { // variance8x16_neon_loop
195 d0u8 = vld1_u8(src_ptr);
196 src_ptr += source_stride;
197 d2u8 = vld1_u8(src_ptr);
198 src_ptr += source_stride;
199 __builtin_prefetch(src_ptr);
201 d4u8 = vld1_u8(ref_ptr);
202 ref_ptr += recon_stride;
203 d6u8 = vld1_u8(ref_ptr);
204 ref_ptr += recon_stride;
205 __builtin_prefetch(ref_ptr);
207 q11u16 = vsubl_u8(d0u8, d4u8);
208 q12u16 = vsubl_u8(d2u8, d6u8);
210 d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
211 d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
212 q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
213 q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
214 q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
216 d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
217 d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
218 q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
219 q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
220 q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
223 q10s32 = vaddq_s32(q10s32, q9s32);
224 q0s64 = vpaddlq_s32(q8s32);
225 q1s64 = vpaddlq_s32(q10s32);
227 d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
228 d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
230 q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
231 vreinterpret_s32_s64(d0s64));
232 vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
234 d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7);
235 d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
237 return vget_lane_u32(d0u32, 0);
240 unsigned int vp8_variance8x8_neon(
241 const unsigned char *src_ptr,
243 const unsigned char *ref_ptr,
247 uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
248 int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
249 uint32x2_t d0u32, d10u32;
250 int64x1_t d0s64, d1s64;
251 uint16x8_t q11u16, q12u16, q13u16, q14u16;
252 int32x4_t q8s32, q9s32, q10s32;
253 int64x2_t q0s64, q1s64, q5s64;
255 q8s32 = vdupq_n_s32(0);
256 q9s32 = vdupq_n_s32(0);
257 q10s32 = vdupq_n_s32(0);
259 for (i = 0; i < 2; i++) { // variance8x8_neon_loop
260 d0u8 = vld1_u8(src_ptr);
261 src_ptr += source_stride;
262 d1u8 = vld1_u8(src_ptr);
263 src_ptr += source_stride;
264 d2u8 = vld1_u8(src_ptr);
265 src_ptr += source_stride;
266 d3u8 = vld1_u8(src_ptr);
267 src_ptr += source_stride;
269 d4u8 = vld1_u8(ref_ptr);
270 ref_ptr += recon_stride;
271 d5u8 = vld1_u8(ref_ptr);
272 ref_ptr += recon_stride;
273 d6u8 = vld1_u8(ref_ptr);
274 ref_ptr += recon_stride;
275 d7u8 = vld1_u8(ref_ptr);
276 ref_ptr += recon_stride;
278 q11u16 = vsubl_u8(d0u8, d4u8);
279 q12u16 = vsubl_u8(d1u8, d5u8);
280 q13u16 = vsubl_u8(d2u8, d6u8);
281 q14u16 = vsubl_u8(d3u8, d7u8);
283 d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
284 d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
285 q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
286 q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
287 q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
289 d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
290 d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
291 q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
292 q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
293 q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
295 d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
296 d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
297 q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
298 q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
299 q10s32 = vmlal_s16(q10s32, d27s16, d27s16);
301 d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
302 d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
303 q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
304 q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
305 q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
308 q10s32 = vaddq_s32(q10s32, q9s32);
309 q0s64 = vpaddlq_s32(q8s32);
310 q1s64 = vpaddlq_s32(q10s32);
312 d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
313 d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
315 q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
316 vreinterpret_s32_s64(d0s64));
317 vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
319 d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 6);
320 d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
322 return vget_lane_u32(d0u32, 0);