; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license and patent
; grant that can be found in the LICENSE file in the root of the source
; tree. All contributing project authors may be found in the AUTHORS
; file in the root of the source tree.
11 EXPORT |vp8_variance16x16_neon|
12 EXPORT |vp8_variance16x8_neon|
13 EXPORT |vp8_variance8x16_neon|
14 EXPORT |vp8_variance8x8_neon|
20 AREA ||.text||, CODE, READONLY, ALIGN=2
22 ; r0 unsigned char *src_ptr
23 ; r1 int source_stride
24 ; r2 unsigned char *ref_ptr
26 ; stack unsigned int *sse
;-----------------------------------------------------------------------
; unsigned int vp8_variance16x16_neon(unsigned char *src_ptr,
;                                     int source_stride,
;                                     unsigned char *ref_ptr,
;                                     int recon_stride,
;                                     unsigned int *sse)
; Out:   r0 = variance = sse - (sum*sum)/256; *sse written through stack arg.
; Clobb: r12, q0-q5, q8-q14, flags.
;-----------------------------------------------------------------------
|vp8_variance16x16_neon| PROC
    vmov.i8         q8, #0                      ;q8 - sum
    vmov.i8         q9, #0                      ;q9, q10 - sse
    vmov.i8         q10, #0

    mov             r12, #8                     ;8 iterations x 2 rows = 16 rows

variance16x16_neon_loop
    vld1.8          {q0}, [r0], r1              ;Load up source and reference
    vld1.8          {q2}, [r2], r3
    vld1.8          {q1}, [r0], r1
    vld1.8          {q3}, [r2], r3

    vsubl.u8        q11, d0, d4                 ;calculate diff (widened to s16)
    vsubl.u8        q12, d1, d5
    vsubl.u8        q13, d2, d6
    vsubl.u8        q14, d3, d7

    ;VPADAL adds adjacent pairs of elements of a vector, and accumulates
    ;the results into the elements of the destination vector. The explanation
    ;in ARM guide is wrong.
    vpadal.s16      q8, q11                     ;calculate sum
    vmlal.s16       q9, d22, d22                ;calculate sse
    vmlal.s16       q10, d23, d23

    subs            r12, r12, #1                ;count down rows-pair iterations

    vpadal.s16      q8, q12
    vmlal.s16       q9, d24, d24
    vmlal.s16       q10, d25, d25

    vpadal.s16      q8, q13
    vmlal.s16       q9, d26, d26
    vmlal.s16       q10, d27, d27

    vpadal.s16      q8, q14
    vmlal.s16       q9, d28, d28
    vmlal.s16       q10, d29, d29

    bne             variance16x16_neon_loop

    vadd.u32        q10, q9, q10                ;accumulate sse
    vpaddl.s32      q0, q8                      ;accumulate sum

    ldr             r12, [sp]                   ;load *sse from stack

    vpaddl.u32      q1, q10
    vadd.s64        d0, d0, d1                  ;d0 = total sum
    vadd.u64        d1, d2, d3                  ;d1 = total sse

    ;sum is in [-255x256, 255x256]. sumxsum is 32-bit. Shift to right should
    ;have sign-bit extension, which is vshr.s. Have to use s32 to make it right.
    vmull.s32       q5, d0, d0                  ;sum * sum
    vst1.32         {d1[0]}, [r12]              ;store sse
    vshr.s32        d10, d10, #8                ;(sum*sum)/256
    vsub.s32        d0, d1, d10                 ;variance = sse - (sum*sum)/256

    vmov.32         r0, d0[0]                   ;return
    bx              lr

    ENDP
;================================
;unsigned int vp8_variance16x8_c(
;    unsigned char *src_ptr,
;    int  source_stride,
;    unsigned char *ref_ptr,
;    int  recon_stride,
;    unsigned int *sse)
; Out:   r0 = variance = sse - (sum*sum)/128; *sse written through stack arg.
; Clobb: r12, q0-q5, q8-q14, flags.
|vp8_variance16x8_neon| PROC
    vmov.i8         q8, #0                      ;q8 - sum
    vmov.i8         q9, #0                      ;q9, q10 - sse
    vmov.i8         q10, #0

    mov             r12, #4                     ;4 iterations x 2 rows = 8 rows

variance16x8_neon_loop
    vld1.8          {q0}, [r0], r1              ;Load up source and reference
    vld1.8          {q2}, [r2], r3
    vld1.8          {q1}, [r0], r1
    vld1.8          {q3}, [r2], r3

    vsubl.u8        q11, d0, d4                 ;calculate diff (widened to s16)
    vsubl.u8        q12, d1, d5
    vsubl.u8        q13, d2, d6
    vsubl.u8        q14, d3, d7

    vpadal.s16      q8, q11                     ;calculate sum
    vmlal.s16       q9, d22, d22                ;calculate sse
    vmlal.s16       q10, d23, d23

    subs            r12, r12, #1                ;count down rows-pair iterations

    vpadal.s16      q8, q12
    vmlal.s16       q9, d24, d24
    vmlal.s16       q10, d25, d25

    vpadal.s16      q8, q13
    vmlal.s16       q9, d26, d26
    vmlal.s16       q10, d27, d27

    vpadal.s16      q8, q14
    vmlal.s16       q9, d28, d28
    vmlal.s16       q10, d29, d29

    bne             variance16x8_neon_loop

    vadd.u32        q10, q9, q10                ;accumulate sse
    vpaddl.s32      q0, q8                      ;accumulate sum

    ldr             r12, [sp]                   ;load *sse from stack

    vpaddl.u32      q1, q10
    vadd.s64        d0, d0, d1                  ;d0 = total sum
    vadd.u64        d1, d2, d3                  ;d1 = total sse

    vmull.s32       q5, d0, d0                  ;sum * sum
    vst1.32         {d1[0]}, [r12]              ;store sse
    vshr.s32        d10, d10, #7                ;(sum*sum)/128
    vsub.s32        d0, d1, d10                 ;variance = sse - (sum*sum)/128

    vmov.32         r0, d0[0]                   ;return
    bx              lr

    ENDP
;=================================
;unsigned int vp8_variance8x16_c(
;    unsigned char *src_ptr,
;    int  source_stride,
;    unsigned char *ref_ptr,
;    int  recon_stride,
;    unsigned int *sse)
; Out:   r0 = variance = sse - (sum*sum)/128; *sse written through stack arg.
; Clobb: r12, q0-q5, q8-q12, flags (rows are 8 px wide: d-register loads).
|vp8_variance8x16_neon| PROC
    vmov.i8         q8, #0                      ;q8 - sum
    vmov.i8         q9, #0                      ;q9, q10 - sse
    vmov.i8         q10, #0

    mov             r12, #8                     ;8 iterations x 2 rows = 16 rows

variance8x16_neon_loop
    vld1.8          {d0}, [r0], r1              ;Load up source and reference
    vld1.8          {d4}, [r2], r3
    vld1.8          {d2}, [r0], r1
    vld1.8          {d6}, [r2], r3

    vsubl.u8        q11, d0, d4                 ;calculate diff (widened to s16)
    vsubl.u8        q12, d2, d6

    vpadal.s16      q8, q11                     ;calculate sum
    vmlal.s16       q9, d22, d22                ;calculate sse
    vmlal.s16       q10, d23, d23

    subs            r12, r12, #1                ;count down rows-pair iterations

    vpadal.s16      q8, q12
    vmlal.s16       q9, d24, d24
    vmlal.s16       q10, d25, d25

    bne             variance8x16_neon_loop

    vadd.u32        q10, q9, q10                ;accumulate sse
    vpaddl.s32      q0, q8                      ;accumulate sum

    ldr             r12, [sp]                   ;load *sse from stack

    vpaddl.u32      q1, q10
    vadd.s64        d0, d0, d1                  ;d0 = total sum
    vadd.u64        d1, d2, d3                  ;d1 = total sse

    vmull.s32       q5, d0, d0                  ;sum * sum
    vst1.32         {d1[0]}, [r12]              ;store sse
    vshr.s32        d10, d10, #7                ;(sum*sum)/128
    vsub.s32        d0, d1, d10                 ;variance = sse - (sum*sum)/128

    vmov.32         r0, d0[0]                   ;return
    bx              lr

    ENDP
;==================================
; r0    unsigned char *src_ptr
; r1    int source_stride
; r2    unsigned char *ref_ptr
; r3    int recon_stride
; stack unsigned int *sse
; Out:   r0 = variance = sse - (sum*sum)/64; *sse written through stack arg.
; Clobb: r12, q0-q5, q8-q14, flags.
|vp8_variance8x8_neon| PROC
    vmov.i8         q8, #0                      ;q8 - sum
    vmov.i8         q9, #0                      ;q9, q10 - sse
    vmov.i8         q10, #0

    mov             r12, #2                     ;2 iterations x 4 rows = 8 rows

variance8x8_neon_loop
    vld1.8          {d0}, [r0], r1              ;Load up source and reference
    vld1.8          {d4}, [r2], r3
    vld1.8          {d1}, [r0], r1
    vld1.8          {d5}, [r2], r3
    vld1.8          {d2}, [r0], r1
    vld1.8          {d6}, [r2], r3
    vld1.8          {d3}, [r0], r1
    vld1.8          {d7}, [r2], r3

    vsubl.u8        q11, d0, d4                 ;calculate diff (widened to s16)
    vsubl.u8        q12, d1, d5
    vsubl.u8        q13, d2, d6
    vsubl.u8        q14, d3, d7

    vpadal.s16      q8, q11                     ;calculate sum
    vmlal.s16       q9, d22, d22                ;calculate sse
    vmlal.s16       q10, d23, d23

    subs            r12, r12, #1                ;count down 4-row iterations

    vpadal.s16      q8, q12
    vmlal.s16       q9, d24, d24
    vmlal.s16       q10, d25, d25

    vpadal.s16      q8, q13
    vmlal.s16       q9, d26, d26
    vmlal.s16       q10, d27, d27

    vpadal.s16      q8, q14
    vmlal.s16       q9, d28, d28
    vmlal.s16       q10, d29, d29

    bne             variance8x8_neon_loop

    vadd.u32        q10, q9, q10                ;accumulate sse
    vpaddl.s32      q0, q8                      ;accumulate sum

    ldr             r12, [sp]                   ;load *sse from stack

    vpaddl.u32      q1, q10
    vadd.s64        d0, d0, d1                  ;d0 = total sum
    vadd.u64        d1, d2, d3                  ;d1 = total sse

    vmull.s32       q5, d0, d0                  ;sum * sum
    vst1.32         {d1[0]}, [r12]              ;store sse
    vshr.s32        d10, d10, #6                ;(sum*sum)/64
    vsub.s32        d0, d1, d10                 ;variance = sse - (sum*sum)/64

    vmov.32         r0, d0[0]                   ;return
    bx              lr

    ENDP