2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
;-----------------------------------------------------------------------
; NOTE(review): ARM (armasm/RVCT) syntax. Export the four unfiltered
; variance kernels defined in this file, then open a read-only code
; section aligned to 4 bytes (ALIGN=2 is a power-of-two exponent).
;-----------------------------------------------------------------------
12 EXPORT |vp8_variance16x16_neon|
13 EXPORT |vp8_variance16x8_neon|
14 EXPORT |vp8_variance8x16_neon|
15 EXPORT |vp8_variance8x8_neon|
21 AREA ||.text||, CODE, READONLY, ALIGN=2
;=======================================================================
; unsigned int vp8_variance16x16_neon(unsigned char *src_ptr,
;                                     int source_stride,
;                                     unsigned char *ref_ptr,
;                                     int recon_stride,
;                                     unsigned int *sse)
; Computes sum and sum-of-squared-differences over a 16x16 block,
; stores the SSE through *sse, and returns the variance in r0.
; The fifth argument (*sse) arrives on the stack because r0-r3 are
; consumed by the first four arguments (AAPCS).
; NOTE(review): this listing is incomplete -- the loop-counter setup,
; the q1/q3 load pair, the remaining vsubl diffs, the subs that feeds
; the bne, the sum*sum finalisation, and ENDP are not visible here.
; Comments on those steps are inferred; confirm against the full file.
23 ; r0 unsigned char *src_ptr
24 ; r1 int source_stride
25 ; r2 unsigned char *ref_ptr
27 ; stack unsigned int *sse
28 |vp8_variance16x16_neon| PROC
; Zero the accumulators: q8 gathers the (signed) sum of diffs, q9/q10
; gather the sum of squared diffs in two halves to be folded later.
29 vmov.i8 q8, #0 ;q8 - sum
30 vmov.i8 q9, #0 ;q9, q10 - sse
; Main loop: each visible iteration loads 16-pixel rows of source and
; reference and widens the byte difference to 16 bits.
35 variance16x16_neon_loop
36 vld1.8 {q0}, [r0], r1 ;Load up source and reference
41 vsubl.u8 q11, d0, d4 ;calculate diff
46 ;VPADAL adds adjacent pairs of elements of a vector, and accumulates
47 ;the results into the elements of the destination vector. The explanation
48 ;in ARM guide is wrong.
49 vpadal.s16 q8, q11 ;calculate sum
; vmlal.s16 widens each 16-bit diff, squares it, and accumulates into
; 32-bit lanes -- q11..q14 presumably hold four rows' diffs (the other
; vsubl instructions are not visible in this view).
50 vmlal.s16 q9, d22, d22 ;calculate sse
51 vmlal.s16 q10, d23, d23
56 vmlal.s16 q9, d24, d24
57 vmlal.s16 q10, d25, d25
59 vmlal.s16 q9, d26, d26
60 vmlal.s16 q10, d27, d27
62 vmlal.s16 q9, d28, d28
63 vmlal.s16 q10, d29, d29
; Loop flags come from a subs on the iteration counter (not visible).
65 bne variance16x16_neon_loop
; Fold the two SSE accumulators and widen the sum lanes to 64 bits.
67 vadd.u32 q10, q9, q10 ;accumulate sse
68 vpaddl.s32 q0, q8 ;accumulate sum
70 ldr r12, [sp] ;load *sse from stack
76 ;vmov.32 r0, d0[0] ;this instruction costs a lot
80 ;sub r0, r1, r0, lsr #8
82 ; while sum is signed, sum * sum is always positive and must be treated as
83 ; unsigned to avoid propagating the sign bit.
; NOTE(review): the reduction that leaves the final SSE in d1[0] and the
; variance (presumably sse - sum*sum/256) in d0[0] happens on lines not
; visible in this view -- verify before relying on register roles below.
85 vst1.32 {d1[0]}, [r12] ;store sse
89 vmov.32 r0, d0[0] ;return
94 ;================================
95 ;unsigned int vp8_variance16x8_c(
96 ; unsigned char *src_ptr,
; (int source_stride, then:)
98 ; unsigned char *ref_ptr,
; (int recon_stride, stack: unsigned int *sse -- same argument layout
; as vp8_variance16x16_neon above; parts of this comment block are
; missing from this view.)
; NOTE(review): incomplete listing -- loop-counter setup, the remaining
; vsubl diffs, the subs feeding the bne, the sum finalisation, and ENDP
; are not visible here.
101 |vp8_variance16x8_neon| PROC
; Zero accumulators: q8 = sum of diffs, q9/q10 = two halves of the SSE.
102 vmov.i8 q8, #0 ;q8 - sum
103 vmov.i8 q9, #0 ;q9, q10 - sse
; Each iteration loads two 16-pixel rows of source (q0, q1) and
; reference (q2, q3) with post-incremented strides.
108 variance16x8_neon_loop
109 vld1.8 {q0}, [r0], r1 ;Load up source and reference
110 vld1.8 {q2}, [r2], r3
111 vld1.8 {q1}, [r0], r1
112 vld1.8 {q3}, [r2], r3
114 vsubl.u8 q11, d0, d4 ;calculate diff
; Accumulate pairwise sums of the 16-bit diffs into 32-bit lanes.
119 vpadal.s16 q8, q11 ;calculate sum
; Widening multiply-accumulate of squared diffs (q11..q14 presumably
; hold the four half-row diffs; the other vsubl lines are not visible).
120 vmlal.s16 q9, d22, d22 ;calculate sse
121 vmlal.s16 q10, d23, d23
126 vmlal.s16 q9, d24, d24
127 vmlal.s16 q10, d25, d25
129 vmlal.s16 q9, d26, d26
130 vmlal.s16 q10, d27, d27
132 vmlal.s16 q9, d28, d28
133 vmlal.s16 q10, d29, d29
; Loop flags come from a subs on the counter (not visible in this view).
135 bne variance16x8_neon_loop
137 vadd.u32 q10, q9, q10 ;accumulate sse
138 vpaddl.s32 q0, q8 ;accumulate sum
; *sse is the fifth argument, passed on the stack (AAPCS).
140 ldr r12, [sp] ;load *sse from stack
147 vst1.32 {d1[0]}, [r12] ;store sse
; Shift by 7 divides by 128 = 16*8 pixels; d10 presumably holds sum*sum
; from a vmull.s32 on a line not visible here -- confirm in full file.
148 vshr.u32 d10, d10, #7
151 vmov.32 r0, d0[0] ;return
156 ;=================================
157 ;unsigned int vp8_variance8x16_c(
158 ; unsigned char *src_ptr,
; (int source_stride, then:)
160 ; unsigned char *ref_ptr,
; (int recon_stride, stack: unsigned int *sse -- same argument layout
; as the routines above; parts of this comment block are missing from
; this view.)
; NOTE(review): incomplete listing -- loop-counter setup, the second
; vsubl, the subs feeding the bne, the sum finalisation, and ENDP are
; not visible here.
164 |vp8_variance8x16_neon| PROC
; Zero accumulators: q8 = sum of diffs, q9/q10 = two halves of the SSE.
165 vmov.i8 q8, #0 ;q8 - sum
166 vmov.i8 q9, #0 ;q9, q10 - sse
; 8-pixel-wide block: each iteration loads two 8-byte rows of source
; (d0, d2) and reference (d4, d6).
171 variance8x16_neon_loop
172 vld1.8 {d0}, [r0], r1 ;Load up source and reference
173 vld1.8 {d4}, [r2], r3
174 vld1.8 {d2}, [r0], r1
175 vld1.8 {d6}, [r2], r3
177 vsubl.u8 q11, d0, d4 ;calculate diff
; Accumulate pairwise sums of the 16-bit diffs into 32-bit lanes.
180 vpadal.s16 q8, q11 ;calculate sum
; Squared-diff accumulation; q12 presumably holds the second row's diff
; from a vsubl not visible in this view.
181 vmlal.s16 q9, d22, d22 ;calculate sse
182 vmlal.s16 q10, d23, d23
187 vmlal.s16 q9, d24, d24
188 vmlal.s16 q10, d25, d25
; Loop flags come from a subs on the counter (not visible in this view).
190 bne variance8x16_neon_loop
192 vadd.u32 q10, q9, q10 ;accumulate sse
193 vpaddl.s32 q0, q8 ;accumulate sum
; *sse is the fifth argument, passed on the stack (AAPCS).
195 ldr r12, [sp] ;load *sse from stack
202 vst1.32 {d1[0]}, [r12] ;store sse
; Shift by 7 divides by 128 = 8*16 pixels; d10 presumably holds sum*sum
; from a vmull.s32 on a line not visible here -- confirm in full file.
203 vshr.u32 d10, d10, #7
206 vmov.32 r0, d0[0] ;return
211 ;==================================
; unsigned int vp8_variance8x8_neon(...) -- same contract as the
; routines above, over an 8x8 block.
; NOTE(review): incomplete listing -- loop-counter setup, three of the
; four vsubl diffs, the subs feeding the bne, the sum finalisation, and
; the trailing ENDP/END are not visible here (the routine runs past the
; end of this view). Do not assume this listing is complete.
212 ; r0 unsigned char *src_ptr
213 ; r1 int source_stride
214 ; r2 unsigned char *ref_ptr
215 ; r3 int recon_stride
216 ; stack unsigned int *sse
217 |vp8_variance8x8_neon| PROC
; Zero accumulators: q8 = sum of diffs, q9/q10 = two halves of the SSE.
218 vmov.i8 q8, #0 ;q8 - sum
219 vmov.i8 q9, #0 ;q9, q10 - sse
; Each iteration loads four 8-byte rows of source (d0-d3) and reference
; (d4-d7) with post-incremented strides.
224 variance8x8_neon_loop
225 vld1.8 {d0}, [r0], r1 ;Load up source and reference
226 vld1.8 {d4}, [r2], r3
227 vld1.8 {d1}, [r0], r1
228 vld1.8 {d5}, [r2], r3
229 vld1.8 {d2}, [r0], r1
230 vld1.8 {d6}, [r2], r3
231 vld1.8 {d3}, [r0], r1
232 vld1.8 {d7}, [r2], r3
233 ; (d1-d4, d2-d6, d3-d7 diffs presumably fill q12-q14; those vsubl
; lines are not visible in this view.)
234 vsubl.u8 q11, d0, d4 ;calculate diff
; Accumulate pairwise sums of the 16-bit diffs into 32-bit lanes.
239 vpadal.s16 q8, q11 ;calculate sum
240 vmlal.s16 q9, d22, d22 ;calculate sse
241 vmlal.s16 q10, d23, d23
246 vmlal.s16 q9, d24, d24
247 vmlal.s16 q10, d25, d25
249 vmlal.s16 q9, d26, d26
250 vmlal.s16 q10, d27, d27
252 vmlal.s16 q9, d28, d28
253 vmlal.s16 q10, d29, d29
; Loop flags come from a subs on the counter (not visible in this view).
255 bne variance8x8_neon_loop
257 vadd.u32 q10, q9, q10 ;accumulate sse
258 vpaddl.s32 q0, q8 ;accumulate sum
; *sse is the fifth argument, passed on the stack (AAPCS).
260 ldr r12, [sp] ;load *sse from stack
267 vst1.32 {d1[0]}, [r12] ;store sse
; Shift by 6 divides by 64 = 8*8 pixels; d10 presumably holds sum*sum
; from a vmull.s32 on a line not visible here -- confirm in full file.
268 vshr.u32 d10, d10, #6
271 vmov.32 r0, d0[0] ;return