2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
12 EXPORT |vp8_sub_pixel_variance8x8_neon|
17 AREA ||.text||, CODE, READONLY, ALIGN=2
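18 ; r0 unsigned char *src_ptr,
19 ; r1 int src_pixels_per_line,
; r2 xoffset (scaled by 8 to select the first-pass filter; 0 skips the first pass),
; r3 yoffset (scaled by 8 to select the second-pass filter; 0 skips the second pass),
22 ; stack(r4) unsigned char *dst_ptr,
23 ; stack(r5) int dst_pixels_per_line,
24 ; stack(lr) unsigned int *sse
25 ; Note: most of this code is copied from bilinear_predict8x8_neon and vp8_variance8x8_neon.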
;-----------------------------------------------------------------------
; vp8_sub_pixel_variance8x8_neon
;
; Stages visible in this view:
;   1. Optional horizontal 2-tap filter, selected by r2 (xoffset), writing
;      rows of eight bytes to d22-d30.
;   2. Optional vertical 2-tap filter, selected by r3 (yoffset), combining
;      adjacent rows and writing eight rows back to d22-d29.
;   3. Variance stage: dst rows are loaded through r4 (stride r5), widened
;      differences are summed into q8 and their squares into q9/q10.
;
; In:   r0 = src_ptr, r1 = src_pixels_per_line (post-increment stride)
;       r2 = xoffset, r3 = yoffset (0 skips the corresponding filter)
;       stack arguments are loaded into r4 (dst_ptr), r5 (dst stride) and
;       lr (sse pointer)
; Out:  r0 = low 32 bits of d0; one 32-bit word is stored through lr
;
; NOTE(review): this view appears to be missing instructions. Not visible
; here: the save/restore of r4, r5 and lr, the setup of d1, most of the
; multiply-accumulate steps, the loop counter, the final combination of
; sum and sse, and the return. Confirm each against the complete source
; before changing anything.
; NOTE(review): q4-q7 (d8-d15) are written below. AAPCS lists d8-d15 as
; callee-saved and no vpush/vpop is visible - confirm they are preserved.
;-----------------------------------------------------------------------
27 |vp8_sub_pixel_variance8x8_neon| PROC
; r12 = address of the filter-tap table; reused for both passes
30 adr r12, bilinear_taps_coeff
; The stack arguments are read at sp+12 and up.
; NOTE(review): presumably 12 bytes of registers were pushed on entry; the
; push itself is not visible here - confirm.
31 ldr r4, [sp, #12] ;load *dst_ptr from stack
32 ldr r5, [sp, #16] ;load dst_pixels_per_line from stack
33 ldr lr, [sp, #20] ;load *sse from stack
35 cmp r2, #0 ;skip first_pass filter if xoffset=0
36 beq skip_firstpass_filter
38 ;First pass: output_height lines x output_width columns (9x8)
39 add r2, r12, r2, lsl #3 ;r2 = table base + xoffset * 8 bytes
; Each q-register load fetches 16 source bytes and then advances r0 by r1.
41 vld1.u8 {q1}, [r0], r1 ;load src data
42 vld1.u32 {d31}, [r2] ;load first_pass filter (8 bytes)
43 vld1.u8 {q2}, [r0], r1
; Broadcast byte 0 of d31 into every lane of d0.
; NOTE(review): d1 is used as the second tap below but its setup is not
; visible in this view - confirm.
44 vdup.8 d0, d31[0] ;first_pass filter (d0 d1)
45 vld1.u8 {q3}, [r0], r1
47 vld1.u8 {q4}, [r0], r1
; q6 = widening product of the first 8 bytes of the row with d0.
49 vmull.u8 q6, d2, d0 ;(src_ptr[0] * Filter[0])
; vext with #1 takes bytes 1..8 of the d2:d3 pair, i.e. the row shifted by
; one pixel (src_ptr[1]), which is what the vmlal below consumes.
54 vext.8 d3, d2, d3, #1 ;construct src_ptr[1]
59 vmlal.u8 q6, d3, d1 ;(src_ptr[1] * Filter[1])
; The loads for the next rows are interleaved with narrowing the previous
; results: rounding shift right by 7, saturating to unsigned 8-bit.
; NOTE(review): only q6 is visibly computed above; the instructions that
; produce q7-q9 are not visible in this view.
64 vld1.u8 {q1}, [r0], r1 ;load src data
65 vqrshrn.u16 d22, q6, #7 ;shift/round/saturate to u8
66 vld1.u8 {q2}, [r0], r1
67 vqrshrn.u16 d23, q7, #7
68 vld1.u8 {q3}, [r0], r1
69 vqrshrn.u16 d24, q8, #7
70 vld1.u8 {q4}, [r0], r1
71 vqrshrn.u16 d25, q9, #7
73 ;first_pass filtering on the rest 5-line data
74 vld1.u8 {q5}, [r0], r1
76 vmull.u8 q6, d2, d0 ;(src_ptr[0] * Filter[0])
; Same one-pixel shift as in the first group of rows (bytes 1..8).
82 vext.8 d3, d2, d3, #1 ;construct src_ptr[1]
86 vext.8 d11, d10, d11, #1
88 vmlal.u8 q6, d3, d1 ;(src_ptr[1] * Filter[1])
; Narrow the remaining five rows into d26-d30.
; NOTE(review): the instructions that produce q7-q10 for this group are not
; visible in this view.
94 vqrshrn.u16 d26, q6, #7 ;shift/round/saturate to u8
95 vqrshrn.u16 d27, q7, #7
96 vqrshrn.u16 d28, q8, #7
97 vqrshrn.u16 d29, q9, #7
98 vqrshrn.u16 d30, q10, #7
; Second pass: vertical filter over the rows held in d22 and up.
102 cmp r3, #0 ;skip second_pass filter if yoffset=0
103 ;skip_secondpass_filter
104 beq sub_pixel_variance8x8_neon
106 add r3, r12, r3, lsl #3 ;r3 = table base + yoffset * 8 bytes
108 vld1.u32 {d31}, [r3] ;load second_pass filter
; NOTE(review): as in the first pass, the setup of d1 is not visible here.
110 vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1)
; Output row 0 = row 0 * d0 + row 1 * d1 (d22 and d23 are adjacent rows).
113 vmull.u8 q1, d22, d0 ;(src_ptr[0] * Filter[0])
122 vmlal.u8 q1, d23, d1 ;(src_ptr[pixel_step] * Filter[1])
; Narrow eight filtered rows back into d22-d29.
; NOTE(review): only q1 is visibly computed above; q2-q8 are not.
131 vqrshrn.u16 d22, q1, #7 ;shift/round/saturate to u8
132 vqrshrn.u16 d23, q2, #7
133 vqrshrn.u16 d24, q3, #7
134 vqrshrn.u16 d25, q4, #7
135 vqrshrn.u16 d26, q5, #7
136 vqrshrn.u16 d27, q6, #7
137 vqrshrn.u16 d28, q7, #7
138 vqrshrn.u16 d29, q8, #7
140 b sub_pixel_variance8x8_neon
142 ;--------------------
; xoffset == 0: load nine unfiltered rows of eight bytes into d22-d30.
; NOTE(review): as visible here, execution falls through into the variance
; stage below without reaching the second pass; a branch may be missing
; from this view - confirm.
143 skip_firstpass_filter
144 vld1.u8 {d22}, [r0], r1 ;load src data
145 vld1.u8 {d23}, [r0], r1
146 vld1.u8 {d24}, [r0], r1
147 vld1.u8 {d25}, [r0], r1
148 vld1.u8 {d26}, [r0], r1
149 vld1.u8 {d27}, [r0], r1
150 vld1.u8 {d28}, [r0], r1
151 vld1.u8 {d29}, [r0], r1
152 vld1.u8 {d30}, [r0], r1
156 ;----------------------
157 ;vp8_variance8x8_neon
158 sub_pixel_variance8x8_neon
; Clear the accumulators.
; NOTE(review): q10 is accumulated into below but its zeroing is not
; visible in this view - confirm.
159 vmov.i8 q8, #0 ;q8 - sum
160 vmov.i8 q9, #0 ;q9, q10 - sse
165 sub_pixel_variance8x8_neon_loop
; Load four dst rows of eight bytes, advancing r4 by r5 after each row.
166 vld1.8 {d0}, [r4], r5 ;load dst data
168 vld1.8 {d1}, [r4], r5
169 vld1.8 {d2}, [r4], r5
; q4 = d22 - d0, widened to 16-bit lanes.
170 vsubl.u8 q4, d22, d0 ;calculate diff
171 vld1.8 {d3}, [r4], r5
; Pairwise-add the 16-bit differences into the 32-bit lanes of q8.
176 vpadal.s16 q8, q4 ;sum
; Accumulate squared differences; d8..d15 are the halves of q4..q7.
; NOTE(review): the subtractions that fill q5-q7, and their vpadal into q8,
; are not visible in this view.
177 vmlal.s16 q9, d8, d8 ;sse
178 vmlal.s16 q10, d9, d9
183 vmlal.s16 q9, d10, d10
184 vmlal.s16 q10, d11, d11
189 vmlal.s16 q9, d12, d12
190 vmlal.s16 q10, d13, d13
195 vmlal.s16 q9, d14, d14
196 vmlal.s16 q10, d15, d15
; NOTE(review): no instruction that sets the flags for this branch (loop
; counter setup or decrement) is visible in this view - confirm.
198 bne sub_pixel_variance8x8_neon_loop
; Merge the two squared-difference accumulators into q10.
200 vadd.u32 q10, q9, q10 ;accumulate sse
; Pairwise-add q8's 32-bit lanes into the two 64-bit lanes of q0.
201 vpaddl.s32 q0, q8 ;accumulate sum
; Store the low 32 bits of d1 through the sse pointer.
; NOTE(review): the instructions that reduce q10 into d1 are not visible.
208 vst1.32 {d1[0]}, [lr] ;store sse
; Logical shift right by 6 (divide by 64) of each 32-bit lane of d10.
; NOTE(review): the instruction that fills d10, and the one that combines
; it into d0, are not visible in this view.
209 vshr.u32 d10, d10, #6
212 vmov.32 r0, d0[0] ;return
; Filter-tap table: eight (Filter[0], Filter[1]) pairs of 32-bit words; each
; pair sums to 128, matching the rounding shift right by 7 applied after
; filtering. The code selects a pair by adding offset * 8 bytes to the base.
; NOTE(review): presumably this is the bilinear_taps_coeff data addressed by
; the adr at function entry; its label is not visible in this view - confirm.
220 DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112