2 ; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
4 ; Use of this source code is governed by a BSD-style license and patent
5 ; grant that can be found in the LICENSE file in the root of the source
6 ; tree. All contributing project authors may be found in the AUTHORS
7 ; file in the root of the source tree.
11 EXPORT |vp8_sub_pixel_variance8x8_neon|
16 AREA ||.text||, CODE, READONLY, ALIGN=2
17 ; r0 unsigned char *src_ptr,
18 ; r1 int src_pixels_per_line,
21 ; stack(r4) unsigned char *dst_ptr,
22 ; stack(r5) int dst_pixels_per_line,
23 ; stack(r6) unsigned int *sse
24 ;note: most of the code is copied from bilinear_predict8x8_neon and vp8_variance8x8_neon.
;-----------------------------------------------------------------------
; vp8_sub_pixel_variance8x8_neon
; Runs a two-tap filter along each source row (first pass), then a two-tap
; filter between adjacent rows (second pass), and finally accumulates the
; sum and the sum of squares of (filtered source - dst) for an 8x8 block.
;
; Register roles, as used by the visible code:
;   r0  = src_ptr, advanced by r1 (src_pixels_per_line) after every row load
;   r2  = xoffset; 0 skips the first pass
;   r3  = yoffset; 0 skips the second pass
;   r4  = dst_ptr, r5 = dst_pixels_per_line, lr = sse pointer (from the stack)
;   r12 = word loaded from _BilinearTaps_coeff_, used as the tap table base
;   d22-d30 = filtered rows handed from one stage to the next
;
; NOTE(review): the number embedded at the start of each line skips values
; (e.g. 26 -> 29, 48 -> 53, 211 -> 217), so this listing appears to be
; missing instructions. Comments below describe only what is visible and
; flag places where a missing line seems to matter - confirm against the
; complete file before relying on them.
;-----------------------------------------------------------------------
26 |vp8_sub_pixel_variance8x8_neon| PROC
; NOTE(review): the stack offsets #12/#16/#20 below presume 12 bytes were
; pushed before this point, and r4/r5/lr are overwritten; no push or
; matching pop is visible in this listing - confirm.
29 ldr r12, _BilinearTaps_coeff_
30 ldr r4, [sp, #12] ;load *dst_ptr from stack
31 ldr r5, [sp, #16] ;load dst_pixels_per_line from stack
32 ldr lr, [sp, #20] ;load *sse from stack
34 cmp r2, #0 ;skip first_pass filter if xoffset=0
35 beq skip_firstpass_filter
37 ;First pass: output_height lines x output_width columns (9x8)
; r2 = r12 + xoffset * 8: each table entry is a pair of 32-bit taps.
38 add r2, r12, r2, lsl #3 ;calculate filter location
; Each row load fetches 16 bytes so that src_ptr[1..8] is available for the
; second tap of all 8 output columns.
40 vld1.u8 {q1}, [r0], r1 ;load src data
41 vld1.u32 {d31}, [r2] ;load first_pass filter
42 vld1.u8 {q2}, [r0], r1
; d0 = Filter[0] replicated to every byte lane.
; NOTE(review): the matching setup of d1 (Filter[1]) is not visible here.
43 vdup.8 d0, d31[0] ;first_pass filter (d0 d1)
44 vld1.u8 {q3}, [r0], r1
46 vld1.u8 {q4}, [r0], r1
48 vmull.u8 q6, d2, d0 ;(src_ptr[0] * Filter[0])
53 vext.8 d3, d2, d3, #1 ;construct src_ptr[1] (bytes 1..8 of d2:d3)
58 vmlal.u8 q6, d3, d1 ;(src_ptr[1] * Filter[1])
; NOTE(review): only the q6 multiply-accumulate is visible; the products
; narrowed from q7, q8 and q9 below are presumably computed on skipped lines.
; The loads for the next four rows are interleaved with the narrowing.
63 vld1.u8 {q1}, [r0], r1 ;load src data
64 vqrshrn.u16 d22, q6, #7 ;shift/round/saturate to u8
65 vld1.u8 {q2}, [r0], r1
66 vqrshrn.u16 d23, q7, #7
67 vld1.u8 {q3}, [r0], r1
68 vqrshrn.u16 d24, q8, #7
69 vld1.u8 {q4}, [r0], r1
70 vqrshrn.u16 d25, q9, #7
72 ;first_pass filtering on the rest 5-line data
73 vld1.u8 {q5}, [r0], r1
75 vmull.u8 q6, d2, d0 ;(src_ptr[0] * Filter[0])
81 vext.8 d3, d2, d3, #1 ;construct src_ptr[1] (bytes 1..8 of d2:d3)
85 vext.8 d11, d10, d11, #1
87 vmlal.u8 q6, d3, d1 ;(src_ptr[1] * Filter[1])
; First-pass rows 4..8 are narrowed into d26-d30, giving 9 rows in d22-d30.
; NOTE(review): the q7-q10 products are not computed in the visible lines.
93 vqrshrn.u16 d26, q6, #7 ;shift/round/saturate to u8
94 vqrshrn.u16 d27, q7, #7
95 vqrshrn.u16 d28, q8, #7
96 vqrshrn.u16 d29, q9, #7
97 vqrshrn.u16 d30, q10, #7
; Second pass: combines each row with the row below it (d22 with d23, ...).
101 cmp r3, #0 ;skip second_pass filter if yoffset=0
102 ;skip_secondpass_filter
103 beq sub_pixel_variance8x8_neon
; r3 = r12 + yoffset * 8, the same table indexing as the first pass.
105 add r3, r12, r3, lsl #3
107 vld1.u32 {d31}, [r3] ;load second_pass filter
; NOTE(review): as in the first pass, only the d0 setup is visible; d1 is
; presumably set from the second tap on a skipped line.
109 vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1)
112 vmull.u8 q1, d22, d0 ;(src_ptr[0] * Filter[0])
121 vmlal.u8 q1, d23, d1 ;(src_ptr[pixel_step] * Filter[1])
; The 8 output rows overwrite d22-d29.
; NOTE(review): only the q1 row is computed in the visible lines; q2-q8 are
; presumably produced on skipped lines.
130 vqrshrn.u16 d22, q1, #7 ;shift/round/saturate to u8
131 vqrshrn.u16 d23, q2, #7
132 vqrshrn.u16 d24, q3, #7
133 vqrshrn.u16 d25, q4, #7
134 vqrshrn.u16 d26, q5, #7
135 vqrshrn.u16 d27, q6, #7
136 vqrshrn.u16 d28, q7, #7
137 vqrshrn.u16 d29, q8, #7
139 b sub_pixel_variance8x8_neon
141 ;--------------------
; xoffset == 0: copy 9 source rows of 8 bytes straight into d22-d30, the
; same registers the first pass would have filled.
142 skip_firstpass_filter
143 vld1.u8 {d22}, [r0], r1 ;load src data
144 vld1.u8 {d23}, [r0], r1
145 vld1.u8 {d24}, [r0], r1
146 vld1.u8 {d25}, [r0], r1
147 vld1.u8 {d26}, [r0], r1
148 vld1.u8 {d27}, [r0], r1
149 vld1.u8 {d28}, [r0], r1
150 vld1.u8 {d29}, [r0], r1
151 vld1.u8 {d30}, [r0], r1
; NOTE(review): as listed, execution falls through into the variance stage
; without running the second pass; the embedded numbering skips 152-154, so
; confirm whether a branch back to the second-pass code belongs here.
155 ;----------------------
156 ;vp8_variance8x8_neon
; Variance stage: compares the filtered rows in d22.. against dst rows.
157 sub_pixel_variance8x8_neon
158 vmov.i8 q8, #0 ;q8 - sum
; NOTE(review): q10 is accumulated into below, but only q9 is cleared in the
; visible lines.
159 vmov.i8 q9, #0 ;q9, q10 - sse
164 sub_pixel_variance8x8_neon_loop
; Four dst rows are loaded per iteration; r4 advances by r5 after each.
165 vld1.8 {d0}, [r4], r5 ;load dst data
167 vld1.8 {d1}, [r4], r5
168 vld1.8 {d2}, [r4], r5
; q4 = d22 - d0, widened to 16-bit lanes.
; NOTE(review): the differences consumed from q5, q6 and q7 below are not
; computed in the visible lines.
169 vsubl.u8 q4, d22, d0 ;calculate diff
170 vld1.8 {d3}, [r4], r5
; Pairs of 16-bit differences are added into the 32-bit lanes of q8.
175 vpadal.s16 q8, q4 ;sum
; Squared differences: low halves accumulate in q9, high halves in q10.
176 vmlal.s16 q9, d8, d8 ;sse
177 vmlal.s16 q10, d9, d9
182 vmlal.s16 q9, d10, d10
183 vmlal.s16 q10, d11, d11
188 vmlal.s16 q9, d12, d12
189 vmlal.s16 q10, d13, d13
194 vmlal.s16 q9, d14, d14
195 vmlal.s16 q10, d15, d15
; NOTE(review): no flag-setting instruction (loop counter update) is visible
; inside the loop, so the condition tested by this bne is not shown here.
197 bne sub_pixel_variance8x8_neon_loop
199 vadd.u32 q10, q9, q10 ;accumulate sse
200 vpaddl.s32 q0, q8 ;accumulate sum
; NOTE(review): the remaining reduction that fills d1 (stored as sse), d10
; and the final d0 is not visible in this listing. The shift by 6 presumably
; divides by the 64 pixels of the 8x8 block - confirm.
207 vst1.32 {d1[0]}, [lr] ;store sse
208 vshr.s32 d10, d10, #6
; NOTE(review): no return instruction or ENDP follows in this listing.
211 vmov.32 r0, d0[0] ;return
217 AREA bilinear_taps_dat, DATA, READWRITE ;read/write by default
218 ;Data area named bilinear_taps_dat. Each DCD value below occupies one 32-bit word.
219 ;The first DCD holds the address of bilinear_taps_coeff; the second is the tap table itself:
220 ;16 words forming eight (Filter[0], Filter[1]) pairs, 8 bytes per pair, each pair summing to 128.
; The filter code indexes this table with "offset, lsl #3" (8 bytes per
; entry) and rounds its products with a shift of 7, i.e. a divide by 128.
; NOTE(review): the labels _BilinearTaps_coeff_ (loaded at function entry)
; and bilinear_taps_coeff (referenced by the first DCD) are not visible in
; this listing; presumably they sit on the skipped lines 221 and 223 - confirm.
222 DCD bilinear_taps_coeff
224 DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112