2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
12 EXPORT |vp8_sub_pixel_variance8x8_neon|
17 AREA ||.text||, CODE, READONLY, ALIGN=2
;-----------------------------------------------------------------------
; vp8_sub_pixel_variance8x8_neon
;
; Stage 1 (optional, r2 != 0): two-tap horizontal filter over 9 source rows.
; Stage 2 (optional, r3 != 0): two-tap vertical filter, 9 rows -> 8 rows.
; Stage 3: accumulate the sum and the sum of squares of (filtered - dst),
;          store the sse through the pointer held in lr, return in r0.
;
; NOTE(review): this listing appears to have gaps. Not visible here are a
; prologue/epilogue, most of the vmull/vmlal/vext/vsubl instructions, the
; loop counter, the final reduction, the bilinear_taps_coeff label and
; ENDP/END. Comments below describe only what is visible and mark the rest.
;-----------------------------------------------------------------------
18 ; r0 unsigned char *src_ptr,
19 ; r1 int src_pixels_per_line,
; r2 first-pass filter index (xoffset); 0 skips the first pass
; r3 second-pass filter index (yoffset); 0 skips the second pass
22 ; stack(r4) unsigned char *dst_ptr,
23 ; stack(r5) int dst_pixels_per_line,
24 ; stack(lr) unsigned int *sse
25 ;note: most of the code is copied from bilinear_predict8x8_neon and vp8_variance8x8_neon.
;
; Register roles in the visible code:
;   r12      address of the filter tap table
;   d0, d1   filter taps broadcast to every byte lane (only the d0 vdup is visible)
;   d22-d30  working rows of 8 pixels (9 rows after stage 1, 8 after stage 2)
;   q8       running sum of differences (s32 lanes)
;   q9, q10  running sum of squared differences (s32 lanes)
27 |vp8_sub_pixel_variance8x8_neon| PROC
31 adr r12, bilinear_taps_coeff
; NOTE(review): the #76 base offset implies 76 bytes were pushed before this
; point (would match three core registers plus d8-d15) -- the saving
; instructions are not visible here; TODO confirm.
32 ldr r4, [sp, #76] ;load *dst_ptr from stack
33 ldr r5, [sp, #80] ;load dst_pixels_per_line from stack
34 ldr lr, [sp, #84] ;load *sse from stack
36 cmp r2, #0 ;skip first_pass filter if xoffset=0
37 beq skip_firstpass_filter
39 ;First pass: output_height lines x output_width columns (9x8)
; Each table entry is two 32-bit words (8 bytes), hence the shift by 3.
40 add r2, r12, r2, lsl #3 ;calculate filter location
; Rows are loaded 16 bytes wide so that the pixel to the right of each of the
; 8 output columns is available for the second tap.
42 vld1.u8 {q1}, [r0], r1 ;load src data
43 vld1.u32 {d31}, [r2] ;load first_pass filter
44 vld1.u8 {q2}, [r0], r1
45 vdup.8 d0, d31[0] ;first_pass filter (d0 d1)
46 vld1.u8 {q3}, [r0], r1
48 vld1.u8 {q4}, [r0], r1
50 vmull.u8 q6, d2, d0 ;(src_ptr[0] * Filter[0])
; vext by one byte yields bytes 1..8 of the row, i.e. the right-hand neighbours.
55 vext.8 d3, d2, d3, #1 ;construct src_ptr[1]
60 vmlal.u8 q6, d3, d1 ;(src_ptr[1] * Filter[1])
; Loads for the next rows are interleaved with narrowing of the first four.
; The taps of every table pair sum to 128, so the rounding shift by 7 renormalises.
; NOTE(review): the instructions producing q7-q9 are not visible here.
65 vld1.u8 {q1}, [r0], r1 ;load src data
66 vqrshrn.u16 d22, q6, #7 ;shift/round/saturate to u8
67 vld1.u8 {q2}, [r0], r1
68 vqrshrn.u16 d23, q7, #7
69 vld1.u8 {q3}, [r0], r1
70 vqrshrn.u16 d24, q8, #7
71 vld1.u8 {q4}, [r0], r1
72 vqrshrn.u16 d25, q9, #7
74 ;first_pass filtering on the rest 5-line data
75 vld1.u8 {q5}, [r0], r1
77 vmull.u8 q6, d2, d0 ;(src_ptr[0] * Filter[0])
83 vext.8 d3, d2, d3, #1 ;construct src_ptr[1]
87 vext.8 d11, d10, d11, #1
89 vmlal.u8 q6, d3, d1 ;(src_ptr[1] * Filter[1])
; Rows 5-9 of the first-pass output land in d26-d30.
95 vqrshrn.u16 d26, q6, #7 ;shift/round/saturate to u8
96 vqrshrn.u16 d27, q7, #7
97 vqrshrn.u16 d28, q8, #7
98 vqrshrn.u16 d29, q9, #7
99 vqrshrn.u16 d30, q10, #7
; Second pass: blends each row with the row below it.
103 cmp r3, #0 ;skip second_pass filter if yoffset=0
104 ;skip_secondpass_filter
105 beq sub_pixel_variance8x8_neon
107 add r3, r12, r3, lsl #3
109 vld1.u32 {d31}, [r3] ;load second_pass filter
111 vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1)
114 vmull.u8 q1, d22, d0 ;(src_ptr[0] * Filter[0])
123 vmlal.u8 q1, d23, d1 ;(src_ptr[pixel_step] * Filter[1])
; Eight output rows overwrite d22-d29; d30 is not written by this stage.
; NOTE(review): the instructions producing q2-q8 are not visible here.
132 vqrshrn.u16 d22, q1, #7 ;shift/round/saturate to u8
133 vqrshrn.u16 d23, q2, #7
134 vqrshrn.u16 d24, q3, #7
135 vqrshrn.u16 d25, q4, #7
136 vqrshrn.u16 d26, q5, #7
137 vqrshrn.u16 d27, q6, #7
138 vqrshrn.u16 d28, q7, #7
139 vqrshrn.u16 d29, q8, #7
141 b sub_pixel_variance8x8_neon
143 ;--------------------
; xoffset == 0: copy 9 unfiltered rows of 8 pixels straight into d22-d30.
; NOTE(review): as shown, control falls through into the variance stage, so
; r3 is never tested on this path -- a branch back to the second pass may be
; missing from this listing; TODO confirm.
144 skip_firstpass_filter
145 vld1.u8 {d22}, [r0], r1 ;load src data
146 vld1.u8 {d23}, [r0], r1
147 vld1.u8 {d24}, [r0], r1
148 vld1.u8 {d25}, [r0], r1
149 vld1.u8 {d26}, [r0], r1
150 vld1.u8 {d27}, [r0], r1
151 vld1.u8 {d28}, [r0], r1
152 vld1.u8 {d29}, [r0], r1
153 vld1.u8 {d30}, [r0], r1
157 ;----------------------
158 ;vp8_variance8x8_neon
159 sub_pixel_variance8x8_neon
; NOTE(review): only q8 and q9 are cleared in the visible lines; q10 is also
; used as an accumulator below, so its clearing is presumably not shown.
160 vmov.i8 q8, #0 ;q8 - sum
161 vmov.i8 q9, #0 ;q9, q10 - sse
; Each iteration loads four dst rows (d0-d3).
; NOTE(review): no flag-setting instruction for the bne below is visible, nor
; are the subtractions producing q5-q7 or any update of d22 between
; iterations; TODO confirm against the complete file.
166 sub_pixel_variance8x8_neon_loop
167 vld1.8 {d0}, [r4], r5 ;load dst data
169 vld1.8 {d1}, [r4], r5
170 vld1.8 {d2}, [r4], r5
171 vsubl.u8 q4, d22, d0 ;calculate diff
172 vld1.8 {d3}, [r4], r5
; vpadal adds adjacent s16 pairs of the difference into the s32 lanes of q8.
177 vpadal.s16 q8, q4 ;sum
; Squares of the low and high halves of each difference row are accumulated
; into q9 and q10 respectively.
178 vmlal.s16 q9, d8, d8 ;sse
179 vmlal.s16 q10, d9, d9
184 vmlal.s16 q9, d10, d10
185 vmlal.s16 q10, d11, d11
190 vmlal.s16 q9, d12, d12
191 vmlal.s16 q10, d13, d13
196 vmlal.s16 q9, d14, d14
197 vmlal.s16 q10, d15, d15
199 bne sub_pixel_variance8x8_neon_loop
; Fold the two sse accumulators together and widen the sum lanes to 64 bits.
201 vadd.u32 q10, q9, q10 ;accumulate sse
202 vpaddl.s32 q0, q8 ;accumulate sum
; NOTE(review): the reductions that place the scalar sse in d1 and a value in
; d10 are not visible here. The shift by 6 divides by 64 (the 8x8 pixel
; count); d10 presumably holds sum*sum at that point -- TODO confirm.
209 vst1.32 {d1[0]}, [lr] ;store sse
210 vshr.u32 d10, d10, #6
; NOTE(review): no return instruction or register restore is visible after
; this move.
213 vmov.32 r0, d0[0] ;return
; Filter tap table: eight (Filter[0], Filter[1]) pairs of 32-bit words, each
; pair summing to 128. Presumably this is the bilinear_taps_coeff table
; referenced by the adr above; its label is not visible in this listing.
223 DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112