2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
12 EXPORT |vp8_sixtap_predict8x4_armv6|
14 AREA |.text|, CODE, READONLY ; name this block of code
15 ;-------------------------------------
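16 ; r0 unsigned char *src_ptr,
17 ; r1 int src_pixels_per_line,
; r2 xoffset (selects the first-pass filter; 0 skips the first-pass filter),
; r3 yoffset (selects the second-pass filter),
20 ; stack unsigned char *dst_ptr,
; stack dst stride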
22 ;-------------------------------------
23 ;Note: the first pass stores its result transposed (8 lines x 9 columns) on the stack. The temporary stack size is 184 bytes.
24 ;Each line is 20 bytes wide: 9 halfwords (18 bytes) plus 2 bytes of padding to keep every line 4-byte aligned. The second pass
25 ;loads that data from the stack and stores its result transposed again, i.e. back in the original orientation.
26 |vp8_sixtap_predict8x4_armv6| PROC
; Two-pass six-tap prediction (8x4 per the routine name).
;   Pass 1: filters along each source line with the filter selected by xoffset
;           (r2) and stores 16-bit results transposed into a temporary buffer
;           that lives on the stack (20 bytes per buffer line).
;   Pass 2: filters the buffered data with the filter selected by yoffset (r3)
;           and stores bytes to dst, stepping by the dst stride.
; The skip_* paths are taken when the corresponding offset is 0; they appear to
; move the data without filtering.
; NOTE(review): this listing is not contiguous. Several instructions and the
; labels first_pass_hloop_v6, first_pass_wloop_v6, second_pass_hloop_v6,
; second_pass_wloop_v6, skip_firstpass_hloop and _filter8_coeff_ are referenced
; but not visible here. The comments below describe only the visible lines.
27 stmdb sp!, {r4 - r11, lr} ; save r4-r11 and lr (9 words = 36 bytes)
28 str r3, [sp, #-184]! ;reserve 184 bytes on stack for temporary storage; yoffset (r3) is stored at the new sp
30 cmp r2, #0 ;skip first_pass filter if xoffset=0
31 add lr, sp, #4 ;point to temporary buffer (just above the saved yoffset); add leaves the cmp flags intact
32 beq skip_firstpass_filter
; ---- first pass: xoffset != 0 ----
35 ldr r12, _filter8_coeff_ ; presumably the address of the coefficient table (label not visible here)
36 sub r0, r0, r1, lsl #1 ; src_ptr -= 2 * src_pixels_per_line
38 add r2, r12, r2, lsl #4 ;calculate filter location: r12 + xoffset * 16
39 add r0, r0, #3 ;adjust src only for loading convenience
41 ldr r3, [r2] ; load up packed filter coefficients (first word of the selected filter)
; NOTE(review): r4 and r5 are used as coefficient operands below, but the
; instructions that load them are not visible in this listing.
45 mov r2, #0x90000 ; height=9 is top part of counter (bits 16 and up)
50 ldrb r6, [r0, #-5] ; load source data
56 orr r2, r2, #0x4 ; construct loop counter. width=8=4x2 (low part of r2)
; Pack neighbouring pixels into halfword pairs: low halfword from the first
; operand, high halfword from the second operand shifted left by 16.
58 pkhbt r6, r6, r7, lsl #16 ; r7 | r6
59 pkhbt r7, r7, r8, lsl #16 ; r8 | r7
61 pkhbt r8, r8, r9, lsl #16 ; r9 | r8
62 pkhbt r9, r9, r10, lsl #16 ; r10 | r9
; Dual 16x16 multiply-accumulate. r11 and r12 are the accumulators of two
; output pixels produced per inner iteration.
65 smuad r11, r6, r3 ; vp8_filter[0], vp8_filter[1]
70 smlad r11, r8, r4, r11 ; vp8_filter[2], vp8_filter[3]
72 smlad r12, r9, r4, r12
; NOTE(review): the trailing comments on the next two lines name r9/r10/r11
; while the operands are r10/r6/r7; the loads that presumably refill r6 and r7
; before this point are not visible here -- confirm against the full file.
74 pkhbt r10, r10, r6, lsl #16 ; r10 | r9
75 pkhbt r6, r6, r7, lsl #16 ; r11 | r10
76 smlad r11, r10, r5, r11 ; vp8_filter[4], vp8_filter[5]
77 smlad r12, r6, r5, r12
81 add r11, r11, #0x40 ; round_shift_and_clamp: add 64 before the shift right by 7
82 tst r2, #0xff ; test loop counter (low byte); the visible instructions up to the bne do not change these flags
83 usat r11, #8, r11, asr #7 ; shift right by 7, then saturate to 0..255
85 strh r11, [lr], #20 ; result is stored transposed: lr post-increments by one 20-byte buffer line
86 usat r12, #8, r12, asr #7
99 bne first_pass_wloop_v6
101 ;;add r9, ppl, #30 ; attempt to load 2 adjacent cache lines
107 subs r2, r2, #0x10000 ; decrement the height count held in the upper part of r2
111 add r0, r0, r1 ; move to next input line (add leaves the subs flags intact)
113 bne first_pass_hloop_v6
; ---- second pass ----
117 ldr r3, [sp], #4 ; load back yoffset; sp now points at the temporary buffer
118 ldr r0, [sp, #216] ; load dst address from stack 180+36
119 ldr r1, [sp, #220] ; load dst stride from stack 180+40
; NOTE(review): the instruction that sets the flags for the next branch is not
; visible here (presumably a compare of yoffset against 0).
122 beq skip_secondpass_filter
124 ldr r12, _filter8_coeff_
125 add lr, r12, r3, lsl #4 ;calculate filter location: r12 + yoffset * 16
129 ldr r3, [lr] ; load up packed filter coefficients (first word of the selected filter)
133 pkhbt r12, r4, r3 ; pack the filter differently: low halfword of r4, high halfword of r3
137 ldr r6, [sp] ; load the data (sp itself is the read pointer into the temporary buffer)
140 orr r2, r2, #2 ; loop counter
143 smuad lr, r3, r6 ; apply filter
149 smladx r10, r12, r7, r10 ; X form: halfwords of the second operand are swapped before multiplying
154 smladx r10, r11, r8, r10
157 smlatb r10, r5, r9, r10 ; r10 += top halfword of r5 * bottom halfword of r9
161 add lr, lr, #0x40 ; round_shift_and_clamp: add 64 before the shift right by 7
163 usat lr, #8, lr, asr #7 ; shift right by 7, then saturate to 0..255
165 strb lr, [r0], r1 ; the result is transposed back and stored; dst advances by one stride
166 usat r10, #8, r10, asr #7
173 bne second_pass_wloop_v6
175 subs r2, r2, #0x10000 ; decrement the count held in the upper part of r2
176 add sp, sp, #12 ; update src for next loop (20-8)
177 sub r0, r0, r1, lsl #2 ; dst -= 4 * dst stride
180 bne second_pass_hloop_v6
; NOTE(review): no sp adjustment is visible before this pop, unlike the
; "add sp, sp, #16" before the other return -- confirm against the full file.
183 ldmia sp!, {r4 - r11, pc} ; restore r4-r11 and return (saved lr is loaded into pc)
185 ;--------------------
; xoffset == 0: source bytes are stored to the temporary buffer as halfwords
; without filtering.
186 skip_firstpass_filter
187 sub r0, r0, r1, lsl #1 ; src_ptr -= 2 * src_pixels_per_line
192 ldrb r4, [r0], #1 ; load data
195 strh r4, [lr], #20 ; store it to intermediate buffer, advancing one 20-byte buffer line
196 ldrb r6, [r0], #1 ; load data
208 add r0, r0, r1 ; move to next input line
211 sub lr, lr, #158 ; move over to next column (158 = 8*20 - 2; presumably rewinds eight line steps and advances one halfword)
212 bne skip_firstpass_hloop
216 ;--------------------
; yoffset == 0: second-pass path without filtering.
217 skip_secondpass_filter
219 add sp, sp, #4 ;start from src[0] instead of src[-2] (skips two halfword entries)
221 skip_secondpass_hloop
226 mov r7, r6, lsr #16 ; unpack: r7 = upper halfword of r6
230 add sp, sp, #12 ; 20-8
234 sub r0, r0, r1, lsl #2 ; dst -= 4 * dst stride
237 bne skip_secondpass_hloop
239 add sp, sp, #16 ; 180 - (160 +4)
241 ldmia sp!, {r4 - r11, pc} ; restore r4-r11 and return (saved lr is loaded into pc)
246 ;One word each is reserved. Label filter_coeff can be used to access the data.
247 ;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
; Packed six-tap filter table, one row per filter index 0..7.
; Each row is four words (16 bytes), which matches the "lsl #4" row indexing
; used by the code. The first three words each hold two signed 16-bit taps:
; the even-numbered tap in the low halfword and the odd-numbered tap in the
; high halfword. The fourth word is zero padding.
; The trailing comment on each row gives the decoded taps 0..5.
251 DCD 0x00000000, 0x00000080, 0x00000000, 0x00000000 ; 0, 0, 128, 0, 0, 0
252 DCD 0xfffa0000, 0x000c007b, 0x0000ffff, 0x00000000 ; 0, -6, 123, 12, -1, 0
253 DCD 0xfff50002, 0x0024006c, 0x0001fff8, 0x00000000 ; 2, -11, 108, 36, -8, 1
254 DCD 0xfff70000, 0x0032005d, 0x0000fffa, 0x00000000 ; 0, -9, 93, 50, -6, 0
255 DCD 0xfff00003, 0x004d004d, 0x0003fff0, 0x00000000 ; 3, -16, 77, 77, -16, 3
256 DCD 0xfffa0000, 0x005d0032, 0x0000fff7, 0x00000000 ; 0, -6, 50, 93, -9, 0
257 DCD 0xfff80001, 0x006c0024, 0x0002fff5, 0x00000000 ; 1, -8, 36, 108, -11, 2
258 DCD 0xffff0000, 0x007b000c, 0x0000fffa, 0x00000000 ; 0, -1, 12, 123, -6, 0
260 ;DCD 0, 0, 128, 0, 0, 0
261 ;DCD 0, -6, 123, 12, -1, 0
262 ;DCD 2, -11, 108, 36, -8, 1
263 ;DCD 0, -9, 93, 50, -6, 0
264 ;DCD 3, -16, 77, 77, -16, 3
265 ;DCD 0, -6, 50, 93, -9, 0
266 ;DCD 1, -8, 36, 108, -11, 2
267 ;DCD 0, -1, 12, 123, -6, 0