/**
 * VP8 NEON optimisations
 *
 * Copyright (c) 2010 Rob Clark <rob@ti.com>
 * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "asm.S"

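@ Inverse 4x4 Walsh-Hadamard transform of the 16 luma DC coefficients.
@ As a rough C sketch of what the vector code computes (not a verbatim
@ copy of the scalar reference):
@
@     for (i = 0; i < 4; i++) {   // columns
@         t0 = dc[0][i] + dc[3][i];  t1 = dc[1][i] + dc[2][i];
@         t3 = dc[0][i] - dc[3][i];  t2 = dc[1][i] - dc[2][i];
@         dc[0][i] = t0 + t1;  dc[1][i] = t3 + t2;
@         dc[2][i] = t0 - t1;  dc[3][i] = t3 - t2;
@     }
@     // then the same butterfly over rows, with a +3 bias and >>3
@
@ The +3 rounding bias is added to row 0 only (vadd.i16 d0, d0, d16);
@ row 0 enters every second-pass output with a + sign, so all 16
@ results are biased.  Each result is stored to the DC slot of one of
@ the 16 luma blocks, which are 32 bytes (16 int16_t) apart.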
function ff_vp8_luma_dc_wht_neon, export=1
        vld1.16         {q0-q1},  [r1,:128]
        vmov.i16        q15, #0

        vadd.i16        d4,  d0,  d3
        vadd.i16        d6,  d1,  d2
        vst1.16         {q15},    [r1,:128]!
        vsub.i16        d7,  d1,  d2
        vsub.i16        d5,  d0,  d3
        vst1.16         {q15},    [r1,:128]
        vadd.i16        q0,  q2,  q3
        vsub.i16        q1,  q2,  q3

        vmov.i16        q8, #3

        vtrn.32         d0,  d2
        vtrn.32         d1,  d3
        vtrn.16         d0,  d1
        vtrn.16         d2,  d3

        vadd.i16        d0,  d0,  d16

        vadd.i16        d4,  d0,  d3
        vadd.i16        d6,  d1,  d2
        vsub.i16        d7,  d1,  d2
        vsub.i16        d5,  d0,  d3
        vadd.i16        q0,  q2,  q3
        vsub.i16        q1,  q2,  q3

        vshr.s16        q0,  q0,  #3
        vshr.s16        q1,  q1,  #3

        mov             r3,  #32
        vst1.16         {d0[0]},  [r0,:16], r3
        vst1.16         {d1[0]},  [r0,:16], r3
        vst1.16         {d2[0]},  [r0,:16], r3
        vst1.16         {d3[0]},  [r0,:16], r3
        vst1.16         {d0[1]},  [r0,:16], r3
        vst1.16         {d1[1]},  [r0,:16], r3
        vst1.16         {d2[1]},  [r0,:16], r3
        vst1.16         {d3[1]},  [r0,:16], r3
        vst1.16         {d0[2]},  [r0,:16], r3
        vst1.16         {d1[2]},  [r0,:16], r3
        vst1.16         {d2[2]},  [r0,:16], r3
        vst1.16         {d3[2]},  [r0,:16], r3
        vst1.16         {d0[3]},  [r0,:16], r3
        vst1.16         {d1[3]},  [r0,:16], r3
        vst1.16         {d2[3]},  [r0,:16], r3
        vst1.16         {d3[3]},  [r0,:16], r3

        bx              lr
endfunc

function ff_vp8_luma_dc_wht_dc_neon, export=1
        ldrsh           r2,  [r1]
        mov             r3,  #0
        add             r2,  r2,  #3
        strh            r3,  [r1]
        asr             r2,  r2,  #3
    .rept 16
        strh            r2,  [r0], #32
    .endr
        bx              lr
endfunc

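@ 4x4 inverse DCT.  The odd stage needs x*sqrt(2)*cos(pi/8) (~1.30656x)
@ and x*sqrt(2)*sin(pi/8) (~0.54120x), computed in fixed point roughly
@ as:
@
@     mul1(x) = ((x * 20091) >> 16) + x
@     mul2(x) = ( x * 35468) >> 16
@
@ 35468 does not fit in a signed 16-bit lane, so half of it is loaded
@ below and vqdmulh's doubling multiply ((2*a*b) >> 16) restores the
@ factor.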
function ff_vp8_idct_add_neon, export=1
        vld1.16         {q0-q1},  [r1,:128]
        movw            r3,  #20091
        movt            r3,  #35468/2
        vdup.32         d4,  r3

        vmull.s16       q12, d1,  d4[0]
        vmull.s16       q13, d3,  d4[0]
        vqdmulh.s16     d20, d1,  d4[1]
        vqdmulh.s16     d23, d3,  d4[1]
        vshrn.s32       d21, q12, #16
        vshrn.s32       d22, q13, #16
        vadd.s16        d21, d21, d1
        vadd.s16        d22, d22, d3

        vadd.s16        d16, d0,  d2
        vsub.s16        d17, d0,  d2
        vadd.s16        d18, d21, d23
        vsub.s16        d19, d20, d22
        vadd.s16        q0,  q8,  q9
        vsub.s16        q1,  q8,  q9

        vtrn.32         d0,  d3
        vtrn.32         d1,  d2
        vtrn.16         d0,  d1
        vtrn.16         d3,  d2

        vmov.i16        q15, #0
        vmull.s16       q12, d1,  d4[0]
        vst1.16         {q15},    [r1,:128]!
        vmull.s16       q13, d2,  d4[0]
        vst1.16         {q15},    [r1,:128]
        vqdmulh.s16     d21, d1,  d4[1]
        vqdmulh.s16     d23, d2,  d4[1]
        vshrn.s32       d20, q12, #16
        vshrn.s32       d22, q13, #16
        vadd.i16        d20, d20, d1
        vadd.i16        d22, d22, d2

        vadd.i16        d16, d0,  d3
        vsub.i16        d17, d0,  d3
        vadd.i16        d18, d20, d23
        vld1.32         {d20[]},  [r0,:32], r2
        vsub.i16        d19, d21, d22
        vld1.32         {d22[]},  [r0,:32], r2
        vadd.s16        q0,  q8,  q9
        vld1.32         {d23[]},  [r0,:32], r2
        vsub.s16        q1,  q8,  q9
        vld1.32         {d21[]},  [r0,:32], r2
        vrshr.s16       q0,  q0,  #3
        vtrn.32         q10, q11
        vrshr.s16       q1,  q1,  #3

        sub             r0,  r0,  r2,  lsl #2

        vtrn.32         d0,  d3
        vtrn.32         d1,  d2
        vtrn.16         d0,  d1
        vtrn.16         d3,  d2

        vaddw.u8        q0,  q0,  d20
        vaddw.u8        q1,  q1,  d21
        vqmovun.s16     d0,  q0
        vqmovun.s16     d1,  q1

        vst1.32         {d0[0]},  [r0,:32], r2
        vst1.32         {d0[1]},  [r0,:32], r2
        vst1.32         {d1[1]},  [r0,:32], r2
        vst1.32         {d1[0]},  [r0,:32], r2

        bx              lr
endfunc

function ff_vp8_idct_dc_add_neon, export=1
        mov             r3,  #0
        ldrsh           r12, [r1]
        strh            r3,  [r1]
        vdup.16         q1,  r12
        vrshr.s16       q1,  q1,  #3
        vld1.32         {d0[]},   [r0,:32], r2
        vld1.32         {d1[]},   [r0,:32], r2
        vld1.32         {d0[1]},  [r0,:32], r2
        vld1.32         {d1[1]},  [r0,:32], r2
        vaddw.u8        q2,  q1,  d0
        vaddw.u8        q3,  q1,  d1
        sub             r0,  r0,  r2, lsl #2
        vqmovun.s16     d0,  q2
        vqmovun.s16     d1,  q3
        vst1.32         {d0[0]},  [r0,:32], r2
        vst1.32         {d1[0]},  [r0,:32], r2
        vst1.32         {d0[1]},  [r0,:32], r2
        vst1.32         {d1[1]},  [r0,:32], r2
        bx              lr
endfunc

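@ DC-only IDCT of four 4x4 blocks at once.  This 4uv variant adds the
@ four DCs to a 2x2 arrangement of blocks (an 8x8 chroma area, one
@ 8-byte row per load/store); the 4y variant further down adds them to
@ four horizontally adjacent luma blocks (16-byte rows).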
function ff_vp8_idct_dc_add4uv_neon, export=1
        vmov.i16        d0,  #0
        mov             r3,  #32
        vld1.16         {d16[]},  [r1,:16]
        vst1.16         {d0[0]},  [r1,:16], r3
        vld1.16         {d17[]},  [r1,:16]
        vst1.16         {d0[0]},  [r1,:16], r3
        vld1.16         {d18[]},  [r1,:16]
        vst1.16         {d0[0]},  [r1,:16], r3
        vld1.16         {d19[]},  [r1,:16]
        vst1.16         {d0[0]},  [r1,:16], r3
        mov             r3,  r0
        vrshr.s16       q8,  q8,  #3            @ dc >>= 3
        vld1.8          {d0},     [r0,:64], r2
        vrshr.s16       q9,  q9,  #3
        vld1.8          {d1},     [r0,:64], r2
        vaddw.u8        q10, q8,  d0
        vld1.8          {d2},     [r0,:64], r2
        vaddw.u8        q0,  q8,  d1
        vld1.8          {d3},     [r0,:64], r2
        vaddw.u8        q11, q8,  d2
        vld1.8          {d4},     [r0,:64], r2
        vaddw.u8        q1,  q8,  d3
        vld1.8          {d5},     [r0,:64], r2
        vaddw.u8        q12, q9,  d4
        vld1.8          {d6},     [r0,:64], r2
        vaddw.u8        q2,  q9,  d5
        vld1.8          {d7},     [r0,:64], r2
        vaddw.u8        q13, q9,  d6
        vqmovun.s16     d20, q10
        vaddw.u8        q3,  q9,  d7
        vqmovun.s16     d21, q0
        vqmovun.s16     d22, q11
        vst1.8          {d20},    [r3,:64], r2
        vqmovun.s16     d23, q1
        vst1.8          {d21},    [r3,:64], r2
        vqmovun.s16     d24, q12
        vst1.8          {d22},    [r3,:64], r2
        vqmovun.s16     d25, q2
        vst1.8          {d23},    [r3,:64], r2
        vqmovun.s16     d26, q13
        vst1.8          {d24},    [r3,:64], r2
        vqmovun.s16     d27, q3
        vst1.8          {d25},    [r3,:64], r2
        vst1.8          {d26},    [r3,:64], r2
        vst1.8          {d27},    [r3,:64], r2

        bx              lr
endfunc

function ff_vp8_idct_dc_add4y_neon, export=1
        vmov.i16        d0,  #0
        mov             r3,  #32
        vld1.16         {d16[]},  [r1,:16]
        vst1.16         {d0[0]},  [r1,:16], r3
        vld1.16         {d17[]},  [r1,:16]
        vst1.16         {d0[0]},  [r1,:16], r3
        vld1.16         {d18[]},  [r1,:16]
        vst1.16         {d0[0]},  [r1,:16], r3
        vld1.16         {d19[]},  [r1,:16]
        vst1.16         {d0[0]},  [r1,:16], r3
        vrshr.s16       q8,  q8,  #3            @ dc >>= 3
        vld1.8          {q0},     [r0,:128], r2
        vrshr.s16       q9,  q9,  #3
        vld1.8          {q1},     [r0,:128], r2
        vaddw.u8        q10, q8,  d0
        vld1.8          {q2},     [r0,:128], r2
        vaddw.u8        q0,  q9,  d1
        vld1.8          {q3},     [r0,:128], r2
        vaddw.u8        q11, q8,  d2
        vaddw.u8        q1,  q9,  d3
        vaddw.u8        q12, q8,  d4
        vaddw.u8        q2,  q9,  d5
        vaddw.u8        q13, q8,  d6
        vaddw.u8        q3,  q9,  d7
        sub             r0,  r0,  r2,  lsl #2
        vqmovun.s16     d20, q10
        vqmovun.s16     d21, q0
        vqmovun.s16     d22, q11
        vqmovun.s16     d23, q1
        vqmovun.s16     d24, q12
        vst1.8          {q10},    [r0,:128], r2
        vqmovun.s16     d25, q2
        vst1.8          {q11},    [r0,:128], r2
        vqmovun.s16     d26, q13
        vst1.8          {q12},    [r0,:128], r2
        vqmovun.s16     d27, q3
        vst1.8          {q13},    [r0,:128], r2

        bx              lr
endfunc

@ Register layout:
@   P3..Q3 -> q0..q7
@   flim_E -> q14
@   flim_I -> q15
@   hev_thresh -> r12
@
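@ In C terms, the conditions computed below are roughly:
@
@   simple:        filter if 2*|P0-Q0| + |P1-Q1|/2 <= flim_E
@   normal_limit:  the same edge test, plus |P3-P2|, |P2-P1|, |P1-P0|,
@                  |Q1-Q0|, |Q2-Q1| and |Q3-Q2| all <= flim_I
@   hev:           |P1-P0| > hev_thresh || |Q1-Q0| > hev_thresh
@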
.macro  vp8_loop_filter, inner=0, simple=0
    .if \simple
        vabd.u8         q9,  q3,  q4            @ abs(P0-Q0)
        vabd.u8         q15, q2,  q5            @ abs(P1-Q1)
        vqadd.u8        q9,  q9,  q9            @ abs(P0-Q0) * 2
        vshr.u8         q10, q15, #1            @ abs(P1-Q1) / 2
        vqadd.u8        q11, q9,  q10           @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2)
        vmov.i8         q13, #0x80
        vcle.u8         q8,  q11, q14           @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2) <= flim
    .else
        @ calculate hev and normal_limit:
        vabd.u8         q12, q2,  q3            @ abs(P1-P0)
        vabd.u8         q13, q5,  q4            @ abs(Q1-Q0)
        vabd.u8         q10, q0,  q1            @ abs(P3-P2)
        vabd.u8         q11, q1,  q2            @ abs(P2-P1)
        vcle.u8         q8,  q12, q15           @ abs(P1-P0) <= flim_I
        vcle.u8         q9,  q13, q15           @ abs(Q1-Q0) <= flim_I
        vcle.u8         q10, q10, q15           @ abs(P3-P2) <= flim_I
        vcle.u8         q11, q11, q15           @ abs(P2-P1) <= flim_I
        vand            q8,  q8,  q9
        vabd.u8         q9,  q7,  q6            @ abs(Q3-Q2)
        vand            q8,  q8,  q11
        vabd.u8         q11, q6,  q5            @ abs(Q2-Q1)
        vand            q8,  q8,  q10
        vcle.u8         q10, q9,  q15           @ abs(Q3-Q2) <= flim_I
        vcle.u8         q11, q11, q15           @ abs(Q2-Q1) <= flim_I
        vabd.u8         q9,  q3,  q4            @ abs(P0-Q0)
        vabd.u8         q15, q2,  q5            @ abs(P1-Q1)
        vand            q8,  q8,  q10
        vqadd.u8        q9,  q9,  q9            @ abs(P0-Q0) * 2
        vand            q8,  q8,  q11
        vshr.u8         q10, q15, #1            @ abs(P1-Q1) / 2
        vdup.8          q15, r12                @ hev_thresh
        vqadd.u8        q11, q9,  q10           @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2)
        vcgt.u8         q12, q12, q15           @ abs(P1-P0) > hev_thresh
        vcle.u8         q11, q11, q14           @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2) <= flim_E
        vcgt.u8         q14, q13, q15           @ abs(Q1-Q0) > hev_thresh
        vand            q8,  q8,  q11
        vmov.i8         q13, #0x80
        vorr            q9,  q12, q14
    .endif

        @ at this point:
        @   q8: normal_limit
        @   q9: hev

        @ convert to signed value:
        veor            q3,  q3,  q13           @ PS0 = P0 ^ 0x80
        veor            q4,  q4,  q13           @ QS0 = Q0 ^ 0x80

        vmov.i16        q12, #3
        vsubl.s8        q10, d8,  d6            @ QS0 - PS0
        vsubl.s8        q11, d9,  d7            @   (widened to 16bit)
        veor            q2,  q2,  q13           @ PS1 = P1 ^ 0x80
        veor            q5,  q5,  q13           @ QS1 = Q1 ^ 0x80
        vmul.i16        q10, q10, q12           @ w = 3 * (QS0 - PS0)
        vmul.i16        q11, q11, q12

        vqsub.s8        q12, q2,  q5            @ clamp(PS1-QS1)
        vmov.i8         q14, #4
        vmov.i8         q15, #3
    .if \inner
        vand            q12, q12, q9            @ if(hev) w += clamp(PS1-QS1)
    .endif
        vaddw.s8        q10, q10, d24           @ w += clamp(PS1-QS1)
        vaddw.s8        q11, q11, d25
        vqmovn.s16      d20, q10                @ narrow result back into q10
        vqmovn.s16      d21, q11
    .if !\inner && !\simple
        veor            q1,  q1,  q13           @ PS2 = P2 ^ 0x80
        veor            q6,  q6,  q13           @ QS2 = Q2 ^ 0x80
    .endif
        vand            q10, q10, q8            @ w &= normal_limit

        @ registers used at this point..
        @   q0 -> P3  (don't corrupt)
        @   q1-q6 -> PS2-QS2
        @   q7 -> Q3  (don't corrupt)
        @   q9 -> hev
        @   q10 -> w
        @   q13 -> #0x80
        @   q14 -> #4
        @   q15 -> #3
        @   q8, q11, q12 -> unused

        @ filter_common:   is4tap==1
        @   c1 = clamp(w + 4) >> 3;
        @   c2 = clamp(w + 3) >> 3;
        @   Q0 = s2u(QS0 - c1);
        @   P0 = s2u(PS0 + c2);

    .if \simple
        vqadd.s8        q11, q10, q14           @ c1 = clamp((w&hev)+4)
        vqadd.s8        q12, q10, q15           @ c2 = clamp((w&hev)+3)
        vshr.s8         q11, q11, #3            @ c1 >>= 3
        vshr.s8         q12, q12, #3            @ c2 >>= 3
        vqsub.s8        q4,  q4,  q11           @ QS0 = clamp(QS0-c1)
        vqadd.s8        q3,  q3,  q12           @ PS0 = clamp(PS0+c2)
        veor            q4,  q4,  q13           @ Q0 = QS0 ^ 0x80
        veor            q3,  q3,  q13           @ P0 = PS0 ^ 0x80
        veor            q5,  q5,  q13           @ Q1 = QS1 ^ 0x80
        veor            q2,  q2,  q13           @ P1 = PS1 ^ 0x80
    .elseif \inner
        @ the !is4tap case of filter_common, only used for inner blocks
        @   c3 = ((c1&~hev) + 1) >> 1;
        @   Q1 = s2u(QS1 - c3);
        @   P1 = s2u(PS1 + c3);
        vqadd.s8        q11, q10, q14           @ c1 = clamp((w&hev)+4)
        vqadd.s8        q12, q10, q15           @ c2 = clamp((w&hev)+3)
        vshr.s8         q11, q11, #3            @ c1 >>= 3
        vshr.s8         q12, q12, #3            @ c2 >>= 3
        vqsub.s8        q4,  q4,  q11           @ QS0 = clamp(QS0-c1)
        vqadd.s8        q3,  q3,  q12           @ PS0 = clamp(PS0+c2)
        vbic            q11, q11, q9            @ c1 & ~hev
        veor            q4,  q4,  q13           @ Q0 = QS0 ^ 0x80
        vrshr.s8        q11, q11, #1            @ c3 >>= 1
        veor            q3,  q3,  q13           @ P0 = PS0 ^ 0x80
        vqsub.s8        q5,  q5,  q11           @ QS1 = clamp(QS1-c3)
        vqadd.s8        q2,  q2,  q11           @ PS1 = clamp(PS1+c3)
        veor            q5,  q5,  q13           @ Q1 = QS1 ^ 0x80
        veor            q2,  q2,  q13           @ P1 = PS1 ^ 0x80
    .else
        vand            q12, q10, q9            @ w & hev
        vqadd.s8        q11, q12, q14           @ c1 = clamp((w&hev)+4)
        vqadd.s8        q12, q12, q15           @ c2 = clamp((w&hev)+3)
        vshr.s8         q11, q11, #3            @ c1 >>= 3
        vshr.s8         q12, q12, #3            @ c2 >>= 3
        vbic            q10, q10, q9            @ w &= ~hev
        vqsub.s8        q4,  q4,  q11           @ QS0 = clamp(QS0-c1)
        vqadd.s8        q3,  q3,  q12           @ PS0 = clamp(PS0+c2)

        @ filter_mbedge:
        @   a = clamp((27*w + 63) >> 7);
        @   Q0 = s2u(QS0 - a);
        @   P0 = s2u(PS0 + a);
        @   a = clamp((18*w + 63) >> 7);
        @   Q1 = s2u(QS1 - a);
        @   P1 = s2u(PS1 + a);
        @   a = clamp((9*w + 63) >> 7);
        @   Q2 = s2u(QS2 - a);
        @   P2 = s2u(PS2 + a);
        vmov.i16        q9,  #63
        vshll.s8        q14, d20, #3
        vshll.s8        q15, d21, #3
        vaddw.s8        q14, q14, d20
        vaddw.s8        q15, q15, d21
        vadd.s16        q8,  q9,  q14
        vadd.s16        q9,  q9,  q15           @  9*w + 63
        vadd.s16        q11, q8,  q14
        vadd.s16        q12, q9,  q15           @ 18*w + 63
        vadd.s16        q14, q11, q14
        vadd.s16        q15, q12, q15           @ 27*w + 63
        vqshrn.s16      d16, q8,  #7
        vqshrn.s16      d17, q9,  #7            @ clamp(( 9*w + 63)>>7)
        vqshrn.s16      d22, q11, #7
        vqshrn.s16      d23, q12, #7            @ clamp((18*w + 63)>>7)
        vqshrn.s16      d28, q14, #7
        vqshrn.s16      d29, q15, #7            @ clamp((27*w + 63)>>7)
        vqadd.s8        q1,  q1,  q8            @ PS2 = clamp(PS2+a)
        vqsub.s8        q6,  q6,  q8            @ QS2 = clamp(QS2-a)
        vqadd.s8        q2,  q2,  q11           @ PS1 = clamp(PS1+a)
        vqsub.s8        q5,  q5,  q11           @ QS1 = clamp(QS1-a)
        vqadd.s8        q3,  q3,  q14           @ PS0 = clamp(PS0+a)
        vqsub.s8        q4,  q4,  q14           @ QS0 = clamp(QS0-a)
        veor            q3,  q3,  q13           @ P0 = PS0 ^ 0x80
        veor            q4,  q4,  q13           @ Q0 = QS0 ^ 0x80
        veor            q2,  q2,  q13           @ P1 = PS1 ^ 0x80
        veor            q5,  q5,  q13           @ Q1 = QS1 ^ 0x80
        veor            q1,  q1,  q13           @ P2 = PS2 ^ 0x80
        veor            q6,  q6,  q13           @ Q2 = QS2 ^ 0x80
    .endif
.endm

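@ Transpose the 8x16 matrix of bytes held in q0-q7: three rounds of
@ vtrn swap 32-bit, 16-bit and finally 8-bit elements, halving the
@ swap granularity each round.  The low and high d halves form two
@ independent 8x8 transposes, matching the split loads/stores in the
@ horizontal loop filters below.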
.macro transpose8x16matrix
        vtrn.32         q0,   q4
        vtrn.32         q1,   q5
        vtrn.32         q2,   q6
        vtrn.32         q3,   q7

        vtrn.16         q0,   q2
        vtrn.16         q1,   q3
        vtrn.16         q4,   q6
        vtrn.16         q5,   q7

        vtrn.8          q0,   q1
        vtrn.8          q2,   q3
        vtrn.8          q4,   q5
        vtrn.8          q6,   q7
.endm

.macro  vp8_v_loop_filter16 name, inner=0, simple=0
function ff_vp8_v_loop_filter16\name\()_neon, export=1
        vpush           {q4-q7}
        sub             r0,  r0,  r1,  lsl #1+!\simple

        @ Load pixels:
    .if !\simple
        ldr             r12, [sp, #64]          @ hev_thresh
        vld1.8          {q0},     [r0,:128], r1 @ P3
        vld1.8          {q1},     [r0,:128], r1 @ P2
    .endif
        vld1.8          {q2},     [r0,:128], r1 @ P1
        vld1.8          {q3},     [r0,:128], r1 @ P0
        vld1.8          {q4},     [r0,:128], r1 @ Q0
        vld1.8          {q5},     [r0,:128], r1 @ Q1
    .if !\simple
        vld1.8          {q6},     [r0,:128], r1 @ Q2
        vld1.8          {q7},     [r0,:128]     @ Q3
        vdup.8          q15, r3                 @ flim_I
    .endif
        vdup.8          q14, r2                 @ flim_E

        vp8_loop_filter inner=\inner, simple=\simple

        @ back up to P2:  dst -= stride * 6
        sub             r0,  r0,  r1,  lsl #2
    .if !\simple
        sub             r0,  r0,  r1,  lsl #1

        @ Store pixels:
        vst1.8          {q1},     [r0,:128], r1 @ P2
    .endif
        vst1.8          {q2},     [r0,:128], r1 @ P1
        vst1.8          {q3},     [r0,:128], r1 @ P0
        vst1.8          {q4},     [r0,:128], r1 @ Q0
        vst1.8          {q5},     [r0,:128], r1 @ Q1
    .if !\simple
        vst1.8          {q6},     [r0,:128]     @ Q2
    .endif

        vpop            {q4-q7}
        bx              lr
endfunc
.endm

vp8_v_loop_filter16
vp8_v_loop_filter16 _inner,  inner=1
vp8_v_loop_filter16 _simple, simple=1

.macro  vp8_v_loop_filter8uv name, inner=0
function ff_vp8_v_loop_filter8uv\name\()_neon, export=1
        vpush           {q4-q7}
        sub             r0,  r0,  r2,  lsl #2
        sub             r1,  r1,  r2,  lsl #2
        ldr             r12, [sp, #64]          @ flim_I

        @ Load pixels:
        vld1.8          {d0},     [r0,:64], r2  @ P3
        vld1.8          {d1},     [r1,:64], r2  @ P3
        vld1.8          {d2},     [r0,:64], r2  @ P2
        vld1.8          {d3},     [r1,:64], r2  @ P2
        vld1.8          {d4},     [r0,:64], r2  @ P1
        vld1.8          {d5},     [r1,:64], r2  @ P1
        vld1.8          {d6},     [r0,:64], r2  @ P0
        vld1.8          {d7},     [r1,:64], r2  @ P0
        vld1.8          {d8},     [r0,:64], r2  @ Q0
        vld1.8          {d9},     [r1,:64], r2  @ Q0
        vld1.8          {d10},    [r0,:64], r2  @ Q1
        vld1.8          {d11},    [r1,:64], r2  @ Q1
        vld1.8          {d12},    [r0,:64], r2  @ Q2
        vld1.8          {d13},    [r1,:64], r2  @ Q2
        vld1.8          {d14},    [r0,:64]      @ Q3
        vld1.8          {d15},    [r1,:64]      @ Q3

        vdup.8          q14, r3                 @ flim_E
        vdup.8          q15, r12                @ flim_I
        ldr             r12, [sp, #68]          @ hev_thresh

        vp8_loop_filter inner=\inner

        @ back up to P2:  u,v -= stride * 6
        sub             r0,  r0,  r2,  lsl #2
        sub             r1,  r1,  r2,  lsl #2
        sub             r0,  r0,  r2,  lsl #1
        sub             r1,  r1,  r2,  lsl #1

        @ Store pixels:
        vst1.8          {d2},     [r0,:64], r2  @ P2
        vst1.8          {d3},     [r1,:64], r2  @ P2
        vst1.8          {d4},     [r0,:64], r2  @ P1
        vst1.8          {d5},     [r1,:64], r2  @ P1
        vst1.8          {d6},     [r0,:64], r2  @ P0
        vst1.8          {d7},     [r1,:64], r2  @ P0
        vst1.8          {d8},     [r0,:64], r2  @ Q0
        vst1.8          {d9},     [r1,:64], r2  @ Q0
        vst1.8          {d10},    [r0,:64], r2  @ Q1
        vst1.8          {d11},    [r1,:64], r2  @ Q1
        vst1.8          {d12},    [r0,:64]      @ Q2
        vst1.8          {d13},    [r1,:64]      @ Q2

        vpop            {q4-q7}
        bx              lr
endfunc
.endm

vp8_v_loop_filter8uv
vp8_v_loop_filter8uv _inner, inner=1

.macro  vp8_h_loop_filter16 name, inner=0, simple=0
function ff_vp8_h_loop_filter16\name\()_neon, export=1
        vpush           {q4-q7}
        sub             r0,  r0,  #4
    .if !\simple
        ldr             r12, [sp, #64]          @ hev_thresh
    .endif

        @ Load pixels:
        vld1.8          {d0},     [r0], r1      @ load first 8-line src data
        vld1.8          {d2},     [r0], r1
        vld1.8          {d4},     [r0], r1
        vld1.8          {d6},     [r0], r1
        vld1.8          {d8},     [r0], r1
        vld1.8          {d10},    [r0], r1
        vld1.8          {d12},    [r0], r1
        vld1.8          {d14},    [r0], r1
        vld1.8          {d1},     [r0], r1      @ load second 8-line src data
        vld1.8          {d3},     [r0], r1
        vld1.8          {d5},     [r0], r1
        vld1.8          {d7},     [r0], r1
        vld1.8          {d9},     [r0], r1
        vld1.8          {d11},    [r0], r1
        vld1.8          {d13},    [r0], r1
        vld1.8          {d15},    [r0], r1

        transpose8x16matrix

        vdup.8          q14, r2                 @ flim_E
    .if !\simple
        vdup.8          q15, r3                 @ flim_I
    .endif

        vp8_loop_filter inner=\inner, simple=\simple

        sub             r0,  r0,  r1, lsl #4    @ backup 16 rows

        transpose8x16matrix

        @ Store pixels:
        vst1.8          {d0},     [r0],     r1
        vst1.8          {d2},     [r0],     r1
        vst1.8          {d4},     [r0],     r1
        vst1.8          {d6},     [r0],     r1
        vst1.8          {d8},     [r0],     r1
        vst1.8          {d10},    [r0],     r1
        vst1.8          {d12},    [r0],     r1
        vst1.8          {d14},    [r0],     r1
        vst1.8          {d1},     [r0],     r1
        vst1.8          {d3},     [r0],     r1
        vst1.8          {d5},     [r0],     r1
        vst1.8          {d7},     [r0],     r1
        vst1.8          {d9},     [r0],     r1
        vst1.8          {d11},    [r0],     r1
        vst1.8          {d13},    [r0],     r1
        vst1.8          {d15},    [r0]

        vpop            {q4-q7}
        bx              lr
endfunc
.endm

vp8_h_loop_filter16
vp8_h_loop_filter16 _inner,  inner=1
vp8_h_loop_filter16 _simple, simple=1

.macro  vp8_h_loop_filter8uv name, inner=0
function ff_vp8_h_loop_filter8uv\name\()_neon, export=1
        vpush           {q4-q7}
        sub             r0,  r0,  #4
        sub             r1,  r1,  #4
        ldr             r12, [sp, #64]          @ flim_I

        @ Load pixels:
        vld1.8          {d0},     [r0], r2      @ load u
        vld1.8          {d1},     [r1], r2      @ load v
        vld1.8          {d2},     [r0], r2
        vld1.8          {d3},     [r1], r2
        vld1.8          {d4},     [r0], r2
        vld1.8          {d5},     [r1], r2
        vld1.8          {d6},     [r0], r2
        vld1.8          {d7},     [r1], r2
        vld1.8          {d8},     [r0], r2
        vld1.8          {d9},     [r1], r2
        vld1.8          {d10},    [r0], r2
        vld1.8          {d11},    [r1], r2
        vld1.8          {d12},    [r0], r2
        vld1.8          {d13},    [r1], r2
        vld1.8          {d14},    [r0], r2
        vld1.8          {d15},    [r1], r2

        transpose8x16matrix

        vdup.8          q14, r3                 @ flim_E
        vdup.8          q15, r12                @ flim_I
        ldr             r12, [sp, #68]          @ hev_thresh

        vp8_loop_filter inner=\inner

        sub             r0,  r0,  r2, lsl #3    @ backup u 8 rows
        sub             r1,  r1,  r2, lsl #3    @ backup v 8 rows

        transpose8x16matrix

        @ Store pixels:
        vst1.8          {d0},     [r0], r2
        vst1.8          {d1},     [r1], r2
        vst1.8          {d2},     [r0], r2
        vst1.8          {d3},     [r1], r2
        vst1.8          {d4},     [r0], r2
        vst1.8          {d5},     [r1], r2
        vst1.8          {d6},     [r0], r2
        vst1.8          {d7},     [r1], r2
        vst1.8          {d8},     [r0], r2
        vst1.8          {d9},     [r1], r2
        vst1.8          {d10},    [r0], r2
        vst1.8          {d11},    [r1], r2
        vst1.8          {d12},    [r0], r2
        vst1.8          {d13},    [r1], r2
        vst1.8          {d14},    [r0]
        vst1.8          {d15},    [r1]

        vpop            {q4-q7}
        bx              lr
endfunc
.endm

vp8_h_loop_filter8uv
vp8_h_loop_filter8uv _inner, inner=1

function ff_put_vp8_pixels16_neon, export=1
        ldr             r12, [sp, #0]           @ h
1:
        subs            r12, r12, #4
        vld1.8          {q0},     [r2], r3
        vld1.8          {q1},     [r2], r3
        vld1.8          {q2},     [r2], r3
        vld1.8          {q3},     [r2], r3
        vst1.8          {q0},     [r0,:128], r1
        vst1.8          {q1},     [r0,:128], r1
        vst1.8          {q2},     [r0,:128], r1
        vst1.8          {q3},     [r0,:128], r1
        bgt             1b
        bx              lr
endfunc

function ff_put_vp8_pixels8_neon, export=1
        ldr             r12, [sp, #0]           @ h
1:
        subs            r12, r12, #4
        vld1.8          {d0},     [r2], r3
        vld1.8          {d1},     [r2], r3
        vld1.8          {d2},     [r2], r3
        vld1.8          {d3},     [r2], r3
        vst1.8          {d0},     [r0,:64], r1
        vst1.8          {d1},     [r0,:64], r1
        vst1.8          {d2},     [r0,:64], r1
        vst1.8          {d3},     [r0,:64], r1
        bgt             1b
        bx              lr
endfunc

function ff_put_vp8_pixels4_neon, export=1
        ldr             r12, [sp, #0]           @ h
        push            {r4-r6,lr}
1:
        subs            r12, r12, #4
        ldr             r4,       [r2], r3
        ldr             r5,       [r2], r3
        ldr             r6,       [r2], r3
        ldr             lr,       [r2], r3
        str             r4,       [r0], r1
        str             r5,       [r0], r1
        str             r6,       [r0], r1
        str             lr,       [r0], r1
        bgt             1b
        pop             {r4-r6,pc}
endfunc

/* 4/6-tap 8th-pel MC */

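@ The subpel filter taps appear to be stored as positive magnitudes,
@ with the two negative taps (1 and 4 of the 6-tap filter) applied via
@ vmls; this is inferred from the vmls usage here, as the
@ subpel_filters table itself lives outside this section.  Each filter
@ is split into two accumulators (taps 0-2 and taps 3-5) that are
@ combined with a saturating add, and vqrshrun performs the final
@ rounded (x + 64) >> 7 with unsigned saturation.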
.macro  vp8_epel8_h6    d,   a,   b
        vext.8          d27, \a,  \b,  #1
        vmovl.u8        q8,  \a
        vext.8          d28, \a,  \b,  #2
        vmovl.u8        q9,  d27
        vext.8          d29, \a,  \b,  #3
        vmovl.u8        q10, d28
        vext.8          d30, \a,  \b,  #4
        vmovl.u8        q11, d29
        vext.8          d31, \a,  \b,  #5
        vmovl.u8        q12, d30
        vmul.u16        q10, q10, d0[2]
        vmovl.u8        q13, d31
        vmul.u16        q11, q11, d0[3]
        vmls.u16        q10, q9,  d0[1]
        vmls.u16        q11, q12, d1[0]
        vmla.u16        q10, q8,  d0[0]
        vmla.u16        q11, q13, d1[1]
        vqadd.s16       q11, q10, q11
        vqrshrun.s16    \d,  q11, #7
.endm

.macro  vp8_epel16_h6   d0,  d1,  s0,  s1,  s2,  q0,  q1
        vext.8          q14, \q0, \q1, #3
        vext.8          q15, \q0, \q1, #4
        vmovl.u8        q11, d28
        vmovl.u8        q14, d29
        vext.8          q3,  \q0, \q1, #2
        vmovl.u8        q12, d30
        vmovl.u8        q15, d31
        vext.8          q8,  \q0, \q1, #1
        vmovl.u8        q10, d6
        vmovl.u8        q3,  d7
        vext.8          q2,  \q0, \q1, #5
        vmovl.u8        q13, d4
        vmovl.u8        q2,  d5
        vmovl.u8        q9,  d16
        vmovl.u8        q8,  d17
        vmul.u16        q11, q11, d0[3]
        vmul.u16        q10, q10, d0[2]
        vmul.u16        q3,  q3,  d0[2]
        vmul.u16        q14, q14, d0[3]
        vmls.u16        q11, q12, d1[0]
        vmovl.u8        q12, \s0
        vmovl.u8        q1,  \s1
        vmls.u16        q10, q9,  d0[1]
        vmls.u16        q3,  q8,  d0[1]
        vmls.u16        q14, q15, d1[0]
        vmla.u16        q10, q12, d0[0]
        vmla.u16        q11, q13, d1[1]
        vmla.u16        q3,  q1,  d0[0]
        vmla.u16        q14, q2,  d1[1]
        vqadd.s16       q11, q10, q11
        vqadd.s16       q14, q3,  q14
        vqrshrun.s16    \d0, q11, #7
        vqrshrun.s16    \d1, q14, #7
.endm

.macro  vp8_epel8_v6    d0,  s0,  s1,  s2,  s3,  s4,  s5
        vmovl.u8        q10, \s2
        vmovl.u8        q11, \s3
        vmovl.u8        q9,  \s1
        vmovl.u8        q12, \s4
        vmovl.u8        q8,  \s0
        vmovl.u8        q13, \s5
        vmul.u16        q10, q10, d0[2]
        vmul.u16        q11, q11, d0[3]
        vmls.u16        q10, q9,  d0[1]
        vmls.u16        q11, q12, d1[0]
        vmla.u16        q10, q8,  d0[0]
        vmla.u16        q11, q13, d1[1]
        vqadd.s16       q11, q10, q11
        vqrshrun.s16    \d0, q11, #7
.endm

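@ As vp8_epel8_v6, but producing two output rows per invocation:
@ rows y and y+1 share five of their six source rows, so the seven
@ inputs \s0-\s6 are widened once and reused by both filters.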
.macro  vp8_epel8_v6_y2 d0, d1, s0, s1, s2, s3, s4, s5, s6
        vmovl.u8        q10, \s0
        vmovl.u8        q11, \s3
        vmovl.u8        q14, \s6
        vmovl.u8        q9,  \s1
        vmovl.u8        q12, \s4
        vmovl.u8        q8,  \s2
        vmovl.u8        q13, \s5
        vmul.u16        q10, q10, d0[0]
        vmul.u16        q15, q11, d0[3]
        vmul.u16        q11, q11, d0[2]
        vmul.u16        q14, q14, d1[1]
        vmls.u16        q10, q9,  d0[1]
        vmls.u16        q15, q12, d1[0]
        vmls.u16        q11, q8,  d0[1]
        vmls.u16        q14, q13, d1[0]
        vmla.u16        q10, q8,  d0[2]
        vmla.u16        q15, q13, d1[1]
        vmla.u16        q11, q9,  d0[0]
        vmla.u16        q14, q12, d0[3]
        vqadd.s16       q15, q10, q15
        vqadd.s16       q14, q11, q14
        vqrshrun.s16    \d0, q15, #7
        vqrshrun.s16    \d1, q14, #7
.endm

.macro  vp8_epel8_h4    d,   a,   b
        vext.8          d28, \a,  \b,  #1
        vmovl.u8        q9,  \a
        vext.8          d29, \a,  \b,  #2
        vmovl.u8        q10, d28
        vext.8          d30, \a,  \b,  #3
        vmovl.u8        q11, d29
        vmovl.u8        q12, d30
        vmul.u16        q10, q10, d0[2]
        vmul.u16        q11, q11, d0[3]
        vmls.u16        q10, q9,  d0[1]
        vmls.u16        q11, q12, d1[0]
        vqadd.s16       q11, q10, q11
        vqrshrun.s16    \d,  q11, #7
.endm

.macro  vp8_epel8_v4_y2 d0,  d1,  s0,  s1,  s2,  s3,  s4
        vmovl.u8        q9,  \s0
        vmovl.u8        q10, \s1
        vmovl.u8        q11, \s2
        vmovl.u8        q12, \s3
        vmovl.u8        q13, \s4
        vmul.u16        q8,  q10, d0[2]
        vmul.u16        q14, q11, d0[3]
        vmul.u16        q11, q11, d0[2]
        vmul.u16        q15, q12, d0[3]
        vmls.u16        q8,  q9,  d0[1]
        vmls.u16        q14, q12, d1[0]
        vmls.u16        q11, q10, d0[1]
        vmls.u16        q15, q13, d1[0]
        vqadd.s16       q8,  q8,  q14
        vqadd.s16       q11, q11, q15
        vqrshrun.s16    \d0, q8,  #7
        vqrshrun.s16    \d1, q11, #7
.endm

function ff_put_vp8_epel16_v6_neon, export=1
        sub             r2,  r2,  r3,  lsl #1
        push            {r4,lr}
        vpush           {d8-d15}

        ldr             r4,  [sp, #80]          @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #72]          @ h
        add             r4,  lr,  r4, lsl #4
        vld1.16         {q0},     [r4,:128]
1:
        vld1.8          {d2-d3},  [r2], r3
        vld1.8          {d4-d5},  [r2], r3
        vld1.8          {d6-d7},  [r2], r3
        vld1.8          {d8-d9},  [r2], r3
        vld1.8          {d10-d11},[r2], r3
        vld1.8          {d12-d13},[r2], r3
        vld1.8          {d14-d15},[r2]
        sub             r2,  r2,  r3,  lsl #2

        vp8_epel8_v6_y2 d2,  d4,  d2,  d4,  d6,  d8,  d10, d12, d14
        vp8_epel8_v6_y2 d3,  d5,  d3,  d5,  d7,  d9,  d11, d13, d15

        vst1.8          {d2-d3},  [r0,:128], r1
        vst1.8          {d4-d5},  [r0,:128], r1
        subs            r12, r12, #2
        bne             1b

        vpop            {d8-d15}
        pop             {r4,pc}
endfunc

function ff_put_vp8_epel16_h6_neon, export=1
        sub             r2,  r2,  #2
        push            {r4,lr}

        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4, lsl #4
        vld1.16         {q0},     [r4,:128]
1:
        vld1.8          {d2-d4},  [r2], r3

        vp8_epel16_h6   d2,  d3,  d2,  d3,  d4,  q1,  q2

        vst1.8          {d2-d3}, [r0,:128], r1
        subs            r12, r12, #1
        bne             1b

        pop             {r4,pc}
endfunc

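@ Two-pass 6-tap MC: the first pass filters horizontally into an
@ aligned scratch buffer on the stack, the second pass filters that
@ buffer vertically.  A 6-tap vertical filter over h output rows reads
@ h+5 input rows, hence the "add r12, r12, #5" and the buffer size of
@ 336 = 16 * (16 + 5) bytes; the extra 16 bytes allow lr to be rounded
@ up to 16-byte alignment with bic.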
953 function ff_put_vp8_epel16_h6v6_neon, export=1
954         sub             r2,  r2,  r3,  lsl #1
955         sub             r2,  r2,  #2
956         push            {r4,lr}
957         vpush           {d8-d9}
958
959         @ first pass (horizontal):
960         ldr             r4,  [sp, #28]          @ mx
961         movrel          lr,  subpel_filters-16
962         ldr             r12, [sp, #24]          @ h
963         add             r4,  lr,  r4, lsl #4
964         sub             sp,  sp,  #336+16
965         vld1.16         {q0},     [r4,:128]
966         add             lr,  sp,  #15
967         add             r12, r12, #5
968         bic             lr,  lr,  #15
969 1:
970         vld1.8          {d2,d3,d4}, [r2], r3
971
972         vp8_epel16_h6   d2,  d3,  d2,  d3,  d4,  q1,  q2
973
974         vst1.8          {d2-d3}, [lr,:128]!
975         subs            r12, r12, #1
976         bne             1b
977
978         @ second pass (vertical):
979         ldr             r4,  [sp, #336+16+32]   @ my
980         movrel          lr,  subpel_filters-16
981         ldr             r12, [sp, #336+16+24]   @ h
982         add             r4,  lr,  r4, lsl #4
983         add             lr,  sp,  #15
984         vld1.16         {q0},     [r4,:128]
985         bic             lr,  lr,  #15
986 2:
987         vld1.8          {d2-d5},  [lr,:128]!
988         vld1.8          {d6-d9},  [lr,:128]!
989         vld1.8          {d28-d31},[lr,:128]
990         sub             lr,  lr,  #48
991
992         vp8_epel8_v6    d2, d2, d4, d6, d8, d28, d30
993         vp8_epel8_v6    d3, d3, d5, d7, d9, d29, d31
994
995         vst1.8          {d2-d3}, [r0,:128], r1
996         subs            r12, r12, #1
997         bne             2b
998
999         add             sp,  sp,  #336+16
1000         vpop            {d8-d9}
1001         pop             {r4,pc}
1002 endfunc
1003
1004 function ff_put_vp8_epel8_v6_neon, export=1
1005         sub             r2,  r2,  r3,  lsl #1
1006         push            {r4,lr}
1007
1008         ldr             r4,  [sp, #16]          @ my
1009         movrel          lr,  subpel_filters-16
1010         ldr             r12, [sp, #8]           @ h
1011         add             r4,  lr,  r4, lsl #4
1012         vld1.16         {q0},     [r4,:128]
1013 1:
1014         vld1.8          {d2},  [r2], r3
1015         vld1.8          {d3},  [r2], r3
1016         vld1.8          {d4},  [r2], r3
1017         vld1.8          {d5},  [r2], r3
1018         vld1.8          {d6},  [r2], r3
1019         vld1.8          {d7},  [r2], r3
1020         vld1.8          {d28}, [r2]
1021
1022         sub             r2,  r2,  r3,  lsl #2
1023
1024         vp8_epel8_v6_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6,  d7,  d28
1025
1026         vst1.8          {d2}, [r0,:64], r1
1027         vst1.8          {d3}, [r0,:64], r1
1028         subs            r12, r12, #2
1029         bne             1b
1030
1031         pop             {r4,pc}
1032 endfunc
1033
1034 function ff_put_vp8_epel8_h6_neon, export=1
1035         sub             r2,  r2,  #2
1036         push            {r4,lr}
1037
1038         ldr             r4,  [sp, #12]          @ mx
1039         movrel          lr,  subpel_filters-16
1040         ldr             r12, [sp, #8]           @ h
1041         add             r4,  lr,  r4, lsl #4
1042         vld1.16         {q0},     [r4,:128]
1043 1:
1044         vld1.8          {d2,d3}, [r2], r3
1045
1046         vp8_epel8_h6    d2,  d2,  d3
1047
1048         vst1.8          {d2}, [r0,:64], r1
1049         subs            r12, r12, #1
1050         bne             1b
1051
1052         pop             {r4,pc}
1053 endfunc
1054
1055 function ff_put_vp8_epel8_h6v6_neon, export=1
1056         sub             r2,  r2,  r3,  lsl #1
1057         sub             r2,  r2,  #2
1058         push            {r4,lr}
1059
1060         @ first pass (horizontal):
1061         ldr             r4,  [sp, #12]          @ mx
1062         movrel          lr,  subpel_filters-16
1063         ldr             r12, [sp, #8]           @ h
1064         add             r4,  lr,  r4, lsl #4
1065         sub             sp,  sp,  #168+16
1066         vld1.16         {q0},     [r4,:128]
1067         add             lr,  sp,  #15
1068         add             r12, r12, #5
1069         bic             lr,  lr,  #15
1070 1:
1071         vld1.8          {d2,d3}, [r2], r3
1072
1073         vp8_epel8_h6    d2,  d2,  d3
1074
1075         vst1.8          {d2}, [lr,:64]!
1076         subs            r12, r12, #1
1077         bne             1b
1078
1079         @ second pass (vertical):
1080         ldr             r4,  [sp, #168+16+16]   @ my
1081         movrel          lr,  subpel_filters-16
1082         ldr             r12, [sp, #168+16+8]    @ h
1083         add             r4,  lr,  r4, lsl #4
1084         add             lr,  sp,  #15
1085         vld1.16         {q0},     [r4,:128]
1086         bic             lr,  lr,  #15
1087 2:
1088         vld1.8          {d2-d5},  [lr,:128]!
1089         vld1.8          {d6-d7},  [lr,:128]!
1090         vld1.8          {d30},    [lr,:64]
1091         sub             lr,  lr,  #32
1092
1093         vp8_epel8_v6_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6,  d7,  d30
1094
1095         vst1.8          {d2}, [r0,:64], r1
1096         vst1.8          {d3}, [r0,:64], r1
1097         subs            r12, r12, #2
1098         bne             2b
1099
1100         add             sp,  sp,  #168+16
1101         pop             {r4,pc}
1102 endfunc
1103
1104 function ff_put_vp8_epel8_v4_neon, export=1
1105         sub             r2,  r2,  r3
1106         push            {r4,lr}
1107
1108         ldr             r4,  [sp, #16]          @ my
1109         movrel          lr,  subpel_filters-16
1110         ldr             r12, [sp, #8]           @ h
1111         add             r4,  lr,  r4, lsl #4
1112         vld1.16         {q0},     [r4,:128]
1113 1:
1114         vld1.8          {d2},     [r2], r3
1115         vld1.8          {d3},     [r2], r3
1116         vld1.8          {d4},     [r2], r3
1117         vld1.8          {d5},     [r2], r3
1118         vld1.8          {d6},     [r2]
1119         sub             r2,  r2,  r3,  lsl #1
1120
1121         vp8_epel8_v4_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6
1122
1123         vst1.8          {d2}, [r0,:64], r1
1124         vst1.8          {d3}, [r0,:64], r1
1125         subs            r12, r12, #2
1126         bne             1b
1127
1128         pop             {r4,pc}
1129 endfunc
1130
1131 function ff_put_vp8_epel8_h4_neon, export=1
1132         sub             r2,  r2,  #1
1133         push            {r4,lr}
1134
1135         ldr             r4,  [sp, #12]          @ mx
1136         movrel          lr,  subpel_filters-16
1137         ldr             r12, [sp, #8]           @ h
1138         add             r4,  lr,  r4, lsl #4
1139         vld1.16         {q0},     [r4,:128]
1140 1:
1141         vld1.8          {d2,d3}, [r2], r3
1142
1143         vp8_epel8_h4    d2,  d2,  d3
1144
1145         vst1.8          {d2}, [r0,:64], r1
1146         subs            r12, r12, #1
1147         bne             1b
1148
1149         pop             {r4,pc}
1150 endfunc
1151
1152 function ff_put_vp8_epel8_h4v4_neon, export=1
1153         sub             r2,  r2,  r3
1154         sub             r2,  r2,  #1
1155         push            {r4,lr}
1156
1157         @ first pass (horizontal):
1158         ldr             r4,  [sp, #12]          @ mx
1159         movrel          lr,  subpel_filters-16
1160         ldr             r12, [sp, #8]           @ h
1161         add             r4,  lr,  r4, lsl #4
1162         sub             sp,  sp,  #168+16
1163         vld1.16         {q0},     [r4,:128]
1164         add             lr,  sp,  #15
1165         add             r12, r12, #3
1166         bic             lr,  lr,  #15
1167 1:
1168         vld1.8          {d2,d3}, [r2], r3
1169
1170         vp8_epel8_h4    d2,  d2,  d3
1171
1172         vst1.8          {d2}, [lr,:64]!
1173         subs            r12, r12, #1
1174         bne             1b
1175
1176         @ second pass (vertical):
1177         ldr             r4,  [sp, #168+16+16]   @ my
1178         movrel          lr,  subpel_filters-16
1179         ldr             r12, [sp, #168+16+8]    @ h
1180         add             r4,  lr,  r4, lsl #4
1181         add             lr,  sp,  #15
1182         vld1.16         {q0},     [r4,:128]
1183         bic             lr,  lr,  #15
1184 2:
1185         vld1.8          {d2-d5},  [lr,:128]!
1186         vld1.8          {d6},     [lr,:64]
1187         sub             lr,  lr,  #16
1188
1189         vp8_epel8_v4_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6
1190
1191         vst1.8          {d2},     [r0,:64], r1
1192         vst1.8          {d3},     [r0,:64], r1
1193         subs            r12, r12, #2
1194         bne             2b
1195
1196         add             sp,  sp,  #168+16
1197         pop             {r4,pc}
1198 endfunc
1199
1200 function ff_put_vp8_epel8_h6v4_neon, export=1
1201         sub             r2,  r2,  r3
1202         sub             r2,  r2,  #2
1203         push            {r4,lr}
1204
1205         @ first pass (horizontal):
1206         ldr             r4,  [sp, #12]          @ mx
1207         movrel          lr,  subpel_filters-16
1208         ldr             r12, [sp, #8]           @ h
1209         add             r4,  lr,  r4, lsl #4
1210         sub             sp,  sp,  #168+16
1211         vld1.16         {q0},     [r4,:128]
1212         add             lr,  sp,  #15
1213         add             r12, r12, #3
1214         bic             lr,  lr,  #15
1215 1:
1216         vld1.8          {d2,d3}, [r2], r3
1217
1218         vp8_epel8_h6    d2,  d2,  d3
1219
1220         vst1.8          {d2}, [lr,:64]!
1221         subs            r12, r12, #1
1222         bne             1b
1223
1224         @ second pass (vertical):
1225         ldr             r4,  [sp, #168+16+16]   @ my
1226         movrel          lr,  subpel_filters-16
1227         ldr             r12, [sp, #168+16+8]    @ h
1228         add             r4,  lr,  r4, lsl #4
1229         add             lr,  sp,  #15
1230         vld1.16         {q0},     [r4,:128]
1231         bic             lr,  lr,  #15
1232 2:
1233         vld1.8          {d2-d5},  [lr,:128]!
1234         vld1.8          {d6},     [lr,:64]
1235         sub             lr,  lr,  #16
1236
1237         vp8_epel8_v4_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6
1238
1239         vst1.8          {d2},     [r0,:64], r1
1240         vst1.8          {d3},     [r0,:64], r1
1241         subs            r12, r12, #2
1242         bne             2b
1243
1244         add             sp,  sp,  #168+16
1245         pop             {r4,pc}
1246 endfunc
1247
1248 function ff_put_vp8_epel8_h4v6_neon, export=1
1249         sub             r2,  r2,  r3,  lsl #1
1250         sub             r2,  r2,  #1
1251         push            {r4,lr}
1252
1253         @ first pass (horizontal):
1254         ldr             r4,  [sp, #12]          @ mx
1255         movrel          lr,  subpel_filters-16
1256         ldr             r12, [sp, #8]           @ h
1257         add             r4,  lr,  r4, lsl #4
1258         sub             sp,  sp,  #168+16
1259         vld1.16         {q0},     [r4,:128]
1260         add             lr,  sp,  #15
1261         add             r12, r12, #5
1262         bic             lr,  lr,  #15
1263 1:
1264         vld1.8          {d2,d3}, [r2], r3
1265
1266         vp8_epel8_h4    d2,  d2,  d3
1267
1268         vst1.8          {d2}, [lr,:64]!
1269         subs            r12, r12, #1
1270         bne             1b
1271
1272         @ second pass (vertical):
1273         ldr             r4,  [sp, #168+16+16]   @ my
1274         movrel          lr,  subpel_filters-16
1275         ldr             r12, [sp, #168+16+8]    @ h
1276         add             r4,  lr,  r4, lsl #4
1277         add             lr,  sp,  #15
1278         vld1.16         {q0},     [r4,:128]
1279         bic             lr,  lr,  #15
1280 2:
1281         vld1.8          {d2-d5},  [lr,:128]!
1282         vld1.8          {d6-d7},  [lr,:128]!
1283         vld1.8          {d30},    [lr,:64]
1284         sub             lr,  lr,  #32
1285
1286         vp8_epel8_v6_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6,  d7,  d30
1287
1288         vst1.8          {d2}, [r0,:64], r1
1289         vst1.8          {d3}, [r0,:64], r1
1290         subs            r12, r12, #2
1291         bne             2b
1292
1293         add             sp,  sp,  #168+16
1294         pop             {r4,pc}
1295 endfunc
1296
1297 .ltorg
1298
1299 function ff_put_vp8_epel4_v6_neon, export=1
1300         sub             r2,  r2,  r3,  lsl #1
1301         push            {r4,lr}
1302
1303         ldr             r4,  [sp, #16]          @ my
1304         movrel          lr,  subpel_filters-16
1305         ldr             r12, [sp, #8]           @ h
1306         add             r4,  lr,  r4, lsl #4
1307         vld1.16         {q0},     [r4,:128]
1308 1:
1309         vld1.32         {d2[]},   [r2], r3
1310         vld1.32         {d3[]},   [r2], r3
1311         vld1.32         {d4[]},   [r2], r3
1312         vld1.32         {d5[]},   [r2], r3
1313         vld1.32         {d6[]},   [r2], r3
1314         vld1.32         {d7[]},   [r2], r3
1315         vld1.32         {d28[]},  [r2]
1316         sub             r2,  r2,  r3,  lsl #2
1317         vld1.32         {d2[1]},  [r2], r3
1318         vld1.32         {d3[1]},  [r2], r3
1319         vld1.32         {d4[1]},  [r2], r3
1320         vld1.32         {d5[1]},  [r2], r3
1321         vld1.32         {d6[1]},  [r2], r3
1322         vld1.32         {d7[1]},  [r2], r3
1323         vld1.32         {d28[1]}, [r2]
1324         sub             r2,  r2,  r3,  lsl #2
1325
1326         vp8_epel8_v6_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6,  d7,  d28
1327
1328         vst1.32         {d2[0]},  [r0,:32], r1
1329         vst1.32         {d3[0]},  [r0,:32], r1
1330         vst1.32         {d2[1]},  [r0,:32], r1
1331         vst1.32         {d3[1]},  [r0,:32], r1
1332         subs            r12, r12, #4
1333         bne             1b
1334
1335         pop             {r4,pc}
1336 endfunc
1337
1338 function ff_put_vp8_epel4_h6_neon, export=1
1339         sub             r2,  r2,  #2
1340         push            {r4,lr}
1341
1342         ldr             r4,  [sp, #12]          @ mx
1343         movrel          lr,  subpel_filters-16
1344         ldr             r12, [sp, #8]           @ h
1345         add             r4,  lr,  r4, lsl #4
1346         vld1.16         {q0},     [r4,:128]
1347 1:
1348         vld1.8          {q1},     [r2], r3
1349         vp8_epel8_h6    d2,  d2,  d3
1350         vst1.32         {d2[0]},  [r0,:32], r1
1351         subs            r12, r12, #1
1352         bne             1b
1353
1354         pop             {r4,pc}
1355 endfunc
1356
function ff_put_vp8_epel4_h6v6_neon, export=1
        sub             r2,  r2,  r3,  lsl #1
        sub             r2,  r2,  #2
        push            {r4,lr}

        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4, lsl #4
        sub             sp,  sp,  #52+16
        vld1.16         {q0},     [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #5
        bic             lr,  lr,  #15
1:
        vld1.8          {q1},     [r2], r3
        vp8_epel8_h6    d2,  d2,  d3
        vst1.32         {d2[0]},  [lr,:32]!
        subs            r12, r12, #1
        bne             1b

        ldr             r4,  [sp, #52+16+16]    @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #52+16+8]     @ h
        add             r4,  lr,  r4, lsl #4
        add             lr,  sp,  #15
        vld1.16         {q0},     [r4,:128]
        bic             lr,  lr,  #15
2:
        vld1.8          {d2-d3},  [lr,:128]!
        vld1.8          {d6},     [lr,:64]!
        vld1.32         {d28[]},  [lr,:32]
        sub             lr,  lr,  #16
        vld1.8          {d4-d5},  [lr]!
        vld1.8          {d7},     [lr,:64]!
        vld1.32         {d28[1]}, [lr,:32]
        sub             lr,  lr,  #16
        vtrn.32         q1,  q2
        vtrn.32         d6,  d7
        vp8_epel8_v6_y2 d2,  d3,  d2,  d4,  d3,  d5,  d6,  d7,  d28
        vst1.32         {d2[0]},  [r0,:32], r1
        vst1.32         {d3[0]},  [r0,:32], r1
        vst1.32         {d2[1]},  [r0,:32], r1
        vst1.32         {d3[1]},  [r0,:32], r1
        subs            r12, r12, #4
        bne             2b

        add             sp,  sp,  #52+16
        pop             {r4,pc}
endfunc

function ff_put_vp8_epel4_h4v6_neon, export=1
        sub             r2,  r2,  r3,  lsl #1
        sub             r2,  r2,  #1
        push            {r4,lr}

        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4, lsl #4
        sub             sp,  sp,  #52+16
        vld1.16         {q0},     [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #5
        bic             lr,  lr,  #15
1:
        vld1.8          {d2},     [r2], r3
        vp8_epel8_h4    d2,  d2,  d2
        vst1.32         {d2[0]},  [lr,:32]!
        subs            r12, r12, #1
        bne             1b

        ldr             r4,  [sp, #52+16+16]    @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #52+16+8]     @ h
        add             r4,  lr,  r4, lsl #4
        add             lr,  sp,  #15
        vld1.16         {q0},     [r4,:128]
        bic             lr,  lr,  #15
2:
        vld1.8          {d2-d3},  [lr,:128]!
        vld1.8          {d6},     [lr,:64]!
        vld1.32         {d28[]},  [lr,:32]
        sub             lr,  lr,  #16
        vld1.8          {d4-d5},  [lr]!
        vld1.8          {d7},     [lr,:64]!
        vld1.32         {d28[1]}, [lr,:32]
        sub             lr,  lr,  #16
        vtrn.32         q1,  q2
        vtrn.32         d6,  d7
        vp8_epel8_v6_y2 d2,  d3,  d2,  d4,  d3,  d5,  d6,  d7,  d28
        vst1.32         {d2[0]},  [r0,:32], r1
        vst1.32         {d3[0]},  [r0,:32], r1
        vst1.32         {d2[1]},  [r0,:32], r1
        vst1.32         {d3[1]},  [r0,:32], r1
        subs            r12, r12, #4
        bne             2b

        add             sp,  sp,  #52+16
        pop             {r4,pc}
endfunc

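@ With a 4-tap vertical filter only h+3 intermediate rows are needed, so
@ the scratch buffer shrinks from 52 to 44 bytes (11 rows of 4 bytes for
@ h <= 8), plus 16 bytes for alignment.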
function ff_put_vp8_epel4_h6v4_neon, export=1
        sub             r2,  r2,  r3
        sub             r2,  r2,  #2
        push            {r4,lr}

        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4, lsl #4
        sub             sp,  sp,  #44+16
        vld1.16         {q0},     [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #3
        bic             lr,  lr,  #15
1:
        vld1.8          {q1},     [r2], r3
        vp8_epel8_h6    d2,  d2,  d3
        vst1.32         {d2[0]},  [lr,:32]!
        subs            r12, r12, #1
        bne             1b

        ldr             r4,  [sp, #44+16+16]    @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #44+16+8]     @ h
        add             r4,  lr,  r4, lsl #4
        add             lr,  sp,  #15
        vld1.16         {q0},     [r4,:128]
        bic             lr,  lr,  #15
2:
        vld1.8          {d2-d3},  [lr,:128]!
        vld1.32         {d6[]},   [lr,:32]
        sub             lr,  lr,  #8
        vld1.8          {d4-d5},  [lr]!
        vld1.32         {d6[1]},  [lr,:32]
        sub             lr,  lr,  #8
        vtrn.32         q1,  q2
        vp8_epel8_v4_y2 d2,  d3,  d2,  d4,  d3,  d5,  d6
        vst1.32         {d2[0]},  [r0,:32], r1
        vst1.32         {d3[0]},  [r0,:32], r1
        vst1.32         {d2[1]},  [r0,:32], r1
        vst1.32         {d3[1]},  [r0,:32], r1
        subs            r12, r12, #4
        bne             2b

        add             sp,  sp,  #44+16
        pop             {r4,pc}
endfunc

function ff_put_vp8_epel4_h4_neon, export=1
        sub             r2,  r2,  #1
        push            {r4,lr}

        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4, lsl #4
        vld1.16         {q0},     [r4,:128]
1:
        vld1.8          {d2},     [r2], r3
        vp8_epel8_h4    d2,  d2,  d2
        vst1.32         {d2[0]},  [r0,:32], r1
        subs            r12, r12, #1
        bne             1b

        pop             {r4,pc}
endfunc

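@ Pure vertical 4-tap filter: rows n..n+4 are packed into lane 0 and rows
@ n+2..n+6 into lane 1 of the same d registers, so each pass through the
@ loop filters four output rows at once.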
function ff_put_vp8_epel4_v4_neon, export=1
        sub             r2,  r2,  r3
        push            {r4,lr}

        ldr             r4,  [sp, #16]          @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4, lsl #4
        vld1.16         {q0},     [r4,:128]
1:
        vld1.32         {d2[]},   [r2], r3
        vld1.32         {d3[]},   [r2], r3
        vld1.32         {d4[]},   [r2], r3
        vld1.32         {d5[]},   [r2], r3
        vld1.32         {d6[]},   [r2]
        sub             r2,  r2,  r3,  lsl #1
        vld1.32         {d2[1]},  [r2], r3
        vld1.32         {d3[1]},  [r2], r3
        vld1.32         {d4[1]},  [r2], r3
        vld1.32         {d5[1]},  [r2], r3
        vld1.32         {d6[1]},  [r2]
        sub             r2,  r2,  r3,  lsl #1

        vp8_epel8_v4_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6

        vst1.32         {d2[0]},  [r0,:32], r1
        vst1.32         {d3[0]},  [r0,:32], r1
        vst1.32         {d2[1]},  [r0,:32], r1
        vst1.32         {d3[1]},  [r0,:32], r1
        subs            r12, r12, #4
        bne             1b

        pop             {r4,pc}
endfunc

function ff_put_vp8_epel4_h4v4_neon, export=1
        sub             r2,  r2,  r3
        sub             r2,  r2,  #1
        push            {r4,lr}

        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4, lsl #4
        sub             sp,  sp,  #44+16
        vld1.16         {q0},     [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #3
        bic             lr,  lr,  #15
1:
        vld1.8          {d2},     [r2], r3
        vp8_epel8_h4    d2,  d2,  d3
        vst1.32         {d2[0]},  [lr,:32]!
        subs            r12, r12, #1
        bne             1b

        ldr             r4,  [sp, #44+16+16]    @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #44+16+8]     @ h
        add             r4,  lr,  r4, lsl #4
        add             lr,  sp,  #15
        vld1.16         {q0},     [r4,:128]
        bic             lr,  lr,  #15
2:
        vld1.8          {d2-d3},  [lr,:128]!
        vld1.32         {d6[]},   [lr,:32]
        sub             lr,  lr,  #8
        vld1.8          {d4-d5},  [lr]!
        vld1.32         {d6[1]},  [lr,:32]
        sub             lr,  lr,  #8
        vtrn.32         q1,  q2
        vp8_epel8_v4_y2 d2,  d3,  d2,  d4,  d3,  d5,  d6
        vst1.32         {d2[0]},  [r0,:32], r1
        vst1.32         {d3[0]},  [r0,:32], r1
        vst1.32         {d2[1]},  [r0,:32], r1
        vst1.32         {d3[1]},  [r0,:32], r1
        subs            r12, r12, #4
        bne             2b

        add             sp,  sp,  #44+16
        pop             {r4,pc}
endfunc

@ note: the worst-case sum of all 6-tap filter values * 255 is 0x7f80, so
@ 16-bit arithmetic can be used to apply the filters
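@ (each filter's taps sum to 128, and 128 * 255 = 0x7f80, which still
@ fits in a signed 16-bit value)
@ The taps that are negative in the VP8 specification are stored here as
@ magnitudes; the epel macros earlier in this file apply them with
@ multiply-subtract instructions.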
const   subpel_filters, align=4
        .short     0,   6, 123,  12,   1,   0,   0,   0
        .short     2,  11, 108,  36,   8,   1,   0,   0
        .short     0,   9,  93,  50,   6,   0,   0,   0
        .short     3,  16,  77,  77,  16,   3,   0,   0
        .short     0,   6,  50,  93,   9,   0,   0,   0
        .short     1,   8,  36, 108,  11,   2,   0,   0
        .short     0,   1,  12, 123,   6,   0,   0,   0
endconst

/* Bilinear MC */
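@ Each bilinear output pixel is a rounded weighted average of two source
@ pixels:
@     dst[x] = (src[x] * (8 - frac) + src[x + 1] * frac + 4) >> 3
@ with frac = mx for the horizontal passes and frac = my for the vertical
@ ones; vmull/vmlal accumulate the two products and vrshrn.u16 #3 does
@ the rounded downshift.  Note that these functions advance both dst and
@ src by r1, i.e. they rely on the caller passing equal strides.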

function ff_put_vp8_bilin16_h_neon, export=1
        ldr             r3,  [sp, #4]           @ mx
        rsb             r12, r3,  #8
        vdup.8          d0,  r3
        vdup.8          d1,  r12
        ldr             r12, [sp]               @ h
1:
        subs            r12, r12, #2
        vld1.8          {d2-d4},  [r2], r1
        vext.8          q2,  q1,  q2,  #1
        vmull.u8        q8,  d2,  d1
        vmlal.u8        q8,  d4,  d0
        vld1.8          {d18-d20},[r2], r1
        vmull.u8        q3,  d3,  d1
        vmlal.u8        q3,  d5,  d0
        vext.8          q10, q9,  q10, #1
        vmull.u8        q11, d18, d1
        vmlal.u8        q11, d20, d0
        vmull.u8        q12, d19, d1
        vmlal.u8        q12, d21, d0
        vrshrn.u16      d4,  q8,  #3
        vrshrn.u16      d5,  q3,  #3
        vrshrn.u16      d6,  q11, #3
        vrshrn.u16      d7,  q12, #3
        vst1.8          {q2},     [r0,:128], r1
        vst1.8          {q3},     [r0,:128], r1
        bgt             1b

        bx              lr
endfunc

function ff_put_vp8_bilin16_v_neon, export=1
        ldr             r3,  [sp, #8]           @ my
        rsb             r12, r3,  #8
        vdup.8          d0,  r3
        vdup.8          d1,  r12
        ldr             r12, [sp]               @ h
        vld1.8          {q1},     [r2], r1
1:
        subs            r12, r12, #2
        vld1.8          {q2},     [r2], r1
        vmull.u8        q3,  d2,  d1
        vmlal.u8        q3,  d4,  d0
        vmull.u8        q8,  d3,  d1
        vmlal.u8        q8,  d5,  d0
        vld1.8          {q1},     [r2], r1
        vmull.u8        q9,  d4,  d1
        vmlal.u8        q9,  d2,  d0
        vmull.u8        q10, d5,  d1
        vmlal.u8        q10, d3,  d0
        vrshrn.u16      d4,  q3,  #3
        vrshrn.u16      d5,  q8,  #3
        vrshrn.u16      d6,  q9,  #3
        vrshrn.u16      d7,  q10, #3
        vst1.8          {q2},     [r0,:128], r1
        vst1.8          {q3},     [r0,:128], r1
        bgt             1b

        bx              lr
endfunc

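@ The 16-pixel hv case is software-pipelined: the first row is filtered
@ horizontally before the loop, and each iteration horizontally filters
@ two new rows while blending them vertically against the previous
@ result, which is carried across iterations in q2.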
function ff_put_vp8_bilin16_hv_neon, export=1
        ldr             r3,  [sp, #4]           @ mx
        rsb             r12, r3,  #8
        vdup.8          d0,  r3
        vdup.8          d1,  r12
        ldr             r3,  [sp, #8]           @ my
        rsb             r12, r3,  #8
        vdup.8          d2,  r3
        vdup.8          d3,  r12
        ldr             r12, [sp]               @ h

        vld1.8          {d4-d6},  [r2], r1
        vext.8          q3,  q2,  q3,  #1
        vmull.u8        q8,  d4,  d1
        vmlal.u8        q8,  d6,  d0
        vmull.u8        q9,  d5,  d1
        vmlal.u8        q9,  d7,  d0
        vrshrn.u16      d4,  q8,  #3
        vrshrn.u16      d5,  q9,  #3
1:
        subs            r12, r12, #2
        vld1.8          {d18-d20},[r2], r1
        vext.8          q10, q9,  q10, #1
        vmull.u8        q11, d18, d1
        vmlal.u8        q11, d20, d0
        vld1.8          {d26-d28},[r2], r1
        vmull.u8        q12, d19, d1
        vmlal.u8        q12, d21, d0
        vext.8          q14, q13, q14, #1
        vmull.u8        q8,  d26, d1
        vmlal.u8        q8,  d28, d0
        vmull.u8        q9,  d27, d1
        vmlal.u8        q9,  d29, d0
        vrshrn.u16      d6,  q11, #3
        vrshrn.u16      d7,  q12, #3
        vmull.u8        q12, d4,  d3
        vmlal.u8        q12, d6,  d2
        vmull.u8        q15, d5,  d3
        vmlal.u8        q15, d7,  d2
        vrshrn.u16      d4,  q8,  #3
        vrshrn.u16      d5,  q9,  #3
        vmull.u8        q10, d6,  d3
        vmlal.u8        q10, d4,  d2
        vmull.u8        q11, d7,  d3
        vmlal.u8        q11, d5,  d2
        vrshrn.u16      d24, q12, #3
        vrshrn.u16      d25, q15, #3
        vst1.8          {q12},    [r0,:128], r1
        vrshrn.u16      d20, q10, #3
        vrshrn.u16      d21, q11, #3
        vst1.8          {q10},    [r0,:128], r1
        bgt             1b

        bx              lr
endfunc

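@ 8-pixel-wide bilinear variants: same scheme as the 16-pixel versions,
@ with one d register per row and two output rows per iteration.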
function ff_put_vp8_bilin8_h_neon, export=1
        ldr             r3,  [sp, #4]           @ mx
        rsb             r12, r3,  #8
        vdup.8          d0,  r3
        vdup.8          d1,  r12
        ldr             r12, [sp]               @ h
1:
        subs            r12, r12, #2
        vld1.8          {q1},     [r2], r1
        vext.8          d3,  d2,  d3,  #1
        vmull.u8        q2,  d2,  d1
        vmlal.u8        q2,  d3,  d0
        vld1.8          {q3},     [r2], r1
        vext.8          d7,  d6,  d7,  #1
        vmull.u8        q8,  d6,  d1
        vmlal.u8        q8,  d7,  d0
        vrshrn.u16      d4,  q2,  #3
        vrshrn.u16      d16, q8,  #3
        vst1.8          {d4},     [r0,:64], r1
        vst1.8          {d16},    [r0,:64], r1
        bgt             1b

        bx              lr
endfunc

function ff_put_vp8_bilin8_v_neon, export=1
        ldr             r3,  [sp, #8]           @ my
        rsb             r12, r3,  #8
        vdup.8          d0,  r3
        vdup.8          d1,  r12
        ldr             r12, [sp]               @ h
        vld1.8          {d2},     [r2], r1
1:
        subs            r12, r12, #2
        vld1.8          {d3},     [r2], r1
        vmull.u8        q2,  d2,  d1
        vmlal.u8        q2,  d3,  d0
        vld1.8          {d2},     [r2], r1
        vmull.u8        q3,  d3,  d1
        vmlal.u8        q3,  d2,  d0
        vrshrn.u16      d4,  q2,  #3
        vrshrn.u16      d6,  q3,  #3
        vst1.8          {d4},     [r0,:64], r1
        vst1.8          {d6},     [r0,:64], r1
        bgt             1b

        bx              lr
endfunc

function ff_put_vp8_bilin8_hv_neon, export=1
        ldr             r3,  [sp, #4]           @ mx
        rsb             r12, r3,  #8
        vdup.8          d0,  r3
        vdup.8          d1,  r12
        ldr             r3,  [sp, #8]           @ my
        rsb             r12, r3,  #8
        vdup.8          d2,  r3
        vdup.8          d3,  r12
        ldr             r12, [sp]               @ h

        vld1.8          {q2},     [r2], r1
        vext.8          d5,  d4,  d5,  #1
        vmull.u8        q9,  d4,  d1
        vmlal.u8        q9,  d5,  d0
        vrshrn.u16      d22, q9,  #3
1:
        subs            r12, r12, #2
        vld1.8          {q3},     [r2], r1
        vext.8          d7,  d6,  d7,  #1
        vmull.u8        q8,  d6,  d1
        vmlal.u8        q8,  d7,  d0
        vld1.8          {q2},     [r2], r1
        vext.8          d5,  d4,  d5,  #1
        vmull.u8        q9,  d4,  d1
        vmlal.u8        q9,  d5,  d0
        vrshrn.u16      d16, q8,  #3
        vmull.u8        q10, d22, d3
        vmlal.u8        q10, d16, d2
        vrshrn.u16      d22, q9,  #3
        vmull.u8        q12, d16, d3
        vmlal.u8        q12, d22, d2
        vrshrn.u16      d20, q10, #3
        vst1.8          {d20},    [r0,:64], r1
        vrshrn.u16      d23, q12, #3
        vst1.8          {d23},    [r0,:64], r1
        bgt             1b

        bx              lr
endfunc

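@ 4-pixel-wide bilinear variants: two rows are interleaved into a single
@ d register (vtrn.32) so that one 8-lane multiply covers both.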
function ff_put_vp8_bilin4_h_neon, export=1
        ldr             r3,  [sp, #4]           @ mx
        rsb             r12, r3,  #8
        vdup.8          d0,  r3
        vdup.8          d1,  r12
        ldr             r12, [sp]               @ h
1:
        subs            r12, r12, #2
        vld1.8          {d2},     [r2], r1
        vext.8          d3,  d2,  d3,  #1
        vld1.8          {d6},     [r2], r1
        vext.8          d7,  d6,  d7,  #1
        vtrn.32         q1,  q3
        vmull.u8        q2,  d2,  d1
        vmlal.u8        q2,  d3,  d0
        vrshrn.u16      d4,  q2,  #3
        vst1.32         {d4[0]},  [r0,:32], r1
        vst1.32         {d4[1]},  [r0,:32], r1
        bgt             1b

        bx              lr
endfunc

function ff_put_vp8_bilin4_v_neon, export=1
        ldr             r3,  [sp, #8]           @ my
        rsb             r12, r3,  #8
        vdup.8          d0,  r3
        vdup.8          d1,  r12
        ldr             r12, [sp]               @ h
        vld1.32         {d2[]},   [r2], r1
1:
        vld1.32         {d3[]},   [r2]
        vld1.32         {d2[1]},  [r2], r1
        vld1.32         {d3[1]},  [r2], r1
        vmull.u8        q2,  d2,  d1
        vmlal.u8        q2,  d3,  d0
        vtrn.32         d3,  d2
        vrshrn.u16      d4,  q2,  #3
        vst1.32         {d4[0]},  [r0,:32], r1
        vst1.32         {d4[1]},  [r0,:32], r1
        subs            r12, r12, #2
        bgt             1b

        bx              lr
endfunc

function ff_put_vp8_bilin4_hv_neon, export=1
        ldr             r3,  [sp, #4]           @ mx
        rsb             r12, r3,  #8
        vdup.8          d0,  r3
        vdup.8          d1,  r12
        ldr             r3,  [sp, #8]           @ my
        rsb             r12, r3,  #8
        vdup.8          d2,  r3
        vdup.8          d3,  r12
        ldr             r12, [sp]               @ h

        vld1.8          {d4},     [r2], r1
        vext.8          d5,  d4,  d4,  #1
        vmull.u8        q9,  d4,  d1
        vmlal.u8        q9,  d5,  d0
        vrshrn.u16      d22, q9,  #3
1:
        subs            r12, r12, #2
        vld1.8          {d6},     [r2], r1
        vext.8          d7,  d6,  d6,  #1
        vld1.8          {d4},     [r2], r1
        vext.8          d5,  d4,  d4,  #1
        vtrn.32         q3,  q2
        vmull.u8        q8,  d6,  d1
        vmlal.u8        q8,  d7,  d0
        vrshrn.u16      d16, q8,  #3
        vmull.u8        q10, d16, d2
        vtrn.32         d22, d16
        vmlal.u8        q10, d22, d3
        vrev64.32       d22, d16
        vrshrn.u16      d20, q10, #3
        vst1.32         {d20[0]}, [r0,:32], r1
        vst1.32         {d20[1]}, [r0,:32], r1
        bgt             1b

        bx              lr
endfunc