Removed build warnings
[platform/adaptation/ap_samsung/libexynos-common.git] / libswconverter / csc_tiled_to_linear_crop_neon.s
1 /*
2  *
3  * Copyright 2012 Samsung Electronics S.LSI Co. LTD
4  *
5  * Licensed under the Apache License, Version 2.0 (the "License");
6  * you may not use this file except in compliance with the License.
7  * You may obtain a copy of the License at
8  *
9  *      http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  */
17
18 /*
19  * @file    csc_tiled_to_linear_crop_neon.s
20  * @brief   SEC_OMX specific define
21  * @author  ShinWon Lee (shinwon.lee@samsung.com)
22  * @version 1.0
23  * @history
24  *   2012.02.01 : Create
25  */
26
27 /*
28  * Converts tiled data to linear
29  * Crops left, top, right, bottom
30  * 1. Y of NV12T to Y of YUV420P
31  * 2. Y of NV12T to Y of YUV420S
32  * 3. UV of NV12T to UV of YUV420S
33  *
34  * @param yuv420_dest
35  *   Y or UV plane address of YUV420[out]
36  *
37  * @param nv12t_src
38  *   Y or UV plane address of NV12T[in]
39  *
40  * @param yuv420_width
41  *   Width of YUV420[in]
42  *
43  * @param yuv420_height
44  *   Y: Height of YUV420, UV: Height/2 of YUV420[in]
45  *
46  * @param left
47  *   Crop size of left. It should be even.
48  *
49  * @param top
50  *   Crop size of top. It should be even.
51  *
52  * @param right
53  *   Crop size of right. It should be even.
54  *
55  * @param bottom
56  *   Crop size of bottom. It should be even.
57  */
58
59     .arch armv7-a
60     .text
61     .global csc_tiled_to_linear_crop_neon
62     .type   csc_tiled_to_linear_crop_neon, %function

    @ void csc_tiled_to_linear_crop_neon(void *yuv420_dest,      /* r0 */
    @                                    void *nv12t_src,        /* r1 */
    @                                    int   yuv420_width,     /* r2 */
    @                                    int   yuv420_height,    /* r3 */
    @                                    int left, int top, int right, int bottom)
    @ The four crop arguments arrive on the stack; after the stmfd below
    @ pushes 10 registers (40 bytes) they are addressed as:
    @   [sp,#40]=left  [sp,#44]=top  [sp,#48]=right  [sp,#52]=bottom
    @ Three copy paths are selected by the cropped width
    @ (yuv420_width-left-right): >=256 bytes, >=64 bytes, else 2 bytes
    @ at a time.
63 csc_tiled_to_linear_crop_neon:
64     .fnstart

66     @r0     yuv420_dest
67     @r1     nv12t_src
68     @r2     yuv420_width
69     @r3     yuv420_height
70         @r4
71     @r5     i
72     @r6     j
73     @r7     tiled_offset
74     @r8     tiled_offset1
75     @r9     linear_offset
76     @r10    temp1
77     @r11    temp2
78     @r12    temp3
79     @r14    temp4

81     stmfd       sp!, {r4-r12,r14}       @ backup registers

83     ldr         r12, [sp, #48]          @ r12 = right
84     ldr         r10, [sp, #40]          @ r10 = left
85     sub         r12, r2, r12            @ temp3 = yuv420_width-right@
86     sub         r10, r12, r10           @ temp1 = temp3-left@
87     cmp         r10, #256               @ if (temp1 >= 256)
88     blt         LOOP_HEIGHT_64_START

    @ ---- Fast path: cropped width >= 256. One iteration per output row;
    @ a partial left edge, then 256-byte bursts, then a partial right edge.
90     ldr         r5, [sp, #44]           @ i = top
91 LOOP_HEIGHT_256:
92     ldr         r6, [sp, #40]           @ j = left
93     mov         r14, r5, asr #5         @ temp4 = i>>5
94     bic         r12, r6, #0xFF          @ temp3 = (j>>8)<<8
95     mov         r12, r12, asr #6        @ temp3 = temp3>>6
96     and         r11, r14, #0x1          @ if (temp4 & 0x1)
97     cmp         r11, #0x1
98     bne         LOOP_HEIGHT_256_GET_TILED_EVEN
99 LOOP_HEIGHT_256_GET_TILED_ODD:
100     sub         r7, r14, #1             @ tiled_offset = temp4-1
101     add         r10, r2, #127           @ temp1 = ((yuv420_width+127)>>7)<<7
102     bic         r10, r10, #0x7F
103     mov         r10, r10, asr #6        @ tiled_offset = tiled_offset*(temp1>>6)
104     mul         r7, r7, r10
105     add         r7, r7, r12             @ tiled_offset = tiled_offset+temp3
106     add         r7, r7, #2              @ tiled_offset = tiled_offset+2
107     bic         r10, r12, #0x3          @ temp1 = (temp3>>2)<<2
108     add         r7, r7, r10             @ tiled_offset = tiled_offset+temp1
109     mov         r7, r7, lsl #11         @ tiled_offset = tiled_offset<<11
110     add         r8, r7, #4096           @ tiled_offset1 = tiled_offset+2048*2
111     mov         r14, #8                 @ temp4 = tile stride multiplier for this row parity
112     b           LOOP_HEIGHT_256_GET_TILED_END

114 LOOP_HEIGHT_256_GET_TILED_EVEN:
115     add         r11, r3, #31            @ temp2 = ((yuv420_height+31)>>5)<<5
116     bic         r11, r11, #0x1F
117     add         r10, r5, #32            @ if ((i+32)<temp2)
118     cmp         r10, r11
119     bge         LOOP_HEIGHT_256_GET_TILED_EVEN1
120     add         r10, r12, #2            @ temp1 = temp3+2
121     bic         r10, r10, #0x3          @ temp1 = (temp1>>2)<<2
122     add         r7, r12, r10            @ tiled_offset = temp3+temp1@
123     add         r10, r2, #127           @ temp1 = ((yuv420_width+127)>>7)<<7
124     bic         r10, r10, #0x7F
125     mov         r10, r10, asr #6        @ tiled_offset = tiled_offset+temp4*(temp1>>6)
126     mla         r7, r14, r10, r7
127     mov         r7, r7, lsl #11         @ tiled_offset = tiled_offset<<11
128     add         r8, r7, #12288          @ tiled_offset1 = tiled_offset+2048*6
129     mov         r14, #8
130     b           LOOP_HEIGHT_256_GET_TILED_END

    @ Last (possibly partial) band of tiles near the bottom of the image.
132 LOOP_HEIGHT_256_GET_TILED_EVEN1:
133     add         r10, r2, #127           @ temp1 = ((yuv420_width+127)>>7)<<7
134     bic         r10, r10, #0x7F
135     mov         r10, r10, asr #6        @ tiled_offset = temp4*(temp1>>6)
136     mul         r7, r14, r10
137     add         r7, r7, r12             @ tiled_offset = tiled_offset+temp3
138     mov         r7, r7, lsl #11         @ tiled_offset = tiled_offset<<11
139     add         r8, r7, #4096           @ tiled_offset1 = tiled_offset+2048*2
140     mov         r14, #4

142 LOOP_HEIGHT_256_GET_TILED_END:

    @ Left edge: copy from j=left up to the next 256-byte tile boundary.
    @ temp3 = bytes to that boundary (1..256); the 192/128/64 ladder below
    @ picks how many whole 64-byte chunks are needed.
144     ldr         r12, [sp, #48]          @ right
145     ldr         r9, [sp, #44]           @ top
146     and         r10, r5, #0x1F          @ temp1 = i&0x1F
147     add         r7, r7, r10, lsl #6     @ tiled_offset = tiled_offset+64*(temp1)
148     add         r8, r8, r10, lsl #6     @ tiled_offset1 = tiled_offset1+64*(temp1)
149     sub         r11, r2, r6             @ temp2 = yuv420_width-left(==j)-right
150     sub         r11, r11, r12
151     sub         r9, r5, r9              @ linear_offset = temp2*(i-top)@
152     mul         r9, r11, r9
153     add         r12, r6, #256           @ temp3 = ((j+256)>>8)<<8@
154     bic         r12, r12, #0xFF
155     sub         r12, r12, r6            @ temp3 = temp3-j@
156     and         r10, r6, #0x3F          @ temp1 = left(==j)&0x3F

158     cmp         r12, #192               @ if (temp3 > 192)
159     ble         LOOP_HEIGHT_256_LEFT_192
160     add         r11, r1, r7             @ r11 = nv12t_src+tiled_offset+temp1
161     add         r11, r11, r10
162     pld         [r11]
163     add         r12, r1, r7             @ r12 = nv12t_src+tiled_offset+2048
164     pld         [r11, #32]
165     add         r12, r12, #2048
166     pld         [r12]
167     cmp         r10, #0
    @ The NE condition below comes from "cmp r10,#0" (j not 64-aligned);
    @ MEMCOPY_UNDER_64 re-establishes these flags before returning, so the
    @ ldmnefd/bne after the call still see the same condition.
168     pld         [r12, #32]
169     stmnefd     sp!, {r9-r12, r14}      @ backup registers
170     rsbne       r10, r10, #64
171     blne        MEMCOPY_UNDER_64
172     ldmnefd     sp!, {r9-r12, r14}      @ restore registers
173     bne         LOOP_HEIGHT_256_LEFT_256_64
174     vld1.8      {q0, q1}, [r11]!        @ load {nv12t_src+tiled_offset+temp1, 64}
175     vld1.8      {q2, q3}, [r11]
176     add         r11, r0, r9             @ r11 = yuv420_dest+linear_offset
177     vst1.8      {q0, q1}, [r11]!        @ store {yuv420_dest+linear_offset, 64}
178     vst1.8      {q2, q3}, [r11]!
179 LOOP_HEIGHT_256_LEFT_256_64:
180     add         r11, r1, r8             @ r11 = nv12t_src+tiled_offset1
181     pld         [r11]
182     vld1.8      {q4, q5}, [r12]!        @ load {nv12t_src+tiled_offset+2048, 64}
183     pld         [r11, #32]
184     vld1.8      {q6, q7}, [r12]
185     add         r12, r11, #2048         @ r12 = nv12t_src+tiled_offset1+2048
186     pld         [r12]
187     vld1.8      {q8, q9}, [r11]!        @ load {nv12t_src+tiled_offset1, 64}
188     pld         [r12, #32]
189     vld1.8      {q10, q11}, [r11]
190     vld1.8      {q12, q13}, [r12]!      @ load {nv12t_src+tiled_offset1+2048, 64}
191     vld1.8      {q14, q15}, [r12]

193     sub         r11, r0, r10            @ r11 = yuv420_dest+linear_offset+64-temp1
194     add         r12, r9, #64
195     add         r11, r11, r12

197     vst1.8      {q4, q5}, [r11]!       @ store {yuv420_dest+linear_offset+64-temp1, 64}
198     vst1.8      {q6, q7}, [r11]!
199     vst1.8      {q8, q9}, [r11]!       @ store {yuv420_dest+linear_offset+128-temp1, 64}
200     vst1.8      {q10, q11}, [r11]!
201     vst1.8      {q12, q13}, [r11]!     @ store {yuv420_dest+linear_offset+192-temp1, 64}
202     vst1.8      {q14, q15}, [r11]!

204     add         r9, r9, #256            @ linear_offset advances by the bytes just stored
205     sub         r9, r9, r10
206     b           LOOP_HEIGHT_256_LEFT_END

208 LOOP_HEIGHT_256_LEFT_192:
209     cmp         r12, #128               @ if (temp3 > 128)
210     ble         LOOP_HEIGHT_256_LEFT_128
211     add         r11, r1, r7             @ r11 = nv12t_src+tiled_offset+2048+temp1
212     add         r11, r11, r10
213     add         r11, r11, #2048
214     pld         [r11]
215     add         r12, r1, r8             @ r12 = nv12t_src+tiled_offset1
216     pld         [r11, #32]
217     cmp         r10, #0
218     pld         [r12]
219     stmnefd     sp!, {r9-r12, r14}      @ backup registers
220     pld         [r12, #32]
221     rsbne       r10, r10, #64
222     blne        MEMCOPY_UNDER_64
223     ldmnefd     sp!, {r9-r12, r14}      @ restore registers
224     bne         LOOP_HEIGHT_256_LEFT_192_64
225     vld1.8      {q0, q1}, [r11]!        @ load {nv12t_src+tiled_offset+2048+temp1, 64}
226     vld1.8      {q2, q3}, [r11]
227     add         r11, r0, r9             @ r11 = yuv420_dest+linear_offset
228     vst1.8      {q0, q1}, [r11]!        @ store {yuv420_dest+linear_offset, 64}
229     vst1.8      {q2, q3}, [r11]!
230 LOOP_HEIGHT_256_LEFT_192_64:
231     add         r11, r1, r8             @ r11 = nv12t_src+tiled_offset1+2048
232     add         r11, r11, #2048
233     pld         [r11]
234     vld1.8      {q4, q5}, [r12]!        @ load {nv12t_src+tiled_offset1, 64}
235     pld         [r11, #32]
236     vld1.8      {q6, q7}, [r12]
237     vld1.8      {q8, q9}, [r11]!        @ load {nv12t_src+tiled_offset1+2048, 64}
238     vld1.8      {q10, q11}, [r11]

240     sub         r11, r0, r10            @ r11 = yuv420_dest+linear_offset+64-temp1
241     add         r12, r9, #64
242     add         r11, r11, r12

244     vst1.8      {q4, q5}, [r11]!        @ store {yuv420_dest+linear_offset+64-temp1, 64}
245     vst1.8      {q6, q7}, [r11]!
246     vst1.8      {q8, q9}, [r11]!        @ store {yuv420_dest+linear_offset+128-temp1, 64}
247     vst1.8      {q10, q11}, [r11]!

249     add         r9, r9, #192
250     sub         r9, r9, r10
251     b           LOOP_HEIGHT_256_LEFT_END

253 LOOP_HEIGHT_256_LEFT_128:
254     cmp         r12, #64                @ if (temp3 > 64)
255     ble         LOOP_HEIGHT_256_LEFT_64
256     add         r11, r1, r8             @ r11 = nv12t_src+tiled_offset1+temp1
257     add         r11, r11, r10
258     pld         [r11]
259     add         r12, r1, r8             @ r12 = nv12t_src+tiled_offset1
260     add         r12, r12, #2048
261     pld         [r11, #32]
262     cmp         r10, #0
263     pld         [r12]
264     stmnefd     sp!, {r9-r12, r14}      @ backup registers
265     pld         [r12, #32]
266     rsbne       r10, r10, #64
267     blne        MEMCOPY_UNDER_64
268     ldmnefd     sp!, {r9-r12, r14}      @ restore registers
269     bne         LOOP_HEIGHT_256_LEFT_128_64
270     vld1.8      {q0, q1}, [r11]!        @ load {nv12t_src+tiled_offset1+temp1, 64}
271     vld1.8      {q2, q3}, [r11]
272     add         r11, r0, r9             @ r11 = yuv420_dest+linear_offset
273     vst1.8      {q0, q1}, [r11]!        @ store {yuv420_dest+linear_offset, 64}
274     vst1.8      {q2, q3}, [r11]!
275 LOOP_HEIGHT_256_LEFT_128_64:
276     vld1.8      {q4, q5}, [r12]!        @ load {nv12t_src+tiled_offset1, 64}
277     vld1.8      {q6, q7}, [r12]

279     sub         r11, r0, r10            @ r11 = yuv420_dest+linear_offset+64-temp1
280     add         r12, r9, #64
281     add         r11, r11, r12

283     vst1.8      {q4, q5}, [r11]!        @ store {yuv420_dest+linear_offset+64-temp1, 64}
284     vst1.8      {q6, q7}, [r11]!

286     add         r9, r9, #128
287     sub         r9, r9, r10
288     b           LOOP_HEIGHT_256_LEFT_END

290 LOOP_HEIGHT_256_LEFT_64:
291     add         r11, r1, r8             @ r11 = nv12t_src+tiled_offset1+2048+temp1
292     add         r11, r11, #2048
293     add         r11, r11, r10
294     cmp         r10, #0
295     pld         [r11]
296     stmnefd     sp!, {r9-r12, r14}      @ backup registers
297     pld         [r11, #32]
298     rsbne       r10, r10, #64
299     blne        MEMCOPY_UNDER_64
300     ldmnefd     sp!, {r9-r12, r14}      @ restore registers
301     bne         LOOP_HEIGHT_256_LEFT_64_64
302     vld1.8      {q0, q1}, [r11]!        @ load {nv12t_src+tiled_offset1+temp1, 64}
303     vld1.8      {q2, q3}, [r11]
304     add         r11, r0, r9             @ r11 = yuv420_dest+linear_offset
305     vst1.8      {q0, q1}, [r11]!        @ store {yuv420_dest+linear_offset, 64}
306     vst1.8      {q2, q3}, [r11]!
307 LOOP_HEIGHT_256_LEFT_64_64:
308     add         r9, r9, #64
309     sub         r9, r9, r10

311 LOOP_HEIGHT_256_LEFT_END:

    @ Middle: whole 256-byte-aligned spans, 256 bytes per iteration.
313     ldr         r12, [sp, #48]          @ right
314     add         r7, r7, r14, lsl #11    @ tiled_offset = tiled_offset+temp4*2048
315     add         r10, r1, r7             @ r10 = nv12t_src+tiled_offset
316     pld         [r10]
317     bic         r6, r6, #0xFF           @ j = (left>>8)<<8
318     pld         [r10, #32]
319     add         r6, r6, #256            @ j = j + 256
320     sub         r11, r2, r12            @ temp2 = yuv420_width-right-256
321     sub         r11, r11, #256
322     cmp         r6, r11
323     bgt         LOOP_HEIGHT_256_WIDTH_END

325 LOOP_HEIGHT_256_WIDTH:
326     add         r12, r10, #2048         @ r12 = nv12t_src+tiled_offset+2048
327     pld         [r12]
328     vld1.8      {q0, q1}, [r10]!        @ load {nv12t_src+tiled_offset, 64}
329     pld         [r12, #32]
330     vld1.8      {q2, q3}, [r10]

332     add         r8, r8, r14, lsl #11    @ tiled_offset1 = tiled_offset1+temp4*2048
333     add         r10, r1, r8             @ r10 = nv12t_src+tiled_offset1
334     pld         [r10]
335     vld1.8      {q4, q5}, [r12]!        @ load {nv12t_src+tiled_offset+2048, 64}
336     pld         [r10, #32]
337     vld1.8      {q6, q7}, [r12]

339     add         r12, r10, #2048         @ r12 = nv12t_src+tiled_offset+2048
340     pld         [r12]
341     vld1.8      {q8, q9}, [r10]!        @ load {nv12t_src+tiled_offset+2048, 64}
342     pld         [r12, #32]
343     vld1.8      {q10, q11}, [r10]

345     add         r7, r7, r14, lsl #11    @ tiled_offset = tiled_offset+temp4*2048
346     add         r10, r1, r7
347     pld         [r10]
348     vld1.8      {q12, q13}, [r12]!      @ load {nv12t_src+tiled_offset+2048, 64}
349     pld         [r10, #32]
350     vld1.8      {q14, q15}, [r12]

352     add         r12, r0, r9             @ r12 = yuv420_dest+linear_offset
353     vst1.8      {q0, q1}, [r12]!
354     vst1.8      {q2, q3}, [r12]!
355     vst1.8      {q4, q5}, [r12]!
356     vst1.8      {q6, q7}, [r12]!
357     vst1.8      {q8, q9}, [r12]!
358     vst1.8      {q10, q11}, [r12]!
359     vst1.8      {q12, q13}, [r12]!
360     vst1.8      {q14, q15}, [r12]!
361     add         r9, r9, #256            @ linear_offset = linear_offset+256

363     add         r12, r10, #2048         @ r12 = nv12t_src+tiled_offset+2048

365     add         r6, r6, #256            @ j=j+256
366     cmp         r6, r11                 @ j<=temp2
367     ble         LOOP_HEIGHT_256_WIDTH

369 LOOP_HEIGHT_256_WIDTH_END:

    @ Right edge: 0..255 bytes remain; same 192/128/64 ladder as the left
    @ edge, with the sub-64 tail handed to MEMCOPY_UNDER_64.
371     add         r8, r8, r14, lsl #11    @ tiled_offset1 = tiled_offset1+temp4*2048
372     ldr         r14, [sp, #48]          @ right
373     sub         r11, r2, r6             @ temp2 = yuv420_width-right-j
374     sub         r11, r11, r14
375     cmp         r11, #0
376     beq         LOOP_HEIGHT_256_RIGHT_END
377     cmp         r11, #192
378     ble         LOOP_HEIGHT_256_RIGHT_192
379     add         r12, r10, #2048
380     pld         [r12]
381     vld1.8      {q0, q1}, [r10]!        @ load {nv12t_src+tiled_offset}
382     pld         [r12, #32]
383     vld1.8      {q2, q3}, [r10]

385     add         r10, r1, r8             @ r10 = nv12t_src+tiled_offset1
386     pld         [r10]
387     vld1.8      {q4, q5}, [r12]!        @ load {nv12t_src+tiled_offset+2048}
388     pld         [r10, #32]
389     vld1.8      {q6, q7}, [r12]

391     add         r14, r10, #2048         @ r10 = nv12t_src+tiled_offset1+2048
392     pld         [r14]
393     vld1.8      {q8, q9}, [r10]!        @ load {nv12t_src+tiled_offset1}
394     pld         [r14, #32]
395     vld1.8      {q10, q11}, [r10]

397     add         r12, r0, r9             @ r12 = yuv420_dest+linear_offset
398     vst1.8      {q0, q1}, [r12]!
399     vst1.8      {q2, q3}, [r12]!
400     vst1.8      {q4, q5}, [r12]!
401     vst1.8      {q6, q7}, [r12]!
402     vst1.8      {q8, q9}, [r12]!
403     vst1.8      {q10, q11}, [r12]!
404     add         r9, r9, #192            @ linear_offset = linear_offset+192

406     stmfd       sp!, {r9-r12, r14}      @ backup registers
407     sub         r10, r11, #192          @ r10 = remaining bytes (<64)
408     mov         r11, r14                @ r11 = src for the tail copy
409     bl          MEMCOPY_UNDER_64
410     ldmfd       sp!, {r9-r12, r14}      @ restore registers
411     b           LOOP_HEIGHT_256_RIGHT_END

413 LOOP_HEIGHT_256_RIGHT_192:
414     cmp         r11, #128
415     ble         LOOP_HEIGHT_256_RIGHT_128
416     add         r12, r10, #2048
417     pld         [r12]
418     vld1.8      {q0, q1}, [r10]!        @ load {nv12t_src+tiled_offset}
419     pld         [r12, #32]
420     vld1.8      {q2, q3}, [r10]

422     add         r14, r1, r8             @ r10 = nv12t_src+tiled_offset1
423     pld         [r14]
424     vld1.8      {q4, q5}, [r12]!        @ load {nv12t_src+tiled_offset+2048}
425     pld         [r14, #32]
426     vld1.8      {q6, q7}, [r12]

428     add         r12, r0, r9             @ r12 = yuv420_dest+linear_offset
429     vst1.8      {q0, q1}, [r12]!
430     vst1.8      {q2, q3}, [r12]!
431     vst1.8      {q4, q5}, [r12]!
432     vst1.8      {q6, q7}, [r12]!
433     add         r9, r9, #128            @ linear_offset = linear_offset+128

435     stmfd       sp!, {r9-r12, r14}      @ backup registers
436     sub         r10, r11, #128          @ r10 = remaining bytes (<64)
437     mov         r11, r14                @ r11 = src for the tail copy
438     bl          MEMCOPY_UNDER_64
439     ldmfd       sp!, {r9-r12, r14}      @ restore registers
440     b           LOOP_HEIGHT_256_RIGHT_END

442 LOOP_HEIGHT_256_RIGHT_128:
443     cmp         r11, #64
444     ble         LOOP_HEIGHT_256_RIGHT_64
445     add         r14, r10, #2048
446     pld         [r14]
447     vld1.8      {q0, q1}, [r10]!        @ load {nv12t_src+tiled_offset}
448     pld         [r14, #32]
449     vld1.8      {q2, q3}, [r10]

451     add         r12, r0, r9             @ r12 = yuv420_dest+linear_offset
452     vst1.8      {q0, q1}, [r12]!
453     vst1.8      {q2, q3}, [r12]!
454     add         r9, r9, #64            @ linear_offset = linear_offset+64

456     stmfd       sp!, {r9-r12, r14}      @ backup registers
457     sub         r10, r11, #64           @ r10 = remaining bytes (<64)
458     mov         r11, r14                @ r11 = src for the tail copy
459     bl          MEMCOPY_UNDER_64
460     ldmfd       sp!, {r9-r12, r14}      @ restore registers
461     b           LOOP_HEIGHT_256_RIGHT_END

463 LOOP_HEIGHT_256_RIGHT_64:
464     stmfd       sp!, {r9-r12, r14}      @ backup registers
465     mov         r14, r11                @ swap: r10=count, r11=src for MEMCOPY_UNDER_64
466     mov         r11, r10
467     mov         r10, r14
468     bl          MEMCOPY_UNDER_64
469     ldmfd       sp!, {r9-r12, r14}      @ restore registers

471 LOOP_HEIGHT_256_RIGHT_END:

473     ldr         r14, [sp, #52]          @ bottom
474     add         r5, r5, #1              @ i=i+1
475     sub         r14, r3, r14            @ i<yuv420_height-bottom
476     cmp         r5, r14
477     blt         LOOP_HEIGHT_256
478     b           RESTORE_REG

    @ ---- Middle path: 64 <= cropped width < 256. Each 64-byte chunk's
    @ source address comes from tile_4x2_read_asm(width, height, j, i).
480 LOOP_HEIGHT_64_START:
481     cmp         r10, #64               @ if (temp1 >= 64)
482     blt         LOOP_HEIGHT_2_START

484     ldr         r5, [sp, #44]           @ i = top
485 LOOP_HEIGHT_64:
486     ldr         r6, [sp, #40]           @ j = left
487     stmfd       sp!, {r0-r3, r12}       @ backup parameters
488     mov         r0, r2
489     mov         r1, r3
490     mov         r2, r6
491     mov         r3, r5
492     bl          tile_4x2_read_asm       @ r0 = tiled offset for (j,i)
493     mov         r7, r0
494     ldmfd       sp!, {r0-r3, r12}       @ restore parameters
495     ldr         r9, [sp, #44]           @ linear_offset = top
496     add         r11, r6, #64            @ temp2 = ((j+64)>>6)<<6
497     bic         r11, r11, #0x3F
498     sub         r11, r11, r6            @ temp2 = temp2-j
499     sub         r9, r5, r9              @ linear_offset = temp1*(i-top)
500     mul         r9, r9, r10
501     and         r14, r6, #0x3           @ temp4 = j&0x3
502     add         r7, r7, r14             @ tiled_offset = tiled_offset+temp4
503     stmfd       sp!, {r9-r12}           @ backup parameters
504     mov         r10, r11                @ r10 = bytes to next 64 boundary
505     add         r11, r1, r7             @ r11 = src
506     bl          MEMCOPY_UNDER_64
507     ldmfd       sp!, {r9-r12}           @ restore parameters
508     add         r9, r9, r11             @ linear_offset = linear_offset+temp2
509     add         r6, r6, r11             @ j = j+temp2@

    @ Up to two full 64-byte chunks per row (cropped width < 256).
511     add         r14, r6, #64
512     cmp         r14, r12
513     bgt         LOOP_HEIGHT_64_1
514     stmfd       sp!, {r0-r3, r12}       @ backup parameters
515     mov         r0, r2
516     mov         r1, r3
517     mov         r2, r6
518     mov         r3, r5
519     bl          tile_4x2_read_asm       @ r0 = tiled offset for (j,i)
520     mov         r7, r0
521     ldmfd       sp!, {r0-r3, r12}       @ restore parameters
522     add         r7, r1, r7
523     vld1.8      {q0, q1}, [r7]!
524     vld1.8      {q2, q3}, [r7]
525     add         r7, r0, r9
526     vst1.8      {q0, q1}, [r7]!
527     vst1.8      {q2, q3}, [r7]
528     add         r9, r9, #64
529     add         r6, r6, #64

531 LOOP_HEIGHT_64_1:
532     add         r14, r6, #64
533     cmp         r14, r12
534     bgt         LOOP_HEIGHT_64_2
535     stmfd       sp!, {r0-r3, r12}       @ backup parameters
536     mov         r0, r2
537     mov         r1, r3
538     mov         r2, r6
539     mov         r3, r5
540     bl          tile_4x2_read_asm       @ r0 = tiled offset for (j,i)
541     mov         r7, r0
542     ldmfd       sp!, {r0-r3, r12}       @ restore parameters
543     add         r7, r1, r7
544     vld1.8      {q0, q1}, [r7]!
545     vld1.8      {q2, q3}, [r7]
546     add         r7, r0, r9
547     vst1.8      {q0, q1}, [r7]!
548     vst1.8      {q2, q3}, [r7]
549     add         r9, r9, #64
550     add         r6, r6, #64

552 LOOP_HEIGHT_64_2:
    @ Tail: fewer than 64 bytes left before the right crop.
553     cmp         r6, r12
554     bge         LOOP_HEIGHT_64_3
555     stmfd       sp!, {r0-r3, r12}       @ backup parameters
556     mov         r0, r2
557     mov         r1, r3
558     mov         r2, r6
559     mov         r3, r5
560     bl          tile_4x2_read_asm       @ r0 = tiled offset for (j,i)
561     mov         r7, r0
562     ldmfd       sp!, {r0-r3, r12}       @ restore parameters
563     sub         r11, r12, r6            @ remaining = (width-right)-j
564     stmfd       sp!, {r9-r12}           @ backup parameters
565     mov         r10, r11
566     add         r11, r1, r7
567     bl          MEMCOPY_UNDER_64
568     ldmfd       sp!, {r9-r12}           @ restore parameters

570 LOOP_HEIGHT_64_3:

572     ldr         r14, [sp, #52]          @ bottom
573     add         r5, r5, #1              @ i=i+1
574     sub         r14, r3, r14            @ i<yuv420_height-bottom
575     cmp         r5, r14
576     blt         LOOP_HEIGHT_64
577     b           RESTORE_REG

    @ ---- Fallback path: cropped width < 64. Copies one halfword (2 bytes)
    @ per iteration; crop sizes are documented as even, so this covers all.
579 LOOP_HEIGHT_2_START:

581     ldr         r5, [sp, #44]           @ i = top
582 LOOP_HEIGHT_2:

584     ldr         r6, [sp, #40]           @ j = left
585     ldr         r9, [sp, #44]           @ linear_offset = top
586     add         r11, r6, #64            @ temp2 = ((j+64)>>6)<<6
587     bic         r11, r11, #0x3F
588     sub         r11, r11, r6            @ temp2 = temp2-j
589     sub         r9, r5, r9              @ linear_offset = temp1*(i-top)
590     mul         r9, r10, r9
591     add         r9, r0, r9              @ linear_offset = linear_dst+linear_offset
592 LOOP_HEIGHT_2_WIDTH:
593     stmfd       sp!, {r0-r3, r12}       @ backup parameters
594     mov         r0, r2
595     mov         r1, r3
596     mov         r2, r6
597     mov         r3, r5
598     bl          tile_4x2_read_asm       @ r0 = tiled offset for (j,i)
599     mov         r7, r0
600     ldmfd       sp!, {r0-r3, r12}       @ restore parameters

602     and         r14, r6, #0x3           @ temp4 = j&0x3@
603     add         r7, r7, r14             @ tiled_offset = tiled_offset+temp4@
604     add         r7, r1, r7

606     ldrh        r14, [r7]               @ copy one halfword src -> dest
607     strh        r14, [r9], #2

609     ldr         r14, [sp, #48]          @ right
610     add         r6, r6, #2              @ j=j+2
611     sub         r14, r2, r14            @ j<yuv420_width-right
612     cmp         r6, r14
613     blt         LOOP_HEIGHT_2_WIDTH

615     ldr         r14, [sp, #52]          @ bottom
616     add         r5, r5, #1              @ i=i+1
617     sub         r14, r3, r14            @ i<yuv420_height-bottom
618     cmp         r5, r14
619     blt         LOOP_HEIGHT_2

621 RESTORE_REG:
    @ Loading r15 (pc) from the stack returns to the caller; execution
    @ never falls through into MEMCOPY_UNDER_64 below.
622     ldmfd       sp!, {r4-r12,r15}       @ restore registers

    @ Local subroutine: copy r10 (< 64) bytes from [r11] to
    @ yuv420_dest+linear_offset (r0 + r9), in 32- then 16-byte NEON
    @ chunks, then byte by byte.  On exit it re-executes
    @ "and r10, r6, #0x3F" / "cmp r10, #0", restoring the flags that
    @ callers using "blne MEMCOPY_UNDER_64" tested before the call, so
    @ their conditional ldm/bne instructions after the return still work.
    @ Clobbers r9, r10, r11, r12, q0-q1.
624 MEMCOPY_UNDER_64:                       @ count=r10, src=r11
625     cmp         r10, #32
626     add         r9, r0, r9              @ r9 = yuv420_dest+linear_offset
627     blt         MEMCOPY_UNDER_32
628     vld1.8      {q0, q1}, [r11]!        @ copy a 32-byte chunk
629     sub         r10, r10, #32
630     cmp         r10, #0
631     vst1.8      {q0, q1}, [r9]!         @ store the 32-byte chunk
632     beq         MEMCOPY_UNDER_END
633 MEMCOPY_UNDER_32:
634     cmp         r10, #16
635     blt         MEMCOPY_UNDER_16
636     vld1.8      {q0}, [r11]!            @ copy a 16-byte chunk
637     sub         r10, r10, #16
638     cmp         r10, #0
639     vst1.8      {q0}, [r9]!             @ store the 16-byte chunk
640     beq         MEMCOPY_UNDER_END
641 MEMCOPY_UNDER_16:
    @ Remaining 1..15 bytes, one at a time.
642     ldrb        r12, [r11], #1
643     strb        r12, [r9], #1
644     subs        r10, r10, #1
645     bne         MEMCOPY_UNDER_16

647 MEMCOPY_UNDER_END:
648     and         r10, r6, #0x3F          @ temp1 = left(==j)&0x3F
649     cmp         r10, #0                 @ re-establish caller's NE/EQ condition
650     mov         pc, lr
    @ tile_4x2_read_asm — compiler-generated helper (note the LFB0/LCFI0
    @ labels).  From its call sites in this file it is invoked as
    @   r0 = yuv420_width, r1 = yuv420_height, r2 = x (column j),
    @   r3 = y (row i)
    @ and returns in r0 what the caller uses as the byte offset of the
    @ tile containing (x, y) in the NV12 tiled ("4x2") layout.
    @ NOTE(review): the exact tile-address derivation below is inferred
    @ from the bit manipulation; confirm against the NV12T layout spec.
    @ Clobbers r1-r3, ip; preserves r4, r5 via the stack.
652 tile_4x2_read_asm:
653 LFB0:
654     add     ip, r3, #32
655     sub     r0, r0, #1
656     cmp     r1, ip                  @ height vs y+32, combined with next cmp
657     cmple   r3, r1                  @ selects the partial-bottom-band path
658     mov     ip, r2, asr #2
659     mov     r0, r0, asr #7          @ r0 = (width-1)>>7, tiles-per-row term
660     stmfd   sp!, {r4, r5, lr}
661 LCFI0:
662     add     r0, r0, #1
663     bge     L2
664     sub     r1, r1, #1
665     tst     r1, #32
666     bne     L2
667     tst     r3, #32
668     bne     L2
    @ Path for the last (partial) 64-row band of the image.
669     mov     r4, r2, asr #7
670     and     r1, r3, #31
671     eor     r4, r4, r3, asr #5
672     ubfx    r3, r3, #6, #8          @ extract bits [13:6] of y
673     tst     r4, #1
674     ubfx    r4, r2, #8, #6          @ extract bits [13:8] of x
675     and     ip, ip, #15
676     mov     r2, r2, asr #6
677     mla     r3, r0, r3, r4
678     orr     r1, ip, r1, asl #4
679     b       L9
680 L2:
    @ Common path: full interleaved tile bands.
681     mov     r2, ip, asr #5
682     and     r4, r3, #31
683     eor     r1, r2, r3, asr #5
684     and     r5, r2, #127
685     ubfx    r3, r3, #6, #8          @ extract bits [13:6] of y
686     tst     r1, #1
687     and     r1, ip, #15
688     mov     r2, ip, asr #4
689     mla     r3, r0, r3, r5
690     orr     r1, r1, r4, asl #4
691 L9:
692     andne   r2, r2, #1              @ both arms mask r2 to bit 0;
693     andeq   r2, r2, #1              @ only the orr below is conditional
694     orrne   r2, r2, #2
695     mov     r1, r1, asl #2
696     orr     r3, r1, r3, asl #13
697     orr     r0, r3, r2, asl #11     @ assemble final offset in r0
698     ldmfd   sp!, {r4, r5, pc}       @ restore and return
699 LFE0:
700     .fnend
701