3 * Copyright 2012 Samsung Electronics S.LSI Co. LTD
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at
9 * http://www.apache.org/licenses/LICENSE-2.0
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
19 * @file csc_tiled_to_linear_deinterleave_crop_neon.s
20 * @brief SEC_OMX specific define
21 * @author ShinWon Lee (shinwon.lee@samsung.com)
28 * Converts and Deinterleaves tiled data to linear
29 * Crops left, top, right, bottom
30 * 1. UV of NV12T to UV of YUV420P
32 * @param yuv420_u_dest
33 * U plane address of YUV420P[out]
35 * @param yuv420_v_dest
36 * V plane address of YUV420P[out]
39 * UV plane address of NV12T[in]
44 * @param yuv420_uv_height
45 * Height/2 of YUV420[in]
48 * Crop size of left. It should be even.
51 * Crop size of top. It should be even.
54 * Crop size of right. It should be even.
57 * Crop size of bottom. It should be even.
@-----------------------------------------------------------------------
@ csc_tiled_to_linear_deinterleave_crop_neon
@ Deinterleaves the interleaved UV plane of an NV12-tiled buffer into
@ separate linear U and V planes (YUV420P layout) while cropping.
@ Register/stack contract (AAPCS, per the loads below and the file header):
@   r0        = yuv420_u_dest   [out] U plane of YUV420P
@   r1        = yuv420_v_dest   [out] V plane of YUV420P
@   r2        = nv12t_uv_src    [in]  UV plane of NV12T
@   r3        = yuv420_width
@   [sp,#40]  = yuv420_height   (offsets valid after the 10-register push)
@   [sp,#44]  = left,  [sp,#48] = top,
@   [sp,#52]  = right, [sp,#56] = bottom   (all crop sizes, even)
@ Three paths chosen by cropped width (yuv420_width-right-left):
@   >= 256 bytes/row -> 256-wide tile loop; >= 64 -> 64-wide loop;
@   otherwise a 2-bytes-at-a-time scalar fallback.
@ Tiles are 2048-byte macroblocks (64x32); tiled_offset/tiled_offset1 are
@ byte offsets of the current and the horizontally adjacent tile pair.
@ NOTE(review): this chunk is a sampled excerpt — instructions between
@ non-consecutive embedded line numbers are not visible here; the fused
@ leading numbers on each line are extraction artifacts, left untouched.
62 .global csc_tiled_to_linear_deinterleave_crop_neon
63 .type csc_tiled_to_linear_deinterleave_crop_neon, %function
64 csc_tiled_to_linear_deinterleave_crop_neon:
82 stmfd sp!, {r4-r12,r14} @ save callee-saved regs + lr (40 bytes pushed)
84 ldr r4, [sp, #40] @ r4 = yuv420_height (5th argument)
86 ldr r12, [sp, #52] @ r12 = right
87 ldr r10, [sp, #44] @ r10 = left
88 sub r12, r3, r12 @ temp3 = yuv420_width-right@
89 sub r10, r12, r10 @ temp1 = temp3-left@  (cropped row width)
90 cmp r10, #256 @ if (temp1 >= 256) use the wide path
91 blt LOOP_HEIGHT_64_START
@ --- wide path: compute tiled offset of the tile containing (left, top) ---
93 ldr r5, [sp, #48] @ top
95 ldr r6, [sp, #44] @ j = left
96 mov r14, r5, asr #5 @ temp4 = i>>5  (tile row index, 32-line tiles)
97 bic r12, r6, #0xFF @ temp3 = (j>>8)<<8
98 mov r12, r12, asr #6 @ temp3 = temp3>>6
99 and r11, r14, #0x1 @ if (temp4 & 0x1)  — odd/even tile row: Z-tiling swizzle
101 bne LOOP_HEIGHT_256_GET_TILED_EVEN
102 LOOP_HEIGHT_256_GET_TILED_ODD:
103 sub r7, r14, #1 @ tiled_offset = temp4-1
104 add r10, r3, #127 @ temp1 = ((yuv420_width+127)>>7)<<7
106 mov r10, r10, asr #6 @ tiled_offset = tiled_offset*(temp1>>6)
108 add r7, r7, r12 @ tiled_offset = tiled_offset+temp3
109 add r7, r7, #2 @ tiled_offset = tiled_offset+2
110 bic r10, r12, #0x3 @ temp1 = (temp3>>2)<<2
111 add r7, r7, r10 @ tiled_offset = tiled_offset+temp1
112 mov r7, r7, lsl #11 @ tiled_offset = tiled_offset<<11 (tile index -> bytes, 2048 B/tile)
113 add r8, r7, #4096 @ tiled_offset1 = tiled_offset+2048*2
115 b LOOP_HEIGHT_256_GET_TILED_END
117 LOOP_HEIGHT_256_GET_TILED_EVEN:
118 add r11, r4, #31 @ temp2 = ((yuv420_height+31)>>5)<<5
120 add r10, r5, #32 @ if ((i+32)<temp2)  — not the last tile row?
122 bge LOOP_HEIGHT_256_GET_TILED_EVEN1
123 add r10, r12, #2 @ temp1 = temp3+2
124 bic r10, r10, #0x3 @ temp1 = (temp1>>2)<<2
125 add r7, r12, r10 @ tiled_offset = temp3+temp1@
126 add r10, r3, #127 @ temp1 = ((yuv420_width+127)>>7)<<7
128 mov r10, r10, asr #6 @ tiled_offset = tiled_offset+temp4*(temp1>>6)
130 mov r7, r7, lsl #11 @ tiled_offset = tiled_offset<<11
131 add r8, r7, #12288 @ tiled_offset1 = tiled_offset+2048*6
133 b LOOP_HEIGHT_256_GET_TILED_END
135 LOOP_HEIGHT_256_GET_TILED_EVEN1:
@ last tile row: linear (non-swizzled) tile order
136 add r10, r3, #127 @ temp1 = ((yuv420_width+127)>>7)<<7
138 mov r10, r10, asr #6 @ tiled_offset = temp4*(temp1>>6)
140 add r7, r7, r12 @ tiled_offset = tiled_offset+temp3
141 mov r7, r7, lsl #11 @ tiled_offset = tiled_offset<<11
142 add r8, r7, #4096 @ tiled_offset1 = tiled_offset+2048*2
145 LOOP_HEIGHT_256_GET_TILED_END:
@ --- per-row setup: line offset inside tile, linear destination offset ---
147 ldr r12, [sp, #52] @ right
148 ldr r9, [sp, #48] @ top
149 and r10, r5, #0x1F @ temp1 = i&0x1F  (line within 32-line tile)
150 add r7, r7, r10, lsl #6 @ tiled_offset = tiled_offset+64*(temp1)
151 add r8, r8, r10, lsl #6 @ tiled_offset1 = tiled_offset1+64*(temp1)
152 sub r11, r3, r6 @ temp2 = yuv420_width-left(==j)-right
154 sub r9, r5, r9 @ linear_offset = temp2*(i-top)/2@
@ --- left edge: copy the partial first 256-byte span (temp3 bytes) ---
157 add r12, r6, #256 @ temp3 = ((j+256)>>8)<<8@
159 sub r12, r12, r6 @ temp3 = temp3-j@  (bytes to the next 256 boundary)
160 and r10, r6, #0x3F @ temp1 = left(==j)&0x3F
162 cmp r12, #192 @ if (temp3 > 192)
163 ble LOOP_HEIGHT_256_LEFT_192
164 add r11, r2, r7 @ r11 = nv12t_src+tiled_offset+temp1
167 add r12, r2, r7 @ r12 = nv12t_src+tiled_offset+2048
@ flags from a (not-visible) test select sub-64-byte tail handling
173 stmnefd sp!, {r8-r12, r14} @ backup registers
175 blne INTERLEAVED_MEMCOPY_UNDER_64
176 ldmnefd sp!, {r8-r12, r14} @ restore registers
177 bne LOOP_HEIGHT_256_LEFT_256_64
178 vld2.8 {q0, q1}, [r11]! @ load+deinterleave 64 B: U->q0/q2, V->q1/q3
179 vld2.8 {q2, q3}, [r11]
180 add r11, r0, r9 @ r11 = yuv420_u_dest+linear_offset
183 add r11, r1, r9 @ r11 = yuv420_v_dest+linear_offset
186 LOOP_HEIGHT_256_LEFT_256_64:
187 add r11, r2, r8 @ r11 = nv12t_src+tiled_offset1
189 vld2.8 {q4, q5}, [r12]! @ load+deinterleave {nv12t_src+tiled_offset+2048, 64}
191 vld2.8 {q6, q7}, [r12]
192 add r12, r11, #2048 @ r12 = nv12t_src+tiled_offset1+2048
194 vld2.8 {q8, q9}, [r11]! @ load+deinterleave {nv12t_src+tiled_offset1, 64}
196 vld2.8 {q10, q11}, [r11]
197 vld2.8 {q12, q13}, [r12]! @ load+deinterleave {nv12t_src+tiled_offset1+2048, 64}
198 vld2.8 {q14, q15}, [r12]
200 add r11, r0, r9 @ r11 = yuv420_u_dest+linear_offset+32-temp1/2
202 sub r11, r11, r10, asr #1
210 add r11, r1, r9 @ r11 = yuv420_v_dest+linear_offset+32-temp1/2
212 sub r11, r11, r10, asr #1
221 sub r9, r9, r10, asr #1 @ linear_offset -= temp1/2 (half: U,V planes each get 1 of 2 bytes)
222 b LOOP_HEIGHT_256_LEFT_END
224 LOOP_HEIGHT_256_LEFT_192:
225 cmp r12, #128 @ if (temp3 > 128)
226 ble LOOP_HEIGHT_256_LEFT_128
227 add r11, r2, r7 @ r11 = nv12t_src+tiled_offset+2048+temp1
231 add r12, r2, r8 @ r12 = nv12t_src+tiled_offset1
235 stmnefd sp!, {r8-r12, r14} @ backup registers
238 blne INTERLEAVED_MEMCOPY_UNDER_64
239 ldmnefd sp!, {r8-r12, r14} @ restore registers
240 bne LOOP_HEIGHT_256_LEFT_192_64
241 vld2.8 {q0, q1}, [r11]! @ load+deinterleave {nv12t_src+tiled_offset+2048+temp1, 64}
242 vld2.8 {q2, q3}, [r11]
243 add r11, r0, r9 @ r11 = yuv420_u_dest+linear_offset
246 add r11, r1, r9 @ r11 = yuv420_v_dest+linear_offset
249 LOOP_HEIGHT_256_LEFT_192_64:
250 add r11, r2, r8 @ r11 = nv12t_src+tiled_offset1+2048
253 vld2.8 {q4, q5}, [r12]! @ load+deinterleave {nv12t_src+tiled_offset1, 64}
255 vld2.8 {q6, q7}, [r12]
256 vld2.8 {q8, q9}, [r11]! @ load+deinterleave {nv12t_src+tiled_offset1+2048, 64}
257 vld2.8 {q10, q11}, [r11]
259 add r11, r0, r9 @ r11 = yuv420_u_dest+linear_offset+32-temp1/2
261 sub r11, r11, r10, asr #1
267 add r11, r1, r9 @ r11 = yuv420_v_dest+linear_offset+32-temp1/2
269 sub r11, r11, r10, asr #1
276 sub r9, r9, r10, asr #1
277 b LOOP_HEIGHT_256_LEFT_END
279 LOOP_HEIGHT_256_LEFT_128:
280 cmp r12, #64 @ if (temp3 > 64)
281 ble LOOP_HEIGHT_256_LEFT_64
282 add r11, r2, r8 @ r11 = nv12t_src+tiled_offset1+temp1
285 add r12, r2, r8 @ r12 = nv12t_src+tiled_offset1
290 stmnefd sp!, {r8-r12, r14} @ backup registers
293 blne INTERLEAVED_MEMCOPY_UNDER_64
294 ldmnefd sp!, {r8-r12, r14} @ restore registers
295 bne LOOP_HEIGHT_256_LEFT_128_64
296 vld2.8 {q0, q1}, [r11]! @ load+deinterleave {nv12t_src+tiled_offset1+temp1, 64}
297 vld2.8 {q2, q3}, [r11]
298 add r11, r0, r9 @ r11 = yuv420_u_dest+linear_offset
301 add r11, r1, r9 @ r11 = yuv420_v_dest+linear_offset
304 LOOP_HEIGHT_256_LEFT_128_64:
305 vld2.8 {q4, q5}, [r12]! @ load+deinterleave {nv12t_src+tiled_offset1, 64}
306 vld2.8 {q6, q7}, [r12]
308 add r11, r0, r9 @ r11 = yuv420_u_dest+linear_offset+32-temp1/2
310 sub r11, r11, r10, asr #1
314 add r11, r1, r9 @ r11 = yuv420_v_dest+linear_offset+32-temp1/2
316 sub r11, r11, r10, asr #1
321 sub r9, r9, r10, asr #1
322 b LOOP_HEIGHT_256_LEFT_END
324 LOOP_HEIGHT_256_LEFT_64:
325 add r11, r2, r8 @ r11 = nv12t_src+tiled_offset1+2048+temp1
330 stmnefd sp!, {r8-r12, r14} @ backup registers
333 blne INTERLEAVED_MEMCOPY_UNDER_64
334 ldmnefd sp!, {r8-r12, r14} @ restore registers
335 bne LOOP_HEIGHT_256_LEFT_64_64
336 vld2.8 {q0, q1}, [r11]! @ load+deinterleave {nv12t_src+tiled_offset1+temp1, 64}
337 vld2.8 {q2, q3}, [r11]
338 add r11, r0, r9 @ r11 = yuv420_dest+linear_offset
339 vst1.8 {q0, q1}, [r11]! @ store {yuv420_dest+linear_offset, 64}
340 vst1.8 {q2, q3}, [r11]!
341 LOOP_HEIGHT_256_LEFT_64_64:
343 sub r9, r9, r10, asr #1
345 LOOP_HEIGHT_256_LEFT_END:
@ --- middle: full 256-byte (4-tile) spans per iteration ---
347 ldr r12, [sp, #52] @ right
348 add r7, r7, r14, lsl #11 @ tiled_offset = tiled_offset+temp4*2048
349 add r10, r2, r7 @ r10 = nv12t_src+tiled_offset
351 bic r6, r6, #0xFF @ j = (left>>8)<<8
353 add r6, r6, #256 @ j = j + 256
354 sub r11, r3, r12 @ temp2 = yuv420_width-right-256
357 bgt LOOP_HEIGHT_256_WIDTH_END
359 LOOP_HEIGHT_256_WIDTH:
360 add r12, r10, #2048 @ r12 = nv12t_src+tiled_offset+2048
362 vld2.8 {q0, q1}, [r10]! @ load+deinterleave {nv12t_src+tiled_offset, 64}
364 vld2.8 {q2, q3}, [r10]
366 add r8, r8, r14, lsl #11 @ tiled_offset1 = tiled_offset1+temp4*2048
367 add r10, r2, r8 @ r10 = nv12t_src+tiled_offset1
369 vld2.8 {q4, q5}, [r12]! @ load+deinterleave {nv12t_src+tiled_offset+2048, 64}
371 vld2.8 {q6, q7}, [r12]
373 add r12, r10, #2048 @ r12 = nv12t_src+tiled_offset1+2048
375 vld2.8 {q8, q9}, [r10]! @ load+deinterleave {nv12t_src+tiled_offset1, 64}
377 vld2.8 {q10, q11}, [r10]
379 add r7, r7, r14, lsl #11 @ tiled_offset = tiled_offset+temp4*2048
382 vld2.8 {q12, q13}, [r12]! @ load+deinterleave {nv12t_src+tiled_offset1+2048, 64}
384 vld2.8 {q14, q15}, [r12]
386 add r12, r0, r9 @ r12 = yuv420_u_dest+linear_offset
395 add r12, r1, r9 @ r12 = yuv420_v_dest+linear_offset
404 add r9, r9, #128 @ linear_offset = linear_offset+128 (256 src bytes -> 128 per plane)
406 add r12, r10, #2048 @ r12 = nv12t_src+tiled_offset+2048
408 add r6, r6, #256 @ j=j+256
409 cmp r6, r11 @ j<=temp2
410 ble LOOP_HEIGHT_256_WIDTH
412 LOOP_HEIGHT_256_WIDTH_END:
@ --- right edge: remaining 0..255 bytes of the row ---
414 add r8, r8, r14, lsl #11 @ tiled_offset1 = tiled_offset1+temp4*2048
415 ldr r14, [sp, #52] @ right
416 sub r11, r3, r6 @ temp2 = yuv420_width-right-j
419 beq LOOP_HEIGHT_256_RIGHT_END
421 ble LOOP_HEIGHT_256_RIGHT_192
424 vld2.8 {q0, q1}, [r10]! @ load+deinterleave {nv12t_src+tiled_offset}
426 vld2.8 {q2, q3}, [r10]
428 add r10, r2, r8 @ r10 = nv12t_src+tiled_offset1
430 vld2.8 {q4, q5}, [r12]! @ load+deinterleave {nv12t_src+tiled_offset+2048}
432 vld2.8 {q6, q7}, [r12]
434 add r14, r10, #2048 @ r14 = nv12t_src+tiled_offset1+2048
436 vld2.8 {q8, q9}, [r10]! @ load+deinterleave {nv12t_src+tiled_offset1}
438 vld2.8 {q10, q11}, [r10]
440 add r12, r0, r9 @ r12 = yuv420_u_dest+linear_offset
447 add r12, r1, r9 @ r12 = yuv420_v_dest+linear_offset
454 add r9, r9, #96 @ linear_offset = linear_offset+96
456 stmfd sp!, {r8-r12, r14} @ backup registers
459 bl INTERLEAVED_MEMCOPY_UNDER_64 @ copy the final sub-64-byte tail
460 ldmfd sp!, {r8-r12, r14} @ restore registers
461 b LOOP_HEIGHT_256_RIGHT_END
463 LOOP_HEIGHT_256_RIGHT_192:
465 ble LOOP_HEIGHT_256_RIGHT_128
468 vld2.8 {q0, q1}, [r10]! @ load+deinterleave {nv12t_src+tiled_offset}
470 vld2.8 {q2, q3}, [r10]
472 add r14, r2, r8 @ r14 = nv12t_src+tiled_offset1
474 vld2.8 {q4, q5}, [r12]! @ load+deinterleave {nv12t_src+tiled_offset+2048}
476 vld2.8 {q6, q7}, [r12]
478 add r12, r0, r9 @ r12 = yuv420_u_dest+linear_offset
483 add r12, r1, r9 @ r12 = yuv420_v_dest+linear_offset
488 add r9, r9, #64 @ linear_offset = linear_offset+64
490 stmfd sp!, {r8-r12, r14} @ backup registers
493 bl INTERLEAVED_MEMCOPY_UNDER_64
494 ldmfd sp!, {r8-r12, r14} @ restore registers
495 b LOOP_HEIGHT_256_RIGHT_END
497 LOOP_HEIGHT_256_RIGHT_128:
499 ble LOOP_HEIGHT_256_RIGHT_64
502 vld2.8 {q0, q1}, [r10]! @ load+deinterleave {nv12t_src+tiled_offset}
504 vld2.8 {q2, q3}, [r10]
506 add r12, r0, r9 @ r12 = yuv420_u_dest+linear_offset
509 add r12, r1, r9 @ r12 = yuv420_v_dest+linear_offset
512 add r9, r9, #32 @ linear_offset = linear_offset+32
514 stmfd sp!, {r8-r12, r14} @ backup registers
517 bl INTERLEAVED_MEMCOPY_UNDER_64
518 ldmfd sp!, {r8-r12, r14} @ restore registers
519 b LOOP_HEIGHT_256_RIGHT_END
521 LOOP_HEIGHT_256_RIGHT_64:
522 stmfd sp!, {r8-r12, r14} @ backup registers
526 bl INTERLEAVED_MEMCOPY_UNDER_64
527 ldmfd sp!, {r8-r12, r14} @ restore registers
529 LOOP_HEIGHT_256_RIGHT_END:
531 ldr r14, [sp, #56] @ bottom
532 add r5, r5, #1 @ i=i+1
533 sub r14, r4, r14 @ loop while i<yuv420_height-bottom
538 LOOP_HEIGHT_64_START:
@ --- medium path: 64 <= cropped width < 256 ---
539 cmp r10, #64 @ if (temp1 >= 64)
540 blt LOOP_HEIGHT_2_START
542 ldr r5, [sp, #48] @ i = top
544 ldr r6, [sp, #44] @ j = left
545 stmfd sp!, {r0-r3, r12} @ backup parameters (around not-visible helper call)
552 ldmfd sp!, {r0-r3, r12} @ restore parameters
553 ldr r9, [sp, #48] @ linear_offset = top
554 ldr r12, [sp, #52] @ r12 = right
555 add r11, r6, #64 @ temp2 = ((j+64)>>6)<<6
557 sub r11, r11, r6 @ temp2 = temp2-j
558 sub r12, r3, r12 @ temp3 = yuv420_width-right
559 sub r14, r12, r6 @ temp4 = temp3-left
560 sub r9, r5, r9 @ linear_offset = temp4*(i-top)/2
563 and r14, r6, #0x3 @ temp4 = j&0x3
564 add r7, r7, r14 @ tiled_offset = tiled_offset+temp4
565 stmfd sp!, {r9-r12} @ backup parameters
568 bl INTERLEAVED_MEMCOPY_UNDER_64
569 ldmfd sp!, {r9-r12} @ restore parameters
570 add r9, r9, r11, asr #1 @ linear_offset = linear_offset+temp2/2
571 add r6, r6, r11 @ j = j+temp2@
576 stmfd sp!, {r0-r3, r12} @ backup parameters
583 ldmfd sp!, {r0-r3, r12} @ restore parameters
585 vld2.8 {q0, q1}, [r7]! @ load+deinterleave a 64-byte tile line
586 vld2.8 {q2, q3}, [r7]
600 stmfd sp!, {r0-r3, r12} @ backup parameters
607 ldmfd sp!, {r0-r3, r12} @ restore parameters
609 vld2.8 {q0, q1}, [r7]!
610 vld2.8 {q2, q3}, [r7]
623 stmfd sp!, {r0-r3, r12} @ backup parameters
630 ldmfd sp!, {r0-r3, r12} @ restore parameters
632 stmfd sp!, {r9-r12} @ backup parameters
635 bl INTERLEAVED_MEMCOPY_UNDER_64
636 ldmfd sp!, {r9-r12} @ restore parameters
640 ldr r14, [sp, #56] @ bottom
641 add r5, r5, #1 @ i=i+1
642 sub r14, r4, r14 @ loop while i<yuv420_height-bottom
@ --- narrow path: cropped width < 64, 2 bytes (one U+V pair) at a time ---
649 ldr r5, [sp, #48] @ i = top
652 ldr r12, [sp, #52] @ r12 = right
653 ldr r6, [sp, #44] @ j = left
654 ldr r9, [sp, #48] @ r9 = top
656 sub r12, r3, r12 @ temp3 = yuv420_width-right
657 sub r14, r12, r6 @ temp4 = temp3-left@
658 sub r9, r5, r9 @ r9 = i-top
659 mul r9, r14, r9 @ temp4*(i-top)
660 mov r9, r9, lsr #1 @ linear_offset = temp4*(i-top)/2
664 stmfd sp!, {r0-r3, r12} @ backup parameters
671 ldmfd sp!, {r0-r3, r12} @ restore parameters
673 and r14, r6, #0x3 @ temp4 = j&0x3@
674 add r7, r7, r14 @ tiled_offset = tiled_offset+temp4@
682 ldr r14, [sp, #52] @ right
683 add r6, r6, #2 @ j=j+2
684 sub r14, r3, r14 @ loop while j<yuv420_width-right
686 blt LOOP_HEIGHT_2_WIDTH
688 ldr r14, [sp, #56] @ bottom
689 add r5, r5, #1 @ i=i+1
690 sub r14, r4, r14 @ loop while i<yuv420_height-bottom
695 ldmfd sp!, {r4-r12,r15} @ restore callee-saved regs and return (pc <- saved lr)
@-----------------------------------------------------------------------
@ INTERLEAVED_MEMCOPY_UNDER_64
@ Internal helper: deinterleave-copies fewer than 64 source bytes (an
@ interleaved UV run) into the U and V planes.
@ In:  r10 = byte count, r11 = src (interleaved UV),
@      r0 = u_dest base, r1 = v_dest base, r9 = linear_offset
@ Falls through 32/16-byte NEON steps, then a scalar loop for the rest.
@ NOTE(review): sampled excerpt — the count comparisons before each blt,
@ the scalar loop body, and the return are not visible in this chunk.
697 INTERLEAVED_MEMCOPY_UNDER_64: @ count=r10, src=r11
699 blt INTERLEAVED_MEMCOPY_UNDER_32
700 vld2.8 {q0, q1}, [r11]! @ load 32 B, deinterleave: U bytes -> q0, V bytes -> q1
703 add r12, r0, r9 @ r12 = yuv420_u_dest+linear_offset
704 vst1.8 {q0}, [r12] @ store 16 U bytes
705 add r12, r1, r9 @ r12 = yuv420_v_dest+linear_offset
706 vst1.8 {q1}, [r12] @ store 16 V bytes
708 beq INTERLEAVED_MEMCOPY_UNDER_END
709 INTERLEAVED_MEMCOPY_UNDER_32:
711 blt INTERLEAVED_MEMCOPY_UNDER_16
712 vld2.8 {q0}, [r11]! @ load 16 B, deinterleave: U -> d0, V -> d1
715 add r12, r0, r9 @ r12 = yuv420_u_dest+linear_offset
716 vst1.8 {d0}, [r12]! @ store 8 U bytes
717 add r12, r1, r9 @ r12 = yuv420_v_dest+linear_offset
718 vst1.8 {d1}, [r12]! @ store 8 V bytes
720 beq INTERLEAVED_MEMCOPY_UNDER_END
721 INTERLEAVED_MEMCOPY_UNDER_16:
@ scalar byte-pair loop for the remaining <16 bytes
723 add r8, r0, r9 @ r8 = yuv420_u_dest+linear_offset
725 add r8, r1, r9 @ r8 = yuv420_v_dest+linear_offset
730 bne INTERLEAVED_MEMCOPY_UNDER_16
732 INTERLEAVED_MEMCOPY_UNDER_END:
733 and r10, r6, #0x3F @ temp1 = left(==j)&0x3F
@ NOTE(review): fragment of a separate subroutine whose label line is not
@ visible in this chunk (sampling gap before line 745) — the shift/xor/or
@ bit mixing below presumably packs x/y coordinates into a tiled (swizzled)
@ byte address returned in r0; confirm against the full file.
745 stmfd sp!, {r4, r5, lr} @ save callee-saved r4, r5 and return address
756 eor r4, r4, r3, asr #5
763 orr r1, ip, r1, asl #4
768 eor r1, r2, r3, asr #5
775 orr r1, r1, r4, asl #4
781 orr r3, r1, r3, asl #13
782 orr r0, r3, r2, asl #11 @ r0 = composed result (return value)
783 ldmfd sp!, {r4, r5, pc} @ restore and return (pc <- saved lr)