3 * Copyright 2012 Samsung Electronics S.LSI Co. LTD
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at
9 * http://www.apache.org/licenses/LICENSE-2.0
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
19 * @file csc_tiled_to_linear_crop_neon.s
20 * @brief SEC_OMX specific define
21 * @author ShinWon Lee (shinwon.lee@samsung.com)
28 * Converts tiled data to linear
29 * Crops left, top, right, bottom
30 * 1. Y of NV12T to Y of YUV420P
31 * 2. Y of NV12T to Y of YUV420S
32 * 3. UV of NV12T to UV of YUV420S
35 * Y or UV plane address of YUV420[out]
38 * Y or UV plane address of NV12T[in]
43 * @param yuv420_height
44 * Y: Height of YUV420, UV: Height/2 of YUV420[in]
47 * Crop size of left. It should be even.
50 * Crop size of top. It should be even.
53 * Crop size of right. It should be even.
56 * Crop size of bottom. It should be even.
61 .global csc_tiled_to_linear_crop_neon
62 .type csc_tiled_to_linear_crop_neon, %function
63 csc_tiled_to_linear_crop_neon:
81 stmfd sp!, {r4-r12,r14} @ backup registers
83 ldr r12, [sp, #48] @ r12 = right
84 ldr r10, [sp, #40] @ r10 = left
85 sub r12, r2, r12 @ temp3 = yuv420_width-right@
86 sub r10, r12, r10 @ temp1 = temp3-left@
87 cmp r10, #256 @ if (temp1 >= 256)
88 blt LOOP_HEIGHT_64_START
90 ldr r5, [sp, #44] @ i = top
92 ldr r6, [sp, #40] @ j = left
93 mov r14, r5, asr #5 @ temp4 = i>>5
94 bic r12, r6, #0xFF @ temp3 = (j>>8)<<8
95 mov r12, r12, asr #6 @ temp3 = temp3>>6
96 and r11, r14, #0x1 @ if (temp4 & 0x1)
98 bne LOOP_HEIGHT_256_GET_TILED_EVEN
99 LOOP_HEIGHT_256_GET_TILED_ODD:
100 sub r7, r14, #1 @ tiled_offset = temp4-1
101 add r10, r2, #127 @ temp1 = ((yuv420_width+127)>>7)<<7
103 mov r10, r10, asr #6 @ tiled_offset = tiled_offset*(temp1>>6)
105 add r7, r7, r12 @ tiled_offset = tiled_offset+temp3
106 add r7, r7, #2 @ tiled_offset = tiled_offset+2
107 bic r10, r12, #0x3 @ temp1 = (temp3>>2)<<2
108 add r7, r7, r10 @ tiled_offset = tiled_offset+temp1
109 mov r7, r7, lsl #11 @ tiled_offset = tiled_offset<<11
110 add r8, r7, #4096 @ tiled_offset1 = tiled_offset+2048*2
112 b LOOP_HEIGHT_256_GET_TILED_END
114 LOOP_HEIGHT_256_GET_TILED_EVEN:
115 add r11, r3, #31 @ temp2 = ((yuv420_height+31)>>5)<<5
117 add r10, r5, #32 @ if ((i+32)<temp2)
119 bge LOOP_HEIGHT_256_GET_TILED_EVEN1
120 add r10, r12, #2 @ temp1 = temp3+2
121 bic r10, r10, #0x3 @ temp1 = (temp1>>2)<<2
122 add r7, r12, r10 @ tiled_offset = temp3+temp1@
123 add r10, r2, #127 @ temp1 = ((yuv420_width+127)>>7)<<7
125 mov r10, r10, asr #6 @ tiled_offset = tiled_offset+temp4*(temp1>>6)
127 mov r7, r7, lsl #11 @ tiled_offset = tiled_offset<<11
128 add r8, r7, #12288 @ tiled_offset1 = tiled_offset+2048*6
130 b LOOP_HEIGHT_256_GET_TILED_END
132 LOOP_HEIGHT_256_GET_TILED_EVEN1:
133 add r10, r2, #127 @ temp1 = ((yuv420_width+127)>>7)<<7
135 mov r10, r10, asr #6 @ tiled_offset = temp4*(temp1>>6)
137 add r7, r7, r12 @ tiled_offset = tiled_offset+temp3
138 mov r7, r7, lsl #11 @ tiled_offset = tiled_offset<<11
139 add r8, r7, #4096 @ tiled_offset1 = tiled_offset+2048*2
142 LOOP_HEIGHT_256_GET_TILED_END:
144 ldr r12, [sp, #48] @ right
145 ldr r9, [sp, #44] @ top
146 and r10, r5, #0x1F @ temp1 = i&0x1F
147 add r7, r7, r10, lsl #6 @ tiled_offset = tiled_offset+64*(temp1)
148 add r8, r8, r10, lsl #6 @ tiled_offset1 = tiled_offset1+64*(temp1)
149 sub r11, r2, r6 @ temp2 = yuv420_width-left(==j)-right
151 sub r9, r5, r9 @ linear_offset = temp2*(i-top)@
153 add r12, r6, #256 @ temp3 = ((j+256)>>8)<<8@
155 sub r12, r12, r6 @ temp3 = temp3-j@
156 and r10, r6, #0x3F @ temp1 = left(==j)&0x3F
158 cmp r12, #192 @ if (temp3 > 192)
159 ble LOOP_HEIGHT_256_LEFT_192
160 add r11, r1, r7 @ r11 = nv12t_src+tiled_offset+temp1
163 add r12, r1, r7 @ r12 = nv12t_src+tiled_offset+2048
169 stmnefd sp!, {r9-r12, r14} @ backup registers
171 blne MEMCOPY_UNDER_64
172 ldmnefd sp!, {r9-r12, r14} @ restore registers
173 bne LOOP_HEIGHT_256_LEFT_256_64
174 vld1.8 {q0, q1}, [r11]! @ load {nv12t_src+tiled_offset+temp1, 64}
175 vld1.8 {q2, q3}, [r11]
176 add r11, r0, r9 @ r11 = yuv420_dest+linear_offset
177 vst1.8 {q0, q1}, [r11]! @ store {yuv420_dest+linear_offset, 64}
178 vst1.8 {q2, q3}, [r11]!
179 LOOP_HEIGHT_256_LEFT_256_64:
180 add r11, r1, r8 @ r11 = nv12t_src+tiled_offset1
182 vld1.8 {q4, q5}, [r12]! @ load {nv12t_src+tiled_offset+2048, 64}
184 vld1.8 {q6, q7}, [r12]
185 add r12, r11, #2048 @ r12 = nv12t_src+tiled_offset1+2048
187 vld1.8 {q8, q9}, [r11]! @ load {nv12t_src+tiled_offset1, 64}
189 vld1.8 {q10, q11}, [r11]
190 vld1.8 {q12, q13}, [r12]! @ load {nv12t_src+tiled_offset1+2048, 64}
191 vld1.8 {q14, q15}, [r12]
193 sub r11, r0, r10 @ r11 = yuv420_dest+linear_offset+64-temp1
197 vst1.8 {q4, q5}, [r11]! @ store {yuv420_dest+linear_offset+64-temp1, 64}
198 vst1.8 {q6, q7}, [r11]!
199 vst1.8 {q8, q9}, [r11]! @ store {yuv420_dest+linear_offset+128-temp1, 64}
200 vst1.8 {q10, q11}, [r11]!
201 vst1.8 {q12, q13}, [r11]! @ store {yuv420_dest+linear_offset+192-temp1, 64}
202 vst1.8 {q14, q15}, [r11]!
206 b LOOP_HEIGHT_256_LEFT_END
208 LOOP_HEIGHT_256_LEFT_192:
209 cmp r12, #128 @ if (temp3 > 128)
210 ble LOOP_HEIGHT_256_LEFT_128
211 add r11, r1, r7 @ r11 = nv12t_src+tiled_offset+2048+temp1
215 add r12, r1, r8 @ r12 = nv12t_src+tiled_offset1
219 stmnefd sp!, {r9-r12, r14} @ backup registers
222 blne MEMCOPY_UNDER_64
223 ldmnefd sp!, {r9-r12, r14} @ restore registers
224 bne LOOP_HEIGHT_256_LEFT_192_64
225 vld1.8 {q0, q1}, [r11]! @ load {nv12t_src+tiled_offset+2048+temp1, 64}
226 vld1.8 {q2, q3}, [r11]
227 add r11, r0, r9 @ r11 = yuv420_dest+linear_offset
228 vst1.8 {q0, q1}, [r11]! @ store {yuv420_dest+linear_offset, 64}
229 vst1.8 {q2, q3}, [r11]!
230 LOOP_HEIGHT_256_LEFT_192_64:
231 add r11, r1, r8 @ r11 = nv12t_src+tiled_offset1+2048
234 vld1.8 {q4, q5}, [r12]! @ load {nv12t_src+tiled_offset1, 64}
236 vld1.8 {q6, q7}, [r12]
237 vld1.8 {q8, q9}, [r11]! @ load {nv12t_src+tiled_offset1+2048, 64}
238 vld1.8 {q10, q11}, [r11]
240 sub r11, r0, r10 @ r11 = yuv420_dest+linear_offset+64-temp1
244 vst1.8 {q4, q5}, [r11]! @ store {yuv420_dest+linear_offset+64-temp1, 64}
245 vst1.8 {q6, q7}, [r11]!
246 vst1.8 {q8, q9}, [r11]! @ store {yuv420_dest+linear_offset+128-temp1, 64}
247 vst1.8 {q10, q11}, [r11]!
251 b LOOP_HEIGHT_256_LEFT_END
253 LOOP_HEIGHT_256_LEFT_128:
254 cmp r12, #64 @ if (temp3 > 64)
255 ble LOOP_HEIGHT_256_LEFT_64
256 add r11, r1, r8 @ r11 = nv12t_src+tiled_offset1+temp1
259 add r12, r1, r8 @ r12 = nv12t_src+tiled_offset1
264 stmnefd sp!, {r9-r12, r14} @ backup registers
267 blne MEMCOPY_UNDER_64
268 ldmnefd sp!, {r9-r12, r14} @ restore registers
269 bne LOOP_HEIGHT_256_LEFT_128_64
270 vld1.8 {q0, q1}, [r11]! @ load {nv12t_src+tiled_offset1+temp1, 64}
271 vld1.8 {q2, q3}, [r11]
272 add r11, r0, r9 @ r11 = yuv420_dest+linear_offset
273 vst1.8 {q0, q1}, [r11]! @ store {yuv420_dest+linear_offset, 64}
274 vst1.8 {q2, q3}, [r11]!
275 LOOP_HEIGHT_256_LEFT_128_64:
276 vld1.8 {q4, q5}, [r12]! @ load {nv12t_src+tiled_offset1, 64}
277 vld1.8 {q6, q7}, [r12]
279 sub r11, r0, r10 @ r11 = yuv420_dest+linear_offset+64-temp1
283 vst1.8 {q4, q5}, [r11]! @ store {yuv420_dest+linear_offset+64-temp1, 64}
284 vst1.8 {q6, q7}, [r11]!
288 b LOOP_HEIGHT_256_LEFT_END
290 LOOP_HEIGHT_256_LEFT_64:
291 add r11, r1, r8 @ r11 = nv12t_src+tiled_offset1+2048+temp1
296 stmnefd sp!, {r9-r12, r14} @ backup registers
299 blne MEMCOPY_UNDER_64
300 ldmnefd sp!, {r9-r12, r14} @ restore registers
301 bne LOOP_HEIGHT_256_LEFT_64_64
302 vld1.8 {q0, q1}, [r11]! @ load {nv12t_src+tiled_offset1+temp1, 64}
303 vld1.8 {q2, q3}, [r11]
304 add r11, r0, r9 @ r11 = yuv420_dest+linear_offset
305 vst1.8 {q0, q1}, [r11]! @ store {yuv420_dest+linear_offset, 64}
306 vst1.8 {q2, q3}, [r11]!
307 LOOP_HEIGHT_256_LEFT_64_64:
311 LOOP_HEIGHT_256_LEFT_END:
313 ldr r12, [sp, #48] @ right
314 add r7, r7, r14, lsl #11 @ tiled_offset = tiled_offset+temp4*2048
315 add r10, r1, r7 @ r10 = nv12t_src+tiled_offset
317 bic r6, r6, #0xFF @ j = (left>>8)<<8
319 add r6, r6, #256 @ j = j + 256
320 sub r11, r2, r12 @ temp2 = yuv420_width-right-256
323 bgt LOOP_HEIGHT_256_WIDTH_END
325 LOOP_HEIGHT_256_WIDTH:
326 add r12, r10, #2048 @ r12 = nv12t_src+tiled_offset+2048
328 vld1.8 {q0, q1}, [r10]! @ load {nv12t_src+tiled_offset, 64}
330 vld1.8 {q2, q3}, [r10]
332 add r8, r8, r14, lsl #11 @ tiled_offset1 = tiled_offset1+temp4*2048
333 add r10, r1, r8 @ r10 = nv12t_src+tiled_offset1
335 vld1.8 {q4, q5}, [r12]! @ load {nv12t_src+tiled_offset+2048, 64}
337 vld1.8 {q6, q7}, [r12]
339 add r12, r10, #2048 @ r12 = nv12t_src+tiled_offset+2048
341 vld1.8 {q8, q9}, [r10]! @ load {nv12t_src+tiled_offset+2048, 64}
343 vld1.8 {q10, q11}, [r10]
345 add r7, r7, r14, lsl #11 @ tiled_offset = tiled_offset+temp4*2048
348 vld1.8 {q12, q13}, [r12]! @ load {nv12t_src+tiled_offset+2048, 64}
350 vld1.8 {q14, q15}, [r12]
352 add r12, r0, r9 @ r12 = yuv420_dest+linear_offset
353 vst1.8 {q0, q1}, [r12]!
354 vst1.8 {q2, q3}, [r12]!
355 vst1.8 {q4, q5}, [r12]!
356 vst1.8 {q6, q7}, [r12]!
357 vst1.8 {q8, q9}, [r12]!
358 vst1.8 {q10, q11}, [r12]!
359 vst1.8 {q12, q13}, [r12]!
360 vst1.8 {q14, q15}, [r12]!
361 add r9, r9, #256 @ linear_offset = linear_offset+256
363 add r12, r10, #2048 @ r12 = nv12t_src+tiled_offset+2048
365 add r6, r6, #256 @ j=j+256
366 cmp r6, r11 @ j<=temp2
367 ble LOOP_HEIGHT_256_WIDTH
369 LOOP_HEIGHT_256_WIDTH_END:
371 add r8, r8, r14, lsl #11 @ tiled_offset1 = tiled_offset1+temp4*2048
372 ldr r14, [sp, #48] @ right
373 sub r11, r2, r6 @ temp2 = yuv420_width-right-j
376 beq LOOP_HEIGHT_256_RIGHT_END
378 ble LOOP_HEIGHT_256_RIGHT_192
381 vld1.8 {q0, q1}, [r10]! @ load {nv12t_src+tiled_offset}
383 vld1.8 {q2, q3}, [r10]
385 add r10, r1, r8 @ r10 = nv12t_src+tiled_offset1
387 vld1.8 {q4, q5}, [r12]! @ load {nv12t_src+tiled_offset+2048}
389 vld1.8 {q6, q7}, [r12]
391 add r14, r10, #2048 @ r10 = nv12t_src+tiled_offset1+2048
393 vld1.8 {q8, q9}, [r10]! @ load {nv12t_src+tiled_offset1}
395 vld1.8 {q10, q11}, [r10]
397 add r12, r0, r9 @ r12 = yuv420_dest+linear_offset
398 vst1.8 {q0, q1}, [r12]!
399 vst1.8 {q2, q3}, [r12]!
400 vst1.8 {q4, q5}, [r12]!
401 vst1.8 {q6, q7}, [r12]!
402 vst1.8 {q8, q9}, [r12]!
403 vst1.8 {q10, q11}, [r12]!
404 add r9, r9, #192 @ linear_offset = linear_offset+192
406 stmfd sp!, {r9-r12, r14} @ backup registers
410 ldmfd sp!, {r9-r12, r14} @ restore registers
411 b LOOP_HEIGHT_256_RIGHT_END
413 LOOP_HEIGHT_256_RIGHT_192:
415 ble LOOP_HEIGHT_256_RIGHT_128
418 vld1.8 {q0, q1}, [r10]! @ load {nv12t_src+tiled_offset}
420 vld1.8 {q2, q3}, [r10]
422 add r14, r1, r8 @ r10 = nv12t_src+tiled_offset1
424 vld1.8 {q4, q5}, [r12]! @ load {nv12t_src+tiled_offset+2048}
426 vld1.8 {q6, q7}, [r12]
428 add r12, r0, r9 @ r12 = yuv420_dest+linear_offset
429 vst1.8 {q0, q1}, [r12]!
430 vst1.8 {q2, q3}, [r12]!
431 vst1.8 {q4, q5}, [r12]!
432 vst1.8 {q6, q7}, [r12]!
433 add r9, r9, #128 @ linear_offset = linear_offset+128
435 stmfd sp!, {r9-r12, r14} @ backup registers
439 ldmfd sp!, {r9-r12, r14} @ restore registers
440 b LOOP_HEIGHT_256_RIGHT_END
442 LOOP_HEIGHT_256_RIGHT_128:
444 ble LOOP_HEIGHT_256_RIGHT_64
447 vld1.8 {q0, q1}, [r10]! @ load {nv12t_src+tiled_offset}
449 vld1.8 {q2, q3}, [r10]
451 add r12, r0, r9 @ r12 = yuv420_dest+linear_offset
452 vst1.8 {q0, q1}, [r12]!
453 vst1.8 {q2, q3}, [r12]!
454 add r9, r9, #64 @ linear_offset = linear_offset+64
456 stmfd sp!, {r9-r12, r14} @ backup registers
460 ldmfd sp!, {r9-r12, r14} @ restore registers
461 b LOOP_HEIGHT_256_RIGHT_END
463 LOOP_HEIGHT_256_RIGHT_64:
464 stmfd sp!, {r9-r12, r14} @ backup registers
469 ldmfd sp!, {r9-r12, r14} @ restore registers
471 LOOP_HEIGHT_256_RIGHT_END:
473 ldr r14, [sp, #52] @ buttom
474 add r5, r5, #1 @ i=i+1
475 sub r14, r3, r14 @ i<yuv420_height-buttom
480 LOOP_HEIGHT_64_START:
481 cmp r10, #64 @ if (temp1 >= 64)
482 blt LOOP_HEIGHT_2_START
484 ldr r5, [sp, #44] @ i = top
486 ldr r6, [sp, #40] @ j = left
487 stmfd sp!, {r0-r3, r12} @ backup parameters
494 ldmfd sp!, {r0-r3, r12} @ restore parameters
495 ldr r9, [sp, #44] @ linear_offset = top
496 add r11, r6, #64 @ temp2 = ((j+64)>>6)<<6
498 sub r11, r11, r6 @ temp2 = temp2-j
499 sub r9, r5, r9 @ linear_offset = temp1*(i-top)
501 and r14, r6, #0x3 @ temp4 = j&0x3
502 add r7, r7, r14 @ tiled_offset = tiled_offset+temp4
503 stmfd sp!, {r9-r12} @ backup parameters
507 ldmfd sp!, {r9-r12} @ restore parameters
508 add r9, r9, r11 @ linear_offset = linear_offset+temp2
509 add r6, r6, r11 @ j = j+temp2@
514 stmfd sp!, {r0-r3, r12} @ backup parameters
521 ldmfd sp!, {r0-r3, r12} @ restore parameters
523 vld1.8 {q0, q1}, [r7]!
524 vld1.8 {q2, q3}, [r7]
526 vst1.8 {q0, q1}, [r7]!
527 vst1.8 {q2, q3}, [r7]
535 stmfd sp!, {r0-r3, r12} @ backup parameters
542 ldmfd sp!, {r0-r3, r12} @ restore parameters
544 vld1.8 {q0, q1}, [r7]!
545 vld1.8 {q2, q3}, [r7]
547 vst1.8 {q0, q1}, [r7]!
548 vst1.8 {q2, q3}, [r7]
555 stmfd sp!, {r0-r3, r12} @ backup parameters
562 ldmfd sp!, {r0-r3, r12} @ restore parameters
564 stmfd sp!, {r9-r12} @ backup parameters
568 ldmfd sp!, {r9-r12} @ restore parameters
572 ldr r14, [sp, #52] @ buttom
573 add r5, r5, #1 @ i=i+1
574 sub r14, r3, r14 @ i<yuv420_height-buttom
581 ldr r5, [sp, #44] @ i = top
584 ldr r6, [sp, #40] @ j = left
585 ldr r9, [sp, #44] @ linear_offset = top
586 add r11, r6, #64 @ temp2 = ((j+64)>>6)<<6
588 sub r11, r11, r6 @ temp2 = temp2-j
589 sub r9, r5, r9 @ linear_offset = temp1*(i-top)
591 add r9, r0, r9 @ linear_offset = linear_dst+linear_offset
593 stmfd sp!, {r0-r3, r12} @ backup parameters
600 ldmfd sp!, {r0-r3, r12} @ restore parameters
602 and r14, r6, #0x3 @ temp4 = j&0x3@
603 add r7, r7, r14 @ tiled_offset = tiled_offset+temp4@
609 ldr r14, [sp, #48] @ right
610 add r6, r6, #2 @ j=j+2
611 sub r14, r2, r14 @ j<yuv420_width-right
613 blt LOOP_HEIGHT_2_WIDTH
615 ldr r14, [sp, #52] @ buttom
616 add r5, r5, #1 @ i=i+1
617 sub r14, r3, r14 @ i<yuv420_height-buttom
622 ldmfd sp!, {r4-r12,r15} @ restore registers
624 MEMCOPY_UNDER_64: @ count=r10, src=r11
626 add r9, r0, r9 @ r9 = yuv420_dest+linear_offset
628 vld1.8 {q0, q1}, [r11]! @ load {nv12t_src+tiled_offset+temp1, 64}
631 vst1.8 {q0, q1}, [r9]! @ load {nv12t_src+tiled_offset+temp1, 64}
632 beq MEMCOPY_UNDER_END
636 vld1.8 {q0}, [r11]! @ load {nv12t_src+tiled_offset+temp1, 64}
639 vst1.8 {q0}, [r9]! @ load {nv12t_src+tiled_offset+temp1, 64}
640 beq MEMCOPY_UNDER_END
648 and r10, r6, #0x3F @ temp1 = left(==j)&0x3F
660 stmfd sp!, {r4, r5, lr}
671 eor r4, r4, r3, asr #5
678 orr r1, ip, r1, asl #4
683 eor r1, r2, r3, asr #5
690 orr r1, r1, r4, asl #4
696 orr r3, r1, r3, asl #13
697 orr r0, r3, r2, asl #11
698 ldmfd sp!, {r4, r5, pc}