3 * Copyright 2012 Samsung Electronics S.LSI Co. LTD
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at
9 * http://www.apache.org/licenses/LICENSE-2.0
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
19 * @file csc_linear_to_tiled_crop_neon.s
20 * @brief SEC_OMX specific define
21 * @author ShinWon Lee (shinwon.lee@samsung.com)
28 * Converts linear data to tiled
29 * Crops left, top, right, bottom
30 * 1. Y of YUV420P to Y of NV12T
31 * 2. Y of YUV420S to Y of NV12T
32 * 3. UV of YUV420S to UV of NV12T
35 * Y or UV plane address of NV12T[out]
38 * Y or UV plane address of YUV420P(S)[in]
43 * @param yuv420_height
44 * Y: Height of YUV420, UV: Height/2 of YUV420[in]
47 * Crop size of left. It should be even.
50 * Crop size of top. It should be even.
53 * Crop size of right. It should be even.
56 * Crop size of bottom. It should be even.
61 .global csc_linear_to_tiled_crop_neon
62 .type csc_linear_to_tiled_crop_neon, %function
63 csc_linear_to_tiled_crop_neon:
@-----------------------------------------------------------------------
@ csc_linear_to_tiled_crop_neon(tiled_dest, linear_src,
@                               yuv420_width, yuv420_height,
@                               left, top, right, bottom)
@ Copies a cropped linear YUV420 plane into Samsung NV12T tiled layout.
@ ABI:  AAPCS (ARM 32-bit)
@ In:   r0 = tiled_dest    (tiled plane base, written)
@       r1 = linear_src    (linear plane base, read)
@       r2 = yuv420_width  (linear stride in bytes)
@       r3 = yuv420_height
@       Stack args, addressed after the 10-register stmfd below (40 bytes):
@       [sp,#40] = left, [sp,#44] = top, [sp,#48] = right, [sp,#52] = bottom
@ Out:  none (memory-to-memory conversion); all crop sizes must be even
@ Uses: r4 = j (output column), r5 = i (output row), r6 = tiled offset/addr,
@       r7 = linear addr, r8 = aligned_x_size, r9 = aligned_y_size,
@       r10/r11/r12/r14 = temporaries, q0-q15 = 64-byte row staging
@ NOTE(review): this view of the file is elided — alignment masking,
@ the loop-entry labels referenced by the branches below, and the call
@ into the GET_TILED_OFFSET helper are on lines not shown here.
@-----------------------------------------------------------------------
81 stmfd sp!, {r4-r12,r14} @ backup registers (lr saved; popped into pc on exit)
83 ldr r11, [sp, #44] @ top
84 ldr r14, [sp, #52] @ bottom
85 ldr r10, [sp, #40] @ left
86 ldr r12, [sp, #48] @ right
88 sub r9, r3, r11 @ aligned_y_size = ((yuv420_height-top-bottom)>>5)<<5
92 sub r8, r2, r10 @ aligned_x_size = ((yuv420_width-left-right)>>6)<<6
@ --- main tile loop: one full 64-byte-wide, 32-row tile per iteration ---
104 ldr r10, [sp, #44] @ r10 = top
105 ldr r14, [sp, #40] @ r14 = left
106 add r10, r5, r10 @ temp1 = i+top (presumably scaled by linear_x_size on an elided line — TODO confirm)
108 add r7, r1, r4 @ linear_addr = linear_src+j
109 add r7, r7, r10 @ linear_addr = linear_addr+temp1
110 add r7, r7, r14 @ linear_addr = linear_addr+left
@ r10 below is the post-index stride to the next source row
@ (set on an elided line; presumably linear_x_size-32, cf. the sub r10,r2,#32 in the 2-row loop — TODO confirm)
114 vld1.8 {q0, q1}, [r7]! @ load {linear_src, 64}
116 vld1.8 {q2, q3}, [r7], r10
118 vld1.8 {q4, q5}, [r7]! @ load {linear_src+linear_x_size*1, 64}
120 vld1.8 {q6, q7}, [r7], r10
122 vld1.8 {q8, q9}, [r7]! @ load {linear_src+linear_x_size*2, 64}
124 vld1.8 {q10, q11}, [r7], r10
126 vld1.8 {q12, q13}, [r7]! @ load {linear_src+linear_x_size*3, 64}
128 vld1.8 {q14, q15}, [r7], r10
129 add r6, r0, r6 @ tiled_addr = tiled_dest+tiled_addr
130 vst1.8 {q0, q1}, [r6]! @ store {tiled_addr}
131 vst1.8 {q2, q3}, [r6]!
132 vst1.8 {q4, q5}, [r6]! @ store {tiled_addr+64*1}
133 vst1.8 {q6, q7}, [r6]!
134 vst1.8 {q8, q9}, [r6]! @ store {tiled_addr+64*2}
135 vst1.8 {q10, q11}, [r6]!
136 vst1.8 {q12, q13}, [r6]! @ store {tiled_addr+64*3}
137 vst1.8 {q14, q15}, [r6]!
140 vld1.8 {q0, q1}, [r7]! @ load {linear_src+linear_x_size*4, 64}
142 vld1.8 {q2, q3}, [r7], r10
144 vld1.8 {q4, q5}, [r7]! @ load {linear_src+linear_x_size*5, 64}
146 vld1.8 {q6, q7}, [r7], r10
148 vld1.8 {q8, q9}, [r7]! @ load {linear_src+linear_x_size*6, 64}
150 vld1.8 {q10, q11}, [r7], r10
152 vld1.8 {q12, q13}, [r7]! @ load {linear_src+linear_x_size*7, 64}
154 vld1.8 {q14, q15}, [r7], r10
155 vst1.8 {q0, q1}, [r6]! @ store {tiled_addr+64*4}
156 vst1.8 {q2, q3}, [r6]!
157 vst1.8 {q4, q5}, [r6]! @ store {tiled_addr+64*5}
158 vst1.8 {q6, q7}, [r6]!
159 vst1.8 {q8, q9}, [r6]! @ store {tiled_addr+64*6}
160 vst1.8 {q10, q11}, [r6]!
161 vst1.8 {q12, q13}, [r6]! @ store {tiled_addr+64*7}
162 vst1.8 {q14, q15}, [r6]!
165 vld1.8 {q0, q1}, [r7]! @ load {linear_src+linear_x_size*8, 64}
167 vld1.8 {q2, q3}, [r7], r10
169 vld1.8 {q4, q5}, [r7]! @ load {linear_src+linear_x_size*9, 64}
171 vld1.8 {q6, q7}, [r7], r10
173 vld1.8 {q8, q9}, [r7]! @ load {linear_src+linear_x_size*10, 64}
175 vld1.8 {q10, q11}, [r7], r10
177 vld1.8 {q12, q13}, [r7]! @ load {linear_src+linear_x_size*11, 64}
179 vld1.8 {q14, q15}, [r7], r10
180 vst1.8 {q0, q1}, [r6]! @ store {tiled_addr+64*8}
181 vst1.8 {q2, q3}, [r6]!
182 vst1.8 {q4, q5}, [r6]! @ store {tiled_addr+64*9}
183 vst1.8 {q6, q7}, [r6]!
184 vst1.8 {q8, q9}, [r6]! @ store {tiled_addr+64*10}
185 vst1.8 {q10, q11}, [r6]!
186 vst1.8 {q12, q13}, [r6]! @ store {tiled_addr+64*11}
187 vst1.8 {q14, q15}, [r6]!
190 vld1.8 {q0, q1}, [r7]! @ load {linear_src+linear_x_size*12, 64}
192 vld1.8 {q2, q3}, [r7], r10
194 vld1.8 {q4, q5}, [r7]! @ load {linear_src+linear_x_size*13, 64}
196 vld1.8 {q6, q7}, [r7], r10
198 vld1.8 {q8, q9}, [r7]! @ load {linear_src+linear_x_size*14, 64}
200 vld1.8 {q10, q11}, [r7], r10
202 vld1.8 {q12, q13}, [r7]! @ load {linear_src+linear_x_size*15, 64}
204 vld1.8 {q14, q15}, [r7], r10
205 vst1.8 {q0, q1}, [r6]! @ store {tiled_addr+64*12}
206 vst1.8 {q2, q3}, [r6]!
207 vst1.8 {q4, q5}, [r6]! @ store {tiled_addr+64*13}
208 vst1.8 {q6, q7}, [r6]!
209 vst1.8 {q8, q9}, [r6]! @ store {tiled_addr+64*14}
210 vst1.8 {q10, q11}, [r6]!
211 vst1.8 {q12, q13}, [r6]! @ store {tiled_addr+64*15}
212 vst1.8 {q14, q15}, [r6]!
215 vld1.8 {q0, q1}, [r7]! @ load {linear_src+linear_x_size*16, 64}
217 vld1.8 {q2, q3}, [r7], r10
219 vld1.8 {q4, q5}, [r7]! @ load {linear_src+linear_x_size*17, 64}
221 vld1.8 {q6, q7}, [r7], r10
223 vld1.8 {q8, q9}, [r7]! @ load {linear_src+linear_x_size*18, 64}
225 vld1.8 {q10, q11}, [r7], r10
227 vld1.8 {q12, q13}, [r7]! @ load {linear_src+linear_x_size*19, 64}
229 vld1.8 {q14, q15}, [r7], r10
230 vst1.8 {q0, q1}, [r6]! @ store {tiled_addr+64*16}
231 vst1.8 {q2, q3}, [r6]!
232 vst1.8 {q4, q5}, [r6]! @ store {tiled_addr+64*17}
233 vst1.8 {q6, q7}, [r6]!
234 vst1.8 {q8, q9}, [r6]! @ store {tiled_addr+64*18}
235 vst1.8 {q10, q11}, [r6]!
236 vst1.8 {q12, q13}, [r6]! @ store {tiled_addr+64*19}
237 vst1.8 {q14, q15}, [r6]!
240 vld1.8 {q0, q1}, [r7]! @ load {linear_src+linear_x_size*20, 64}
242 vld1.8 {q2, q3}, [r7], r10
244 vld1.8 {q4, q5}, [r7]! @ load {linear_src+linear_x_size*21, 64}
246 vld1.8 {q6, q7}, [r7], r10
248 vld1.8 {q8, q9}, [r7]! @ load {linear_src+linear_x_size*22, 64}
250 vld1.8 {q10, q11}, [r7], r10
252 vld1.8 {q12, q13}, [r7]! @ load {linear_src+linear_x_size*23, 64}
254 vld1.8 {q14, q15}, [r7], r10
255 vst1.8 {q0, q1}, [r6]! @ store {tiled_addr+64*20}
256 vst1.8 {q2, q3}, [r6]!
257 vst1.8 {q4, q5}, [r6]! @ store {tiled_addr+64*21}
258 vst1.8 {q6, q7}, [r6]!
259 vst1.8 {q8, q9}, [r6]! @ store {tiled_addr+64*22}
260 vst1.8 {q10, q11}, [r6]!
261 vst1.8 {q12, q13}, [r6]! @ store {tiled_addr+64*23}
262 vst1.8 {q14, q15}, [r6]!
265 vld1.8 {q0, q1}, [r7]! @ load {linear_src+linear_x_size*24, 64}
267 vld1.8 {q2, q3}, [r7], r10
269 vld1.8 {q4, q5}, [r7]! @ load {linear_src+linear_x_size*25, 64}
271 vld1.8 {q6, q7}, [r7], r10
273 vld1.8 {q8, q9}, [r7]! @ load {linear_src+linear_x_size*26, 64}
275 vld1.8 {q10, q11}, [r7], r10
277 vld1.8 {q12, q13}, [r7]! @ load {linear_src+linear_x_size*27, 64}
279 vld1.8 {q14, q15}, [r7], r10
280 vst1.8 {q0, q1}, [r6]! @ store {tiled_addr+64*24}
281 vst1.8 {q2, q3}, [r6]!
282 vst1.8 {q4, q5}, [r6]! @ store {tiled_addr+64*25}
283 vst1.8 {q6, q7}, [r6]!
284 vst1.8 {q8, q9}, [r6]! @ store {tiled_addr+64*26}
285 vst1.8 {q10, q11}, [r6]!
286 vst1.8 {q12, q13}, [r6]! @ store {tiled_addr+64*27}
287 vst1.8 {q14, q15}, [r6]!
290 vld1.8 {q0, q1}, [r7]! @ load {linear_src+linear_x_size*28, 64}
292 vld1.8 {q2, q3}, [r7], r10
294 vld1.8 {q4, q5}, [r7]! @ load {linear_src+linear_x_size*29, 64}
296 vld1.8 {q6, q7}, [r7], r10
298 vld1.8 {q8, q9}, [r7]! @ load {linear_src+linear_x_size*30, 64}
300 vld1.8 {q10, q11}, [r7], r10
301 vld1.8 {q12, q13}, [r7]! @ load {linear_src+linear_x_size*31, 64}
302 vld1.8 {q14, q15}, [r7], r10
303 vst1.8 {q0, q1}, [r6]! @ store {tiled_addr+64*28}
304 vst1.8 {q2, q3}, [r6]!
305 vst1.8 {q4, q5}, [r6]! @ store {tiled_addr+64*29}
306 vst1.8 {q6, q7}, [r6]!
307 vst1.8 {q8, q9}, [r6]! @ store {tiled_addr+64*30}
308 vst1.8 {q10, q11}, [r6]!
309 vst1.8 {q12, q13}, [r6]! @ store {tiled_addr+64*31}
310 vst1.8 {q14, q15}, [r6]!
312 add r4, r4, #64 @ j = j+64
313 cmp r4, r8 @ j<aligned_x_size
314 blt LOOP_ALIGNED_X_SIZE
316 add r5, r5, #32 @ i = i+32
317 cmp r5, r9 @ i<aligned_y_size
318 blt LOOP_ALIGNED_Y_SIZE
@ --- residual rows below the 32-aligned region: 2 rows per pass ---
320 ldr r10, [sp, #44] @ r10 = top
321 ldr r11, [sp, #52] @ r11 = bottom
324 cmp r5, r10 @ i == (yuv420_height-top-bottom)
325 beq LOOP_LINEAR_Y_SIZE_2_START
327 LOOP_LINEAR_Y_SIZE_1:
330 LOOP_ALIGNED_X_SIZE_1:
334 ldr r10, [sp, #44] @ r10 = top
335 ldr r14, [sp, #40] @ r14 = left
336 add r10, r5, r10 @ temp1 = i+top (presumably scaled by yuv420_width on an elided line — TODO confirm)
338 add r7, r1, r4 @ linear_addr = linear_src+j
339 add r7, r7, r10 @ linear_addr = linear_addr+temp1
340 add r7, r7, r14 @ linear_addr = linear_addr+left
341 sub r10, r2, #32 @ temp1 = yuv420_width-32 (stride from end of first 32B to next row)
344 vld1.8 {q0, q1}, [r7]! @ load {linear_src, 64}
346 vld1.8 {q2, q3}, [r7], r10
347 vld1.8 {q4, q5}, [r7]! @ load {linear_src+linear_x_size*1, 64}
348 vld1.8 {q6, q7}, [r7]
349 add r6, r0, r6 @ tiled_addr = tiled_dest+tiled_addr
350 and r10, r5, #0x1F @ temp1 = i&0x1F (row offset inside the 32-row tile)
351 mov r10, r10, lsl #6 @ temp1 = 64*temp1
352 add r6, r6, r10 @ tiled_addr = tiled_addr+temp1
353 vst1.8 {q0, q1}, [r6]! @ store {tiled_addr}
354 vst1.8 {q2, q3}, [r6]!
355 vst1.8 {q4, q5}, [r6]! @ store {tiled_addr+64*1}
356 vst1.8 {q6, q7}, [r6]!
358 add r4, r4, #64 @ j = j+64
359 cmp r4, r8 @ j<aligned_x_size
360 blt LOOP_ALIGNED_X_SIZE_1
362 add r5, r5, #2 @ i = i+2
363 ldr r10, [sp, #44] @ r10 = top
364 ldr r14, [sp, #52] @ r14 = bottom
367 cmp r5, r10 @ i<yuv420_height-top-bottom
368 blt LOOP_LINEAR_Y_SIZE_1
@ --- residual columns right of the 64-aligned region: 2x2 pixels per pass ---
370 LOOP_LINEAR_Y_SIZE_2_START:
371 ldr r10, [sp, #40] @ r10 = left
372 ldr r11, [sp, #48] @ r11 = right
375 cmp r8, r10 @ aligned_x_size == (yuv420_width-left-right)
379 LOOP_LINEAR_Y_SIZE_2:
381 mov r4, r8 @ j = aligned_x_size
382 LOOP_LINEAR_X_SIZE_2:
386 ldr r10, [sp, #44] @ r10 = top
387 ldr r14, [sp, #40] @ r14 = left
389 mul r10, r2, r10 @ temp1 = linear_x_size*(i+top) (i added on an elided line — TODO confirm)
390 add r7, r1, r4 @ linear_addr = linear_src+j
391 add r7, r7, r10 @ linear_addr = linear_addr+temp1
392 add r7, r7, r14 @ linear_addr = linear_addr+left
394 add r6, r0, r6 @ tiled_addr = tiled_dest+tiled_addr
395 and r11, r5, #0x1F @ temp2 = i&0x1F
396 mov r11, r11, lsl #6 @ temp2 = 64*temp2
397 add r6, r6, r11 @ tiled_addr = tiled_addr+temp2
398 and r11, r4, #0x3F @ temp2 = j&0x3F
399 add r6, r6, r11 @ tiled_addr = tiled_addr+temp2
406 ldr r12, [sp, #40] @ r12 = left
407 ldr r14, [sp, #48] @ r14 = right
408 add r4, r4, #2 @ j = j+2
411 cmp r4, r12 @ j<(yuv420_width-left-right)
412 blt LOOP_LINEAR_X_SIZE_2
414 ldr r12, [sp, #44] @ r12 = top
415 ldr r14, [sp, #52] @ r14 = bottom
416 add r5, r5, #2 @ i = i+2
419 cmp r5, r12 @ i<(yuv420_height-top-bottom)
420 blt LOOP_LINEAR_Y_SIZE_2
423 ldmfd sp!, {r4-r12,r15} @ restore registers and return (saved lr popped into pc)
@-----------------------------------------------------------------------
@ Tile-offset helper (entry label elided from this view — presumably
@ GET_TILED_OFFSET, reached via bl; the return after the final label is
@ also not visible here — TODO confirm against the full file).
@ In:   r4 = j (x position), r5 = i (y position),
@       r2 = linear_x_size, r3 = linear_y_size
@ Out:  r6 = byte offset of the 2KB (64x32) tile containing (j, i),
@       obtained as tile index << 11
@ Clobbers: r7 (dead loads of stack args), r10, r11, r12
@ Tiles are numbered in the Z-order used by NV12T: odd tile rows and the
@ last even row use a linear formula; interior even rows interleave with
@ the row below (the temp1+((temp1+2)&~3) term).
@-----------------------------------------------------------------------
427 mov r11, r5, asr #5 @ temp2 = i>>5 (tile row index)
428 mov r10, r4, asr #6 @ temp1 = j>>6 (tile column index)
430 and r12, r11, #0x1 @ if (temp2 & 0x1)
432 bne GET_TILED_OFFSET_EVEN_FORMULA_1
434 GET_TILED_OFFSET_ODD_FORMULA:
435 sub r6, r11, #1 @ tiled_addr = temp2-1
437 ldr r7, [sp, #40] @ left
438 add r12, r2, #127 @ temp3 = linear_x_size+127
440 ldr r7, [sp, #48] @ right
442 bic r12, r12, #0x7F @ temp3 = (temp3>>7)<<7 (round width up to 128)
443 mov r12, r12, asr #6 @ temp3 = temp3>>6 (tiles per row, even count)
444 mul r6, r6, r12 @ tiled_addr = tiled_addr*temp3
445 add r6, r6, r10 @ tiled_addr = tiled_addr+temp1
446 add r6, r6, #2 @ tiled_addr = tiled_addr+2
447 bic r12, r10, #0x3 @ temp3 = (temp1>>2)<<2
448 add r6, r6, r12 @ tiled_addr = tiled_addr+temp3
449 mov r6, r6, lsl #11 @ tiled_addr = tiled_addr<<11 (tile index -> byte offset, 2KB tiles)
450 b GET_TILED_OFFSET_RETURN
452 GET_TILED_OFFSET_EVEN_FORMULA_1:
453 ldr r7, [sp, #44] @ top
454 add r12, r3, #31 @ temp3 = linear_y_size+31
456 ldr r7, [sp, #52] @ bottom
458 bic r12, r12, #0x1F @ temp3 = (temp3>>5)<<5 (height rounded up to 32)
459 sub r12, r12, #32 @ temp3 = temp3 - 32
460 cmp r5, r12 @ if (i<(temp3-32)) { — interior even tile row
461 bge GET_TILED_OFFSET_EVEN_FORMULA_2
462 add r12, r10, #2 @ temp3 = temp1+2
463 bic r12, r12, #3 @ temp3 = (temp3>>2)<<2
464 add r6, r10, r12 @ tiled_addr = temp1+temp3 (Z-order interleave with row below)
465 ldr r7, [sp, #40] @ left
466 add r12, r2, #127 @ temp3 = linear_x_size+127
468 ldr r7, [sp, #48] @ right
470 bic r12, r12, #0x7F @ temp3 = (temp3>>7)<<7
471 mov r12, r12, asr #6 @ temp3 = temp3>>6 (tiles per row)
472 mul r11, r11, r12 @ tiled_y_index = tiled_y_index*temp3
473 add r6, r6, r11 @ tiled_addr = tiled_addr+tiled_y_index
474 mov r6, r6, lsl #11 @ tiled_addr = tiled_addr<<11
475 b GET_TILED_OFFSET_RETURN
477 GET_TILED_OFFSET_EVEN_FORMULA_2:
478 ldr r7, [sp, #40] @ left
479 add r12, r2, #127 @ temp3 = linear_x_size+127
481 ldr r7, [sp, #48] @ right
483 bic r12, r12, #0x7F @ temp3 = (temp3>>7)<<7
484 mov r12, r12, asr #6 @ temp3 = temp3>>6 (tiles per row)
485 mul r6, r11, r12 @ tiled_addr = temp2*temp3 (last even row: plain row-major)
486 add r6, r6, r10 @ tiled_addr = tiled_addr+temp1
487 mov r6, r6, lsl #11 @ tiled_addr = tiled_addr<<11
489 GET_TILED_OFFSET_RETURN: