3 * Copyright 2012 Samsung Electronics S.LSI Co. LTD
5 * Licensed under the Apache License, Version 2.0 (the "License")
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at
9 * http://www.apache.org/licenses/LICENSE-2.0
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
19 * @file csc_tiled_to_linear_uv_deinterleave_neon.s
20 * @brief SEC_OMX specific define. It supports the MFC 6.x tiled format.
21 * @author ShinWon Lee (shinwon.lee@samsung.com)
28 * Converts and Deinterleave tiled data to linear for mfc 6.x
29 * 1. UV of NV12T to U and V planes of YUV420P
32 * U address of YUV420[out]
35 * V address of YUV420[out]
38 * UV address of NV12T[in]
41 * real width of YUV420[in]. It should be even.
43 * @param yuv420_height
44 * real height of YUV420[in]. It should be even.
49 .global csc_tiled_to_linear_uv_deinterleave_neon
50 .type csc_tiled_to_linear_uv_deinterleave_neon, %function
51 csc_tiled_to_linear_uv_deinterleave_neon:
@-----------------------------------------------------------------------
@ void csc_tiled_to_linear_uv_deinterleave_neon(u_dst, v_dst, uv_src,
@                                               width, height)
@ ABI (AAPCS):  r0 = u_dst  (output U plane, linear)
@               r1 = v_dst  (output V plane, linear)
@               r2 = uv_src (input interleaved UV plane, MFC 6.x tiled)
@               r3 = width  (luma width in pixels; must be even)
@               [sp, #40] after the stmfd below = height
@
@ NOTE(review): this listing is an incomplete extraction of the original
@ file -- the embedded per-line numbers are discontinuous, the loop
@ counters r5 (row-band index i) and r6 (column index j) are used below
@ without any visible initialization or increment, several `blt`
@ instructions have no visible `cmp` before them, and the scalar
@ byte-copy loop bodies are missing entirely. Recover the complete
@ upstream file before assembling; do not fill in the gaps by guesswork.
@-----------------------------------------------------------------------
54 .equ CACHE_LINE_SIZE, 64
55 .equ PRE_LOAD_OFFSET, 6 @ prefetch distance: 6 cache lines ahead
72 stmfd sp!, {r4-r12,r14} @ backup registers
73 ldr r4, [sp, #40] @ r4 = height (5th arg; 40 = 10 regs saved above)
75 bic r9, r4, #0x7 @ aligned_height = height & (~0x7)
76 bic r10, r3, #0xF @ aligned_width = width & (~0xF)
77 add r11, r3, #15 @ tiled_width = ((width + 15) >> 4) << 4
@ ---- main pass: full 8-row bands of the tiled source ----
82 LOOP_MAIN_ALIGNED_HEIGHT:
@ NOTE(review): r5 (band index i) is read here but its init/increment
@ lines are not present in this listing.
83 mul r8, r11, r5 @ src_offset = tiled_width * i
85 add r8, r2, r8 @ src_offset = uv_src + src_offset
86 LOOP_MAIN_ALIGNED_WIDTH:
87 mov r12, r3, asr #1 @ temp1 = (width >> 1) * i + (j >> 1)
90 pld [r8, #(CACHE_LINE_SIZE*PRE_LOAD_OFFSET)]
91 vld2.8 {q0, q1}, [r8]! @ de-interleave: even bytes (U) -> q0, odd (V) -> q1
92 add r12, r12, r6, asr #1 @ NOTE(review): r6 (column j) init not visible
93 vld2.8 {q2, q3}, [r8]!
94 add r7, r0, r12 @ dst_offset = u_dst + temp1
95 pld [r8, #(CACHE_LINE_SIZE*PRE_LOAD_OFFSET)]
96 vld2.8 {q4, q5}, [r8]!
97 mov r14, r3, asr #1 @ temp2 = width / 2 (dst row stride in bytes)
98 vld2.8 {q6, q7}, [r8]!
@ store 8 rows x 8 bytes of U, post-incrementing by the dst stride
100 vst1.8 {d0}, [r7], r14
101 vst1.8 {d1}, [r7], r14
102 vst1.8 {d4}, [r7], r14
103 vst1.8 {d5}, [r7], r14
104 vst1.8 {d8}, [r7], r14
105 vst1.8 {d9}, [r7], r14
106 vst1.8 {d12}, [r7], r14
107 vst1.8 {d13}, [r7], r14
@ same 8 rows for the V plane (odd bytes of each UV pair)
109 add r7, r1, r12 @ dst_offset = v_dst + temp1
111 vst1.8 {d2}, [r7], r14
112 vst1.8 {d3}, [r7], r14
113 vst1.8 {d6}, [r7], r14
114 vst1.8 {d7}, [r7], r14
115 vst1.8 {d10}, [r7], r14
116 vst1.8 {d11}, [r7], r14
118 vst1.8 {d14}, [r7], r14
120 vst1.8 {d15}, [r7], r14
121 blt LOOP_MAIN_ALIGNED_WIDTH @ NOTE(review): no cmp visible before this blt
@ ---- main pass tail: leftover columns when width is not a multiple of 16 ----
123 MAIN_REMAIN_WIDTH_START:
124 cmp r10, r3 @ if (aligned_width != width) {
125 beq MAIN_REMAIN_WIDTH_END
126 stmfd sp!, {r0-r2,r4} @ backup registers
127 mul r8, r11, r5 @ src_offset = (tiled_width * i) + (j << 3)
128 add r8, r8, r6, lsl #3
129 add r8, r2, r8 @ r8 = uv_src + src_offset
130 mov r12, r3, asr #1 @ temp1 = (width >> 1) * i + (j >> 1)
132 add r12, r12, r6, asr #1
133 add r7, r0, r12 @ r7 = u_dst + temp1
134 add r12, r1, r12 @ r12 = v_dst + temp1
135 sub r14, r3, r6 @ r14 = (width - j) / 2
139 LOOP_MAIN_REMAIN_HEIGHT:
140 mov r0, #0 @ r0 is index in de-interleave
141 LOOP_MAIN_REMAIN_WIDTH:
@ NOTE(review): the scalar byte-copy body of this loop (loads, U/V
@ stores, index increment, cmp) is missing from this listing.
148 blt LOOP_MAIN_REMAIN_WIDTH
150 sub r8, r8, r14, lsl #1 @ rewind src to the start of the copied span
154 add r7, r7, r3, asr #1 @ advance U dst by one row (width / 2)
155 add r12, r12, r3, asr #1 @ advance V dst by one row (width / 2)
159 blt LOOP_MAIN_REMAIN_HEIGHT @ NOTE(review): no cmp visible before this blt
160 ldmfd sp!, {r0-r2,r4} @ restore registers
161 MAIN_REMAIN_WIDTH_END:
165 blt LOOP_MAIN_ALIGNED_HEIGHT @ NOTE(review): no cmp visible before this blt
@ ---- second pass: leftover rows when height is not a multiple of 8 ----
168 cmp r9, r4 @ if (aligned_height != height) {
169 beq REMAIN_HEIGHT_END
172 LOOP_REMAIN_HEIGHT_WIDTH16:
173 mul r8, r11, r5 @ src_offset = (tiled_width * i) + (j << 3)
174 add r8, r8, r6, lsl #3
175 add r8, r2, r8 @ src_offset = uv_src + src_offset
177 mov r12, r3, asr #1 @ temp1 = (width >> 1) * i + (j >> 1)
179 add r12, r12, r6, asr #1
180 add r7, r0, r12 @ r7 = u_dst + temp1
181 add r12, r1, r12 @ r12 = v_dst + temp1
182 mov r14, r3, asr #1 @ temp2 = width / 2
184 stmfd sp!, {r0-r1} @ backup registers
187 LOOP_REMAIN_HEIGHT_WIDTH16_HEIGHT1:
188 vld2.8 {d0, d1}, [r8]! @ one row: 8 U bytes -> d0, 8 V bytes -> d1
189 vst1.8 {d0}, [r7], r14
190 vst1.8 {d1}, [r12], r14
194 blt LOOP_REMAIN_HEIGHT_WIDTH16_HEIGHT1 @ NOTE(review): cmp not visible
195 ldmfd sp!, {r0-r1} @ restore registers
199 blt LOOP_REMAIN_HEIGHT_WIDTH16 @ NOTE(review): cmp not visible
@ ---- final corner: leftover rows AND leftover columns ----
201 REMAIN_HEIGHT_REMAIN_WIDTH_START:
203 beq REMAIN_HEIGHT_REMAIN_WIDTH_END @ NOTE(review): the cmp setting these flags is not visible
204 mul r8, r11, r5 @ src_offset = (tiled_width * i) + (j << 3)
205 add r8, r8, r6, lsl #3
206 add r8, r2, r8 @ src_offset = uv_src + src_offset
208 mov r12, r3, asr #1 @ temp1 = (width >> 1) * i + (j >> 1)
210 add r12, r12, r6, asr #1
211 add r7, r0, r12 @ r7 = u_dst + temp1
212 add r12, r1, r12 @ r12 = v_dst + temp1
213 sub r14, r3, r6 @ r14 = (width - j) /2
216 stmfd sp!, {r0-r2,r4-r5} @ backup registers
219 LOOP_REMAIN_HEIGHT_REMAIN_WIDTH_HEIGHT1:
222 LOOP_REMAIN_HEIGHT_REMAIN_WIDTH_HEIGHT1_WIDTHx:
@ NOTE(review): scalar copy body of this loop is missing from this listing.
229 blt LOOP_REMAIN_HEIGHT_REMAIN_WIDTH_HEIGHT1_WIDTHx
231 sub r8, r8, r14, lsl #1 @ rewind src to the start of the copied span
235 add r7, r7, r3, asr #1 @ advance U dst by one row (width / 2)
236 add r12, r12, r3, asr #1 @ advance V dst by one row (width / 2)
240 blt LOOP_REMAIN_HEIGHT_REMAIN_WIDTH_HEIGHT1 @ NOTE(review): cmp not visible
241 ldmfd sp!, {r0-r2,r4-r5} @ restore registers
243 REMAIN_HEIGHT_REMAIN_WIDTH_END:
248 ldmfd sp!, {r4-r12,r15} @ restore registers; loading r15 (pc) returns to caller