@------------------------------------------------------------------------------
@ csc_RGBA8888_to_YUV420SP_NEON
@ Convert RGBA8888 pixels to YUV420 semi-planar (Y plane + interleaved UV),
@ processing two source/destination rows per outer iteration: 16 pixels at a
@ time with NEON, then a scalar tail for widths not a multiple of 16.
@ Register roles visible in this code:
@   r0 = pDstY  (Y output, row 1)     r1 = pDstUV (interleaved UV output)
@   r2 = pSrcRGB (RGBA8888 input)     r3 = nWidth (pixels per row)
@   [sp,#40] = nHeight (5th argument, read after the 10-register stmfd)
@ NOTE(review): the exact C prototype is not visible in this chunk - confirm
@ argument order against the caller before relying on the roles above.
@ NOTE(review): this listing is an excerpt; labels LOOP_NHEIGHT2, LOOP_NWIDTH16,
@ LOOP_NWIDTH2 and some setup (e.g. the mask in r6, widen/narrow steps) are
@ defined in lines not shown here.
@------------------------------------------------------------------------------
4 .global csc_RGBA8888_to_YUV420SP_NEON
5 .type csc_RGBA8888_to_YUV420SP_NEON, %function
6 csc_RGBA8888_to_YUV420SP_NEON:
13 @r4 pDstY2 = pDstY + nWidth
14 @r5 pSrcRGB2 = pSrcRGB + nWidthx4 (4 bytes per RGBA pixel)
16 @r7 temp6, accumulator
20 @r11 temp2, immediate operand
22 @r14 temp0, debugging pointer
24 .equ CACHE_LINE_SIZE, 32
25 .equ PRE_LOAD_OFFSET, 6
27 stmfd sp!, {r4-r12,r14} @ save callee-saved registers + lr (10 regs = 40 bytes)
28 ldr r12, [sp, #40] @ load nHeight (first stack arg, above the 40-byte save area)
29 @ldr r14, [sp, #44] @ load pTest (debug aid, disabled)
30 add r4, r0, r3 @r4: pDstY2 = pDstY + nWidth (Y destination for row 2)
31 add r5, r2, r3, lsl #2 @r5: pSrcRGB2 = pSrcRGB + nWidth*4 (source row 2)
32 sub r8, r3, #16 @r8: nWidthTmp = nWidth - 16 (counter for 16-pixel blocks)
42 vmov.u16 q6, #66 @ coefficient preload (q6 is re-purposed as the B lane later)
45 vmov.u16 q9, #0x8080 @ U/V bias: 128<<8 + 128 (offset plus rounding term)
47 vmov.u16 q10, #0x1000 @ Y bias: 16<<8 (0x1000 exactly; the old "+128" note was wrong)
50 vmov.u16 q11, #38 @ |38|, subtracted via vmls: -38*R term of U
51 vmov.u16 q12, #74 @ |74|, subtracted via vmls: -74*G term of U
53 vmov.u16 q14, #94 @ |94|, subtracted via vmls: -94*G term of V
54 vmov.u16 q15, #18 @ |18|, subtracted via vmls: -18*B term of V
60 stmfd sp!, {r12} @ spill nHeight so r12 is free inside the loops
63 pld [r2, #(CACHE_LINE_SIZE*PRE_LOAD_OFFSET)] @ prefetch source 6 cache lines ahead
64 @-------------------------------------------YUV ------------------------------------------
65 vmov.u16 q14, #94 @ reload V coefficients: q14/q15 are clobbered by the Y pass below
66 vmov.u16 q15, #18 @ #18
67 vld4.8 {d0,d1,d2,d3}, [r2]! @ load 8 RGBA pixels, de-interleaving components
68 vld4.8 {d4,d5,d6,d7}, [r2]! @ load 8 more (16 pixels per iteration)
84 vand.u16 q4,#0x00FF @ R: keep low byte of each 16-bit lane
85 vand.u16 q5,#0x00FF @ G
86 vand.u16 q6,#0x00FF @ B
88 vmov.u16 q8,q9 @ CalcU(): start from bias 128<<8 + 128
89 vmla.u16 q8,q6,q13 @ + 112 * B[k]
90 vmls.u16 q8,q4,q11 @ - 38 * R[k]
91 vmls.u16 q8,q5,q12 @ - 74 * G[k]
92 vshr.u16 q8,q8, #8 @ U = (128<<8 + 128 + u) >> 8
94 vmov.u16 q7,q9 @ CalcV(): start from bias 128<<8 + 128
95 vmla.u16 q7,q4,q13 @ + 112 * R[k]
96 vmls.u16 q7,q5,q14 @ - 94 * G[k]
97 vmls.u16 q7,q6,q15 @ - 18 * B[k]
98 vshr.u16 q7,q7, #8 @ V = (128<<8 + 128 + v) >> 8
102 vst1.8 {q8}, [r1]! @ write UV block to the UV plane (NOTE(review): U/V interleave of q7/q8 happens in lines not shown - confirm)
104 @-------------------------------------------Y ------------------------------------------
106 vmov.u16 q14, #66 @ Y coefficients: 66*R
107 vmov.u16 q15, #129 @ 129*G
108 vmov.u16 q8, #25 @ 25*B
112 vmul.u16 q7,q4,q14 @ y  = 66  * R[k]
113 vmla.u16 q7,q5,q15 @ y += 129 * G[k]
114 vmla.u16 q7,q6,q8 @ y += 25  * B[k]
136 vmul.u16 q0,q4,q14 @ second batch of 8 pixels: y  = 66  * R[k]
137 vmla.u16 q0,q5,q15 @ y += 129 * G[k]
138 vmla.u16 q0,q6,q8 @ y += 25  * B[k]
143 vst1.8 {q7}, [r0]!@ write 16 Y bytes for row 1 (narrowing/bias done in lines not shown)
147 @-------------------------------------------Y ------------------------------------------
149 @------------------------------------- row 2: Y only (UV is shared per 2x2 block) ------
151 pld [r5, #(CACHE_LINE_SIZE*PRE_LOAD_OFFSET)] @ prefetch second source row
152 vld4.8 {d0,d1,d2,d3}, [r5]! @ load 8 RGBA pixels of row 2, de-interleaving components
153 vld4.8 {d4,d5,d6,d7}, [r5]! @ load 8 more
168 vand.u16 q4,#0x00FF @ R
169 vand.u16 q5,#0x00FF @ G
170 vand.u16 q6,#0x00FF @ B
174 vmul.u16 q7,q4,q14 @ y  = 66  * R[k]
175 vmla.u16 q7,q5,q15 @ y += 129 * G[k]
176 vmla.u16 q7,q6,q8 @ y += 25  * B[k]
197 vmul.u16 q0,q4,q14 @ second batch: y  = 66  * R[k]
198 vmla.u16 q0,q5,q15 @ y += 129 * G[k]
199 vmla.u16 q0,q6,q8 @ y += 25  * B[k]
204 vst1.8 {q7}, [r4]!@ write 16 Y bytes for row 2
206 subs r8,r8,#16 @ 16 pixels consumed
207 BPL LOOP_NWIDTH16 @ loop while at least 16 pixels remain in this row pair
208 @-----------------------------------unaligned ---------------------------------------
210 adds r8,r8,#16 @ undo final -16: r8 = leftover pixels (0..15), processed 2 at a time
211 BEQ NO_UNALIGNED @ skip scalar tail when nWidth is a multiple of 16
213 @----------------------------------pDstRGB1--Y------------------------------------------
@ NOTE(review): the shift/mask comments below (0xF800, 0x07E0, >>10, >>5) look
@ inherited from an RGB565 variant of this routine; the data here is RGBA8888
@ and the actual mask lives in r6 (set in lines not shown). Confirm the
@ extraction against the RGBA byte layout before editing.
214 @stmfd sp!, {r14} @backup r14
217 ldr r9, [r2], #4 @ load pixel k   (one packed RGBA word)
218 ldr r12, [r2], #4 @ load pixel k+1
220 mov r10, r9,lsr #16 @ position a component of pixel k for masking
221 mov r14, r12 @ position a component of pixel k+1
224 and r10, r10, r6 @ isolate component with mask in r6 (stale RGB565 note removed)
226 and r14, r14, r6 @ isolate component of pixel k+1
229 mov r11, #66 @ Y accumulator += R*66 (mla in lines not shown)
232 mov r10, r9,lsr #8 @ position next component of pixel k
233 mov r14, r12,lsl #8 @ position next component of pixel k+1
241 mov r11, #129 @ Y accumulator += G*129
244 mov r10, r9 @ position next component of pixel k
245 mov r14, r12,lsl #16 @ position next component of pixel k+1
253 mov r11, #25 @ Y accumulator += B*25
263 @ldmfd sp!, {r14} @load r14
266 @----------------------------------pDstRGB2--UV------------------------------------------
268 mov r10, r9 @ reuse pixel k for U/V terms
275 mov r11, #112 @ U accumulator += B*112
279 mov r11, #18 @ V accumulator -= B*18
286 mov r10, r9, lsr #16 @ position component of pixel k
288 and r10, r10, r6 @ isolate component with mask in r6
290 mov r11, #38 @ U accumulator -= R*38
294 mov r11, #112 @ V accumulator += R*112
295 mla r12, r10, r11, r12 @ r12 += R*112
297 mov r10, r9,lsr #8 @ position component of pixel k
299 and r10, r10, r6 @ isolate component with mask in r6
301 mov r11, #74 @ U accumulator -= G*74
305 mov r11, #94 @ V accumulator -= G*94
314 @----------------------------------pDstRGB2--Y------------------------------------------
315 @stmfd sp!, {r14} @backup r14
318 ldr r9, [r5], #4 @ load row-2 pixel k
319 ldr r12, [r5], #4 @ load row-2 pixel k+1
321 mov r10, r9,lsr #16 @ position a component of pixel k
322 mov r14, r12 @ position a component of pixel k+1
325 and r10, r10, r6 @ isolate component with mask in r6
327 and r14, r14, r6 @ isolate component of pixel k+1
330 mov r11, #66 @ Y accumulator += R*66
333 mov r10, r9,lsr #8 @ position next component of pixel k
334 mov r14, r12,lsl #8 @ position next component of pixel k+1
342 mov r11, #129 @ Y accumulator += G*129
345 mov r10, r9 @ position next component of pixel k
346 mov r14, r12,lsl #16 @ position next component of pixel k+1
357 mov r11, #25 @ Y accumulator += B*25
367 @ldmfd sp!, {r14} @load r14
370 subs r8,r8,#2 @ nWidth2 -= 2 (tail handles two pixels per pass)
371 BGT LOOP_NWIDTH2 @ loop while tail pixels remain
374 NO_UNALIGNED: @ reached directly when nWidth is a multiple of 16
376 @-----------------------------------------------------------------------------
377 sub r8, r3, #16 @ reset r8: nWidthTmp = nWidth - 16 for the next row pair
378 add r0, r0, r3 @ pDstY    += nWidth  (skip over the row r4 just wrote)
379 add r2, r2, r3, lsl #2 @ pSrcRGB  += nWidth*4 (skip over the row r5 consumed)
380 add r4, r4, r3 @ pDstY2   += nWidth
381 add r5, r5, r3, lsl #2 @ pSrcRGB2 += nWidth*4
384 subs r12,r12,#2 @ nHeight -= 2 (two rows processed per pass)
385 BGT LOOP_NHEIGHT2 @ loop while rows remain
387 ldmfd sp!, {r4-r12,pc} @ restore registers and return (NOTE(review): the r12 pushed at .equ-time stmfd appears to be popped in lines not shown - confirm stack balance)