2 * Copyright 2012 The Android Open Source Project
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
8 #include "SkBlitRow_opts_arm_neon.h"
10 #include "SkBlitMask.h"
11 #include "SkBlitRow.h"
12 #include "SkColorPriv.h"
14 #include "SkMathPriv.h"
17 #include "SkColor_opts_neon.h"
// Loads 8 de-interleaved BGRA/RGBA pixels (32 bytes) from *src using the
// AArch64 ld4 instruction and advances the caller's src pointer by 8 pixels
// (post-increment #32 on a by-reference pointer argument).
// Only three of the four channel lanes (v0..v2) are copied out; v3 is loaded
// but discarded — this variant is for callers that ignore the alpha channel.
// NOTE(review): lines declaring the uint8x8x4_t result and closing the asm
// statement are not visible in this view; behavior beyond what is shown is
// unverified here.
21 static inline uint8x8x4_t sk_vld4_u8_arm64_3(const SkPMColor* SK_RESTRICT & src) {
23 uint8x8_t vsrc_0, vsrc_1, vsrc_2;
// ld4 de-interleaves: v0=channel0, v1=channel1, v2=channel2, v3=channel3.
26 "ld4 {v0.8b - v3.8b}, [%[src]], #32 \t\n"
27 "mov %[vsrc0].8b, v0.8b \t\n"
28 "mov %[vsrc1].8b, v1.8b \t\n"
29 "mov %[vsrc2].8b, v2.8b \t\n"
// src is "+&r": read-write and early-clobber, since it is post-incremented.
30 : [vsrc0] "=w" (vsrc_0), [vsrc1] "=w" (vsrc_1),
31 [vsrc2] "=w" (vsrc_2), [src] "+&r" (src)
// v0-v3 are clobbered scratch registers for the ld4 destination.
32 : : "v0", "v1", "v2", "v3"
// Same as sk_vld4_u8_arm64_3, but copies out all four channel lanes
// (including alpha) for callers that need per-pixel alpha, and advances the
// caller's src pointer by 8 pixels.
// NOTE(review): the result-struct assembly lines and the closing of the asm
// statement fall in gaps of this view.
42 static inline uint8x8x4_t sk_vld4_u8_arm64_4(const SkPMColor* SK_RESTRICT & src) {
44 uint8x8_t vsrc_0, vsrc_1, vsrc_2, vsrc_3;
// ld4 de-interleaves 8 pixels into four 8-byte channel vectors.
47 "ld4 {v0.8b - v3.8b}, [%[src]], #32 \t\n"
48 "mov %[vsrc0].8b, v0.8b \t\n"
49 "mov %[vsrc1].8b, v1.8b \t\n"
50 "mov %[vsrc2].8b, v2.8b \t\n"
51 "mov %[vsrc3].8b, v3.8b \t\n"
52 : [vsrc0] "=w" (vsrc_0), [vsrc1] "=w" (vsrc_1),
53 [vsrc2] "=w" (vsrc_2), [vsrc3] "=w" (vsrc_3),
55 : : "v0", "v1", "v2", "v3"
// Converts a row of opaque 32-bit premultiplied pixels to RGB565.
// alpha must be 255 (asserted); x/y are unused (no dithering here).
// Vector path handles 8 pixels at a time via SkPixel32ToPixel16_neon8;
// the scalar tail converts one pixel at a time.
// NOTE(review): the loop headers and count bookkeeping are not visible in
// this view — only the per-iteration bodies are shown.
67 void S32_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
68 const SkPMColor* SK_RESTRICT src, int count,
69 U8CPU alpha, int /*x*/, int /*y*/) {
70 SkASSERT(255 == alpha);
// ARM64 path: helper advances src by 8 pixels as a side effect.
78 vsrc = sk_vld4_u8_arm64_3(src);
// Non-ARM64 path: plain vld4 intrinsic.
80 vsrc = vld4_u8((uint8_t*)src);
// Pack 8 pixels of 8888 down to 565 in one shot.
85 vdst = SkPixel32ToPixel16_neon8(vsrc);
90 // Prepare next iteration
// Scalar tail: per-pixel 32->16 conversion.
99 *dst = SkPixel32ToPixel16_ToU16(c);
// Blends a row of 32-bit premultiplied source pixels onto an RGB565
// destination with a constant global alpha (0 < alpha < 255, asserted).
// Per channel: dst' = dst + ((src565 - dst) * scale) >> 8, where scale is
// SkAlpha255To256(alpha). Vector path does 8 pixels/iteration; a scalar
// do/while handles the tail.
105 void S32_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
106 const SkPMColor* SK_RESTRICT src, int count,
107 U8CPU alpha, int /*x*/, int /*y*/) {
108 SkASSERT(255 > alpha);
110 uint16x8_t vmask_blue, vscale;
// scale in [1,256]; 0x1F masks the low 5 blue bits of a 565 pixel.
113 vscale = vdupq_n_u16(SkAlpha255To256(alpha));
114 vmask_blue = vmovq_n_u16(0x1F);
118 uint16x8_t vdst, vdst_r, vdst_g, vdst_b;
119 uint16x8_t vres_r, vres_g, vres_b;
// ARM64: helper loads 8 pixels and advances src.
123 vsrc = sk_vld4_u8_arm64_3(src);
// ARM32: force the vld4 destination into d0-d3 via explicit register vars.
126 register uint8x8_t d0 asm("d0");
127 register uint8x8_t d1 asm("d1");
128 register uint8x8_t d2 asm("d2");
129 register uint8x8_t d3 asm("d3");
132 "vld4.8 {d0-d3},[%[src]]!"
133 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
142 // Load and unpack dst
143 vdst = vld1q_u16(dst);
144 vdst_g = vshlq_n_u16(vdst, 5); // shift green to top of lanes
145 vdst_b = vandq_u16(vdst, vmask_blue); // extract blue
146 vdst_r = vshrq_n_u16(vdst, 6+5); // extract red
147 vdst_g = vshrq_n_u16(vdst_g, 5+5); // extract green
149 // Shift src to 565 range
150 vsrc.val[NEON_R] = vshr_n_u8(vsrc.val[NEON_R], 3);
151 vsrc.val[NEON_G] = vshr_n_u8(vsrc.val[NEON_G], 2);
152 vsrc.val[NEON_B] = vshr_n_u8(vsrc.val[NEON_B], 3);
// Signed-difference trick in unsigned lanes: (src - dst) may wrap, but the
// subsequent multiply/shift yields the correct blended result modulo 2^16.
155 vres_r = vmovl_u8(vsrc.val[NEON_R]) - vdst_r;
156 vres_g = vmovl_u8(vsrc.val[NEON_G]) - vdst_g;
157 vres_b = vmovl_u8(vsrc.val[NEON_B]) - vdst_b;
159 vres_r = vshrq_n_u16(vres_r * vscale, 8);
160 vres_g = vshrq_n_u16(vres_g * vscale, 8);
161 vres_b = vshrq_n_u16(vres_b * vscale, 8);
// NOTE(review): the "add dst back" step expected between the scale and the
// 565 repack falls in a gap of this view.
168 vres_b = vsliq_n_u16(vres_b, vres_g, 5); // insert green into blue
169 vres_b = vsliq_n_u16(vres_b, vres_r, 6+5); // insert red into green/blue
172 vst1q_u16(dst, vres_b);
// Scalar tail: classic per-channel SkAlphaBlend in 565 space.
177 int scale = SkAlpha255To256(alpha);
179 SkPMColor c = *src++;
182 *dst++ = SkPackRGB16(
183 SkAlphaBlend(SkPacked32ToR16(c), SkGetPackedR16(d), scale),
184 SkAlphaBlend(SkPacked32ToG16(c), SkGetPackedG16(d), scale),
185 SkAlphaBlend(SkPacked32ToB16(c), SkGetPackedB16(d), scale));
186 } while (--count != 0);
// ARM32 (A32/Thumb-2) version: source-over blends a row of premultiplied
// 8888 pixels onto an RGB565 destination. alpha must be 255 (per-pixel
// alpha from src is used instead). Entirely hand-written inline assembly:
// a first asm block handles count%8 leading pixels plus the main 8-pixel
// loop, and a second asm block handles a residual count < 8 with lane loads.
// NOTE(review): several asm lines (labels, branches, 'it' instructions,
// closing braces) fall in gaps of this view — the control flow documented
// here is inferred only from the visible instructions.
191 void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
192 const SkPMColor* SK_RESTRICT src, int count,
193 U8CPU alpha, int /*x*/, int /*y*/) {
194 SkASSERT(255 == alpha);
// keep_dst trails dst by one iteration so stores happen after the loads of
// the next batch (software pipelining).
197 uint16_t* SK_RESTRICT keep_dst = 0;
// ip = count & 7: number of pixels in the ragged head.
200 "ands ip, %[count], #7 \n\t"
// d31 = 0x80 per lane: rounding bias for the >>8 divide-by-255 approximation.
201 "vmov.u8 d31, #1<<7 \n\t"
202 "vld1.16 {q12}, [%[dst]] \n\t"
203 "vld4.8 {d0-d3}, [%[src]] \n\t"
204 // Thumb does not support the standard ARM conditional
205 // instructions but instead requires the 'it' instruction
206 // to signal conditional execution
209 "mov %[keep_dst], %[dst] \n\t"
// Skip the ragged head: advance src by ip pixels (x4 bytes) and dst (x2).
211 "add %[src], %[src], ip, LSL#2 \n\t"
212 "add %[dst], %[dst], ip, LSL#1 \n\t"
213 "subs %[count], %[count], ip \n\t"
// Main loop: load next dst/src batch, store previous result via keep_dst.
218 "vld1.16 {q12}, [%[dst]]! \n\t"
219 "vld4.8 {d0-d3}, [%[src]]! \n\t"
220 "vst1.16 {q10}, [%[keep_dst]] \n\t"
221 "sub %[keep_dst], %[dst], #8*2 \n\t"
222 "subs %[count], %[count], #8 \n\t"
224 "pld [%[dst],#32] \n\t"
225 // expand 0565 q12 to 8888 {d4-d7}
226 "vmovn.u16 d4, q12 \n\t"
227 "vshr.u16 q11, q12, #5 \n\t"
228 "vshr.u16 q10, q12, #6+5 \n\t"
229 "vmovn.u16 d5, q11 \n\t"
230 "vmovn.u16 d6, q10 \n\t"
231 "vshl.u8 d4, d4, #3 \n\t"
232 "vshl.u8 d5, d5, #2 \n\t"
233 "vshl.u8 d6, d6, #3 \n\t"
// Seed the three accumulators with the 0x80 rounding bias.
235 "vmovl.u8 q14, d31 \n\t"
236 "vmovl.u8 q13, d31 \n\t"
237 "vmovl.u8 q12, d31 \n\t"
239 // duplicate in 4/2/1 & 8pix vsns
// d30 = 255 - src_alpha; accumulate dst*(255-a) then ((x + x>>8) >> 8)
// approximates division by 255 (the vaddhn pairs below).
240 "vmvn.8 d30, d3 \n\t"
241 "vmlal.u8 q14, d30, d6 \n\t"
242 "vmlal.u8 q13, d30, d5 \n\t"
243 "vmlal.u8 q12, d30, d4 \n\t"
244 "vshr.u16 q8, q14, #5 \n\t"
245 "vshr.u16 q9, q13, #6 \n\t"
246 "vaddhn.u16 d6, q14, q8 \n\t"
247 "vshr.u16 q8, q12, #5 \n\t"
248 "vaddhn.u16 d5, q13, q9 \n\t"
249 "vqadd.u8 d6, d6, d0 \n\t" // moved up
250 "vaddhn.u16 d4, q12, q8 \n\t"
251 // intentionally don't calculate alpha
// Saturating add of premultiplied src channels completes src-over.
254 "vqadd.u8 d5, d5, d1 \n\t"
255 "vqadd.u8 d4, d4, d2 \n\t"
257 // pack 8888 {d4-d6} to 0565 q10
258 "vshll.u8 q10, d6, #8 \n\t"
259 "vshll.u8 q3, d5, #8 \n\t"
260 "vshll.u8 q2, d4, #8 \n\t"
261 "vsri.u16 q10, q3, #5 \n\t"
262 "vsri.u16 q10, q2, #11 \n\t"
// Drain the pipelined final store.
267 "vst1.16 {q10}, [%[keep_dst]] \n\t"
268 : [count] "+r" (count)
269 : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src)
270 : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
271 "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
276 { // handle count < 8
277 uint16_t* SK_RESTRICT keep_dst = 0;
280 "vmov.u8 d31, #1<<7 \n\t"
281 "mov %[keep_dst], %[dst] \n\t"
// Conditionally load 4 / 2 / 1 pixels into specific lanes based on the bits
// of count; together they assemble up to 7 pixels in q0/q1 and d24/d25.
283 "tst %[count], #4 \n\t"
285 "vld1.16 {d25}, [%[dst]]! \n\t"
286 "vld1.32 {q1}, [%[src]]! \n\t"
289 "tst %[count], #2 \n\t"
291 "vld1.32 {d24[1]}, [%[dst]]! \n\t"
292 "vld1.32 {d1}, [%[src]]! \n\t"
295 "tst %[count], #1 \n\t"
297 "vld1.16 {d24[1]}, [%[dst]]! \n\t"
298 "vld1.32 {d0[1]}, [%[src]]! \n\t"
301 // unzips achieve the same as a vld4 operation
302 "vuzp.u16 q0, q1 \n\t"
303 "vuzp.u8 d0, d1 \n\t"
304 "vuzp.u8 d2, d3 \n\t"
305 // expand 0565 q12 to 8888 {d4-d7}
306 "vmovn.u16 d4, q12 \n\t"
307 "vshr.u16 q11, q12, #5 \n\t"
308 "vshr.u16 q10, q12, #6+5 \n\t"
309 "vmovn.u16 d5, q11 \n\t"
310 "vmovn.u16 d6, q10 \n\t"
311 "vshl.u8 d4, d4, #3 \n\t"
312 "vshl.u8 d5, d5, #2 \n\t"
313 "vshl.u8 d6, d6, #3 \n\t"
315 "vmovl.u8 q14, d31 \n\t"
316 "vmovl.u8 q13, d31 \n\t"
317 "vmovl.u8 q12, d31 \n\t"
319 // duplicate in 4/2/1 & 8pix vsns
// Same blend math as the main loop above.
320 "vmvn.8 d30, d3 \n\t"
321 "vmlal.u8 q14, d30, d6 \n\t"
322 "vmlal.u8 q13, d30, d5 \n\t"
323 "vmlal.u8 q12, d30, d4 \n\t"
324 "vshr.u16 q8, q14, #5 \n\t"
325 "vshr.u16 q9, q13, #6 \n\t"
326 "vaddhn.u16 d6, q14, q8 \n\t"
327 "vshr.u16 q8, q12, #5 \n\t"
328 "vaddhn.u16 d5, q13, q9 \n\t"
329 "vqadd.u8 d6, d6, d0 \n\t" // moved up
330 "vaddhn.u16 d4, q12, q8 \n\t"
331 // intentionally don't calculate alpha
334 "vqadd.u8 d5, d5, d1 \n\t"
335 "vqadd.u8 d4, d4, d2 \n\t"
337 // pack 8888 {d4-d6} to 0565 q10
338 "vshll.u8 q10, d6, #8 \n\t"
339 "vshll.u8 q3, d5, #8 \n\t"
340 "vshll.u8 q2, d4, #8 \n\t"
341 "vsri.u16 q10, q3, #5 \n\t"
342 "vsri.u16 q10, q2, #11 \n\t"
// Conditionally store back 4 / 2 / 1 results, mirroring the lane loads.
345 "tst %[count], #4 \n\t"
347 "vst1.16 {d21}, [%[keep_dst]]! \n\t"
350 "tst %[count], #2 \n\t"
352 "vst1.32 {d20[1]}, [%[keep_dst]]! \n\t"
355 "tst %[count], #1 \n\t"
357 "vst1.16 {d20[1]}, [%[keep_dst]]! \n\t"
360 : [count] "+r" (count)
361 : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src)
362 : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
363 "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
369 #else // #ifdef SK_CPU_ARM32
// AArch64 version: source-over blends premultiplied 8888 src onto RGB565
// dst, 16 pixels per iteration of the inline-asm loop. alpha must be 255
// (per-pixel src alpha drives the blend). A scalar do/while handles the
// remainder via SkSrcOver32To16.
// NOTE(review): loop labels/branches, the #endif for the byte-order check,
// and the closing of the asm/loop fall in gaps of this view.
371 void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
372 const SkPMColor* SK_RESTRICT src, int count,
373 U8CPU alpha, int /*x*/, int /*y*/) {
374 SkASSERT(255 == alpha);
// v4 = 0x80 per 16-bit lane: rounding bias for the /255 approximation.
378 "movi v4.8h, #0x80 \t\n"
381 "sub %[count], %[count], #16 \t\n"
// Load 16 dst 565 pixels and 16 de-interleaved src pixels.
382 "ld1 {v16.8h-v17.8h}, [%[dst]] \t\n"
383 "ld4 {v0.16b-v3.16b}, [%[src]], #64 \t\n"
384 "prfm pldl1keep, [%[src],#512] \t\n"
385 "prfm pldl1keep, [%[dst],#256] \t\n"
// Expand dst 565 -> 888: green (>>5 then <<2), red (>>11 then <<3),
// blue (low bits then <<3).
386 "ushr v20.8h, v17.8h, #5 \t\n"
387 "ushr v31.8h, v16.8h, #5 \t\n"
388 "xtn v6.8b, v31.8h \t\n"
389 "xtn2 v6.16b, v20.8h \t\n"
390 "ushr v20.8h, v17.8h, #11 \t\n"
391 "shl v19.16b, v6.16b, #2 \t\n"
392 "ushr v31.8h, v16.8h, #11 \t\n"
393 "xtn v22.8b, v31.8h \t\n"
394 "xtn2 v22.16b, v20.8h \t\n"
395 "shl v18.16b, v22.16b, #3 \t\n"
// v3 = 255 - src_alpha (inverted alpha for the dst term).
396 "mvn v3.16b, v3.16b \t\n"
397 "xtn v16.8b, v16.8h \t\n"
398 "mov v7.16b, v4.16b \t\n"
399 "xtn2 v16.16b, v17.8h \t\n"
// umlal/umlal2 accumulate dst_channel * (255-a) + 0x80; the ushr/addhn
// pairs then compute ((x + x>>8) >> 8), the standard /255 approximation
// (shift counts differ per channel because of 5/6/5 bit widths).
400 "umlal v7.8h, v3.8b, v19.8b \t\n"
401 "shl v16.16b, v16.16b, #3 \t\n"
402 "mov v22.16b, v4.16b \t\n"
403 "ushr v24.8h, v7.8h, #6 \t\n"
404 "umlal v22.8h, v3.8b, v18.8b \t\n"
405 "ushr v20.8h, v22.8h, #5 \t\n"
406 "addhn v20.8b, v22.8h, v20.8h \t\n"
407 "cmp %[count], #16 \t\n"
408 "mov v6.16b, v4.16b \t\n"
409 "mov v5.16b, v4.16b \t\n"
410 "umlal v6.8h, v3.8b, v16.8b \t\n"
411 "umlal2 v5.8h, v3.16b, v19.16b \t\n"
412 "mov v17.16b, v4.16b \t\n"
413 "ushr v19.8h, v6.8h, #5 \t\n"
414 "umlal2 v17.8h, v3.16b, v18.16b \t\n"
415 "addhn v7.8b, v7.8h, v24.8h \t\n"
416 "ushr v18.8h, v5.8h, #6 \t\n"
417 "ushr v21.8h, v17.8h, #5 \t\n"
418 "addhn2 v7.16b, v5.8h, v18.8h \t\n"
419 "addhn2 v20.16b, v17.8h, v21.8h \t\n"
420 "mov v22.16b, v4.16b \t\n"
421 "addhn v6.8b, v6.8h, v19.8h \t\n"
422 "umlal2 v22.8h, v3.16b, v16.16b \t\n"
423 "ushr v5.8h, v22.8h, #5 \t\n"
424 "addhn2 v6.16b, v22.8h, v5.8h \t\n"
// Saturating add of the premultiplied src channels completes src-over;
// which src register is R vs B depends on the pixel byte order.
425 "uqadd v7.16b, v1.16b, v7.16b \t\n"
426 #if SK_PMCOLOR_BYTE_ORDER(B,G,R,A)
427 "uqadd v20.16b, v2.16b, v20.16b \t\n"
428 "uqadd v6.16b, v0.16b, v6.16b \t\n"
429 #elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A)
430 "uqadd v20.16b, v0.16b, v20.16b \t\n"
431 "uqadd v6.16b, v2.16b, v6.16b \t\n"
433 #error "This function only supports BGRA and RGBA."
// Repack 888 -> 565 with shll (widen<<8) + sri (shift-right-insert).
435 "shll v22.8h, v20.8b, #8 \t\n"
436 "shll v5.8h, v7.8b, #8 \t\n"
437 "sri v22.8h, v5.8h, #5 \t\n"
438 "shll v17.8h, v6.8b, #8 \t\n"
439 "shll2 v23.8h, v20.16b, #8 \t\n"
440 "shll2 v7.8h, v7.16b, #8 \t\n"
441 "sri v22.8h, v17.8h, #11 \t\n"
442 "sri v23.8h, v7.8h, #5 \t\n"
443 "shll2 v6.8h, v6.16b, #8 \t\n"
444 "st1 {v22.8h}, [%[dst]], #16 \t\n"
445 "sri v23.8h, v6.8h, #11 \t\n"
446 "st1 {v23.8h}, [%[dst]], #16 \t\n"
448 : [dst] "+&r" (dst), [src] "+&r" (src), [count] "+&r" (count)
449 :: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
450 "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24",
// Scalar tail for the remaining (count % 16) pixels.
457 SkPMColor c = *src++;
460 *dst = SkSrcOver32To16(c, *dst);
463 } while (--count != 0);
466 #endif // #ifdef SK_CPU_ARM32
// Rounded division by 255 of eight 16-bit lanes:
// (prod + 128 + ((prod + 128) >> 8)) >> 8 == (prod + 127.5...) / 255,
// the classic exact-for-16-bit approximation of SkDiv255Round.
468 static inline uint16x8_t SkDiv255Round_neon8(uint16x8_t prod) {
469 prod += vdupq_n_u16(128);
470 prod += vshrq_n_u16(prod, 8);
471 return vshrq_n_u16(prod, 8);
// Blends premultiplied 8888 src onto RGB565 dst with both a per-pixel src
// alpha and a constant global alpha (< 255, asserted).
// dst_scale = 255 - (src_alpha * alpha)/255; result = src*alpha + dst*dst_scale,
// approximated with rounding shifts unless S32A_D565_BLEND_EXACT is defined.
474 void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
475 const SkPMColor* SK_RESTRICT src, int count,
476 U8CPU alpha, int /*x*/, int /*y*/) {
477 SkASSERT(255 > alpha);
479 /* This code implements a Neon version of S32A_D565_Blend. The results have
480 * a few mismatches compared to the original code. These mismatches never
485 uint16x8_t valpha_max, vmask_blue;
489 valpha_max = vmovq_n_u16(255);
490 valpha = vdup_n_u8(alpha);
491 vmask_blue = vmovq_n_u16(SK_B16_MASK);
494 uint16x8_t vdst, vdst_r, vdst_g, vdst_b;
495 uint16x8_t vres_a, vres_r, vres_g, vres_b;
499 vdst = vld1q_u16(dst);
// ARM64 path: helper loads all four channels and advances src.
501 vsrc = sk_vld4_u8_arm64_4(src);
// GCC > 4.6 accepts the %h operand modifier for a d-register quad.
503 #if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6))
505 "vld4.u8 %h[vsrc], [%[src]]!"
506 : [vsrc] "=w" (vsrc), [src] "+&r" (src)
// Older compilers: pin the destination to d0-d3 explicitly.
510 register uint8x8_t d0 asm("d0");
511 register uint8x8_t d1 asm("d1");
512 register uint8x8_t d2 asm("d2");
513 register uint8x8_t d3 asm("d3");
516 "vld4.u8 {d0-d3},[%[src]]!;"
517 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3),
526 #endif // #ifdef SK_CPU_ARM64
// Unpack dst 565 into separate 16-bit channel vectors.
530 vdst_g = vshlq_n_u16(vdst, SK_R16_BITS); // shift green to top of lanes
531 vdst_b = vdst & vmask_blue; // extract blue
532 vdst_r = vshrq_n_u16(vdst, SK_R16_SHIFT); // extract red
533 vdst_g = vshrq_n_u16(vdst_g, SK_R16_BITS + SK_B16_BITS); // extract green
// Reduce src channels to 565 bit widths before scaling.
536 vsrc.val[NEON_R] = vshr_n_u8(vsrc.val[NEON_R], 8 - SK_R16_BITS);
537 vsrc.val[NEON_G] = vshr_n_u8(vsrc.val[NEON_G], 8 - SK_G16_BITS);
538 vsrc.val[NEON_B] = vshr_n_u8(vsrc.val[NEON_B], 8 - SK_B16_BITS);
540 // calc src * src_scale
541 vres_a = vmull_u8(vsrc.val[NEON_A], valpha);
542 vres_r = vmull_u8(vsrc.val[NEON_R], valpha);
543 vres_g = vmull_u8(vsrc.val[NEON_G], valpha);
544 vres_b = vmull_u8(vsrc.val[NEON_B], valpha);
// dst_scale = 255 - round(sa * alpha / 255).
547 vres_a = SkDiv255Round_neon8(vres_a);
548 vres_a = valpha_max - vres_a; // 255 - (sa * src_scale) / 255
550 // add dst * dst_scale to previous result
551 vres_r = vmlaq_u16(vres_r, vdst_r, vres_a);
552 vres_g = vmlaq_u16(vres_g, vdst_g, vres_a);
553 vres_b = vmlaq_u16(vres_b, vdst_b, vres_a);
555 #ifdef S32A_D565_BLEND_EXACT
556 // It is possible to get exact results with this but it is slow,
557 // even slower than C code in some cases
558 vres_r = SkDiv255Round_neon8(vres_r);
559 vres_g = SkDiv255Round_neon8(vres_g);
560 vres_b = SkDiv255Round_neon8(vres_b);
// Fast path: rounding shift by 8 approximates /255 (source of the
// documented small mismatches vs. the scalar code).
562 vres_r = vrshrq_n_u16(vres_r, 8);
563 vres_g = vrshrq_n_u16(vres_g, 8);
564 vres_b = vrshrq_n_u16(vres_b, 8);
// Repack channels into 565 with shift-left-insert.
567 vres_b = vsliq_n_u16(vres_b, vres_g, SK_G16_SHIFT); // insert green into blue
568 vres_b = vsliq_n_u16(vres_b, vres_r, SK_R16_SHIFT); // insert red into green/blue
571 vst1q_u16(dst, vres_b);
574 } while (count >= 8);
// Scalar tail: exact SkDiv255Round math, one pixel at a time.
578 while (count-- > 0) {
579 SkPMColor sc = *src++;
582 unsigned dst_scale = 255 - SkMulDiv255Round(SkGetPackedA32(sc), alpha);
583 unsigned dr = SkMulS16(SkPacked32ToR16(sc), alpha) + SkMulS16(SkGetPackedR16(dc), dst_scale);
584 unsigned dg = SkMulS16(SkPacked32ToG16(sc), alpha) + SkMulS16(SkGetPackedG16(dc), dst_scale);
585 unsigned db = SkMulS16(SkPacked32ToB16(sc), alpha) + SkMulS16(SkGetPackedB16(dc), dst_scale);
586 *dst = SkPackRGB16(SkDiv255Round(dr), SkDiv255Round(dg), SkDiv255Round(db));
592 /* dither matrix for Neon, derived from gDitherMatrix_3Bit_16.
593 * each dither value is spaced out into byte lanes, and repeated
594 * to allow an 8-byte load from offsets 0, 1, 2 or 3 from the
// 4 rows x 12 bytes: each 4x4 dither row repeated 3x so any (x&3) offset
// still yields 8 contiguous in-pattern bytes for vld1_u8.
597 static const uint8_t gDitherMatrix_Neon[48] = {
598 0, 4, 1, 5, 0, 4, 1, 5, 0, 4, 1, 5,
599 6, 2, 7, 3, 6, 2, 7, 3, 6, 2, 7, 3,
600 1, 5, 0, 4, 1, 5, 0, 4, 1, 5, 0, 4,
601 7, 3, 6, 2, 7, 3, 6, 2, 7, 3, 6, 2,
// Blends opaque 8888 src onto RGB565 dst with a constant alpha (< 255,
// asserted) and ordered dithering selected by (x, y). Vector path processes
// 8 pixels per iteration; scalar tail uses DITHER_VALUE/SkDITHER_*To565.
605 void S32_D565_Blend_Dither_neon(uint16_t *dst, const SkPMColor *src,
606 int count, U8CPU alpha, int x, int y)
609 SkASSERT(255 > alpha);
611 // rescale alpha to range 1 - 256
612 int scale = SkAlpha255To256(alpha);
615 /* select row and offset for dither array */
616 const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
618 uint8x8_t vdither = vld1_u8(dstart); // load dither values
619 uint8x8_t vdither_g = vshr_n_u8(vdither, 1); // calc. green dither values
621 int16x8_t vscale = vdupq_n_s16(scale); // duplicate scale into neon reg
622 uint16x8_t vmask_b = vdupq_n_u16(0x1F); // set up blue mask
627 uint8x8_t vsrc_r, vsrc_g, vsrc_b;
628 uint8x8_t vsrc565_r, vsrc565_g, vsrc565_b;
629 uint16x8_t vsrc_dit_r, vsrc_dit_g, vsrc_dit_b;
630 uint16x8_t vsrc_res_r, vsrc_res_g, vsrc_res_b;
632 uint16x8_t vdst_r, vdst_g, vdst_b;
633 int16x8_t vres_r, vres_g, vres_b;
634 int8x8_t vres8_r, vres8_g, vres8_b;
636 // Load source and add dither
// ARM64: helper loads 8 pixels and advances src.
638 vsrc = sk_vld4_u8_arm64_3(src);
// ARM32: explicit d0-d3 register pinning for the vld4.
641 register uint8x8_t d0 asm("d0");
642 register uint8x8_t d1 asm("d1");
643 register uint8x8_t d2 asm("d2");
644 register uint8x8_t d3 asm("d3");
647 "vld4.8 {d0-d3},[%[src]]! "
648 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
656 vsrc_r = vsrc.val[NEON_R];
657 vsrc_g = vsrc.val[NEON_G];
658 vsrc_b = vsrc.val[NEON_B];
// Dither formula per channel: (c + dither - (c >> hi_bits)) >> drop_bits,
// matching SkDITHER_*32To565.
660 vsrc565_g = vshr_n_u8(vsrc_g, 6); // calc. green >> 6
661 vsrc565_r = vshr_n_u8(vsrc_r, 5); // calc. red >> 5
662 vsrc565_b = vshr_n_u8(vsrc_b, 5); // calc. blue >> 5
664 vsrc_dit_g = vaddl_u8(vsrc_g, vdither_g); // add in dither to green and widen
665 vsrc_dit_r = vaddl_u8(vsrc_r, vdither); // add in dither to red and widen
666 vsrc_dit_b = vaddl_u8(vsrc_b, vdither); // add in dither to blue and widen
668 vsrc_dit_r = vsubw_u8(vsrc_dit_r, vsrc565_r); // sub shifted red from result
669 vsrc_dit_g = vsubw_u8(vsrc_dit_g, vsrc565_g); // sub shifted green from result
670 vsrc_dit_b = vsubw_u8(vsrc_dit_b, vsrc565_b); // sub shifted blue from result
672 vsrc_res_r = vshrq_n_u16(vsrc_dit_r, 3);
673 vsrc_res_g = vshrq_n_u16(vsrc_dit_g, 2);
674 vsrc_res_b = vshrq_n_u16(vsrc_dit_b, 3);
676 // Load dst and unpack
677 vdst = vld1q_u16(dst);
678 vdst_g = vshrq_n_u16(vdst, 5); // shift down to get green
679 vdst_r = vshrq_n_u16(vshlq_n_u16(vdst, 5), 5+5); // double shift to extract red
680 vdst_b = vandq_u16(vdst, vmask_b); // mask to get blue
682 // subtract dst from src and widen
683 vres_r = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_r), vreinterpretq_s16_u16(vdst_r));
684 vres_g = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_g), vreinterpretq_s16_u16(vdst_g));
685 vres_b = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_b), vreinterpretq_s16_u16(vdst_b));
687 // multiply diffs by scale and shift
// Signed math so negative (src-dst) diffs scale correctly.
688 vres_r = vmulq_s16(vres_r, vscale);
689 vres_g = vmulq_s16(vres_g, vscale);
690 vres_b = vmulq_s16(vres_b, vscale);
692 vres8_r = vshrn_n_s16(vres_r, 8);
693 vres8_g = vshrn_n_s16(vres_g, 8);
694 vres8_b = vshrn_n_s16(vres_b, 8);
// dst + ((src - dst) * scale >> 8): the standard lerp formulation.
697 vres_r = vaddw_s8(vreinterpretq_s16_u16(vdst_r), vres8_r);
698 vres_g = vaddw_s8(vreinterpretq_s16_u16(vdst_g), vres8_g);
699 vres_b = vaddw_s8(vreinterpretq_s16_u16(vdst_b), vres8_b);
701 // put result into 565 format
702 vres_b = vsliq_n_s16(vres_b, vres_g, 5); // shift up green and insert into blue
703 vres_b = vsliq_n_s16(vres_b, vres_r, 6+5); // shift up red and insert into blue
706 vst1q_u16(dst, vreinterpretq_u16_s16(vres_b));
712 } while (count >= 8);
// Scalar tail with the same dither + blend math.
717 int scale = SkAlpha255To256(alpha);
720 SkPMColor c = *src++;
723 int dither = DITHER_VALUE(x);
724 int sr = SkGetPackedR32(c);
725 int sg = SkGetPackedG32(c);
726 int sb = SkGetPackedB32(c);
727 sr = SkDITHER_R32To565(sr, dither);
728 sg = SkDITHER_G32To565(sg, dither);
729 sb = SkDITHER_B32To565(sb, dither);
732 *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), scale),
733 SkAlphaBlend(sg, SkGetPackedG16(d), scale),
734 SkAlphaBlend(sb, SkGetPackedB16(d), scale));
736 } while (--count != 0);
// Source-over blends a row of premultiplied 8888 pixels onto an 8888 dst.
// alpha must be 255 (per-pixel src alpha is used). The NEON loop is
// unrolled by UNROLL (presumably 4 pixels: two 2-pixel d-register halves —
// TODO confirm UNROLL's definition, which is outside this view); the scalar
// tail uses SkPMSrcOver.
740 void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
741 const SkPMColor* SK_RESTRICT src,
742 int count, U8CPU alpha) {
744 SkASSERT(255 == alpha);
// vtbl mask broadcasting byte 3 / byte 7 (the alpha bytes of two pixels)
// across each pixel's four lanes.
748 uint8x8_t alpha_mask;
750 static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
751 alpha_mask = vld1_u8(alpha_mask_setup);
753 /* do the NEON unrolled code */
755 while (count >= UNROLL) {
756 uint8x8_t src_raw, dst_raw, dst_final;
757 uint8x8_t src_raw_2, dst_raw_2, dst_final_2;
759 /* The two prefetches below may make the code slighlty
760 * slower for small values of count but are worth having
761 * in the general case.
763 __builtin_prefetch(src+32);
764 __builtin_prefetch(dst+32);
// First pair of pixels, then second pair.
767 src_raw = vreinterpret_u8_u32(vld1_u32(src));
769 src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));
772 /* get and hold the dst too */
773 dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
775 dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));
778 /* 1st and 2nd bits of the unrolling */
780 uint8x8_t dst_cooked;
782 uint8x8_t alpha_narrow;
783 uint16x8_t alpha_wide;
785 /* get the alphas spread out properly */
786 alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
// 256 - a collapses SkAlpha255To256(255 - a); >>8 below completes the scale.
787 alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
789 /* spread the dest */
790 dst_wide = vmovl_u8(dst_raw);
792 /* alpha mul the dest */
793 dst_wide = vmulq_u16 (dst_wide, alpha_wide);
794 dst_cooked = vshrn_n_u16(dst_wide, 8);
796 /* sum -- ignoring any byte lane overflows */
797 dst_final = vadd_u8(src_raw, dst_cooked);
801 /* the 3rd and 4th bits of our unrolling */
803 uint8x8_t dst_cooked;
805 uint8x8_t alpha_narrow;
806 uint16x8_t alpha_wide;
808 alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
809 alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
811 /* spread the dest */
812 dst_wide = vmovl_u8(dst_raw_2);
814 /* alpha mul the dest */
815 dst_wide = vmulq_u16 (dst_wide, alpha_wide);
816 dst_cooked = vshrn_n_u16(dst_wide, 8);
818 /* sum -- ignoring any byte lane overflows */
819 dst_final_2 = vadd_u8(src_raw_2, dst_cooked);
823 vst1_u32(dst, vreinterpret_u32_u8(dst_final));
825 vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));
834 /* do any residual iterations */
835 while (--count >= 0) {
836 *dst = SkPMSrcOver(*src, *dst);
// Variant of S32A_Opaque_BlitRow32 that adaptively switches between three
// modes based on src alpha runs: fully-transparent pixels (skip), fully-
// opaque pixels (plain copy), and the general NEON blend. The mode
// transitions are goto-based; the goto labels themselves fall in gaps of
// this view, so the exact jump targets documented here are inferred.
843 void S32A_Opaque_BlitRow32_neon_src_alpha(SkPMColor* SK_RESTRICT dst,
844 const SkPMColor* SK_RESTRICT src,
845 int count, U8CPU alpha) {
846 SkASSERT(255 == alpha);
851 /* Use these to check if src is transparent or opaque */
// a pixel is opaque iff p >= 0xFF000000, transparent iff p <= 0x00FFFFFF
// (alpha is the top byte in SkPMColor's packed representation).
852 const unsigned int ALPHA_OPAQ = 0xFF000000;
853 const unsigned int ALPHA_TRANS = 0x00FFFFFF;
// Stop UNROLL+1 pixels early so lookahead reads below stay in bounds.
856 const SkPMColor* SK_RESTRICT src_end = src + count - (UNROLL + 1);
857 const SkPMColor* SK_RESTRICT src_temp = src;
859 /* set up the NEON variables */
860 uint8x8_t alpha_mask;
861 static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
862 alpha_mask = vld1_u8(alpha_mask_setup);
864 uint8x8_t src_raw, dst_raw, dst_final;
865 uint8x8_t src_raw_2, dst_raw_2, dst_final_2;
866 uint8x8_t dst_cooked;
868 uint8x8_t alpha_narrow;
869 uint16x8_t alpha_wide;
871 /* choose the first processing type */
874 if(*src <= ALPHA_TRANS)
876 if(*src >= ALPHA_OPAQ)
// --- general blend mode: 4 pixels per pass (two 2-pixel halves) ---
884 src_raw = vreinterpret_u8_u32(vld1_u32(src));
885 src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));
887 /* get and hold the dst too */
888 dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
889 dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));
892 /* get the alphas spread out properly */
893 alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
894 /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
895 /* we collapsed (255-a)+1 ... */
896 alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
898 /* spread the dest */
899 dst_wide = vmovl_u8(dst_raw);
901 /* alpha mul the dest */
902 dst_wide = vmulq_u16 (dst_wide, alpha_wide);
903 dst_cooked = vshrn_n_u16(dst_wide, 8);
905 /* sum -- ignoring any byte lane overflows */
906 dst_final = vadd_u8(src_raw, dst_cooked);
908 alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
909 /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
910 /* we collapsed (255-a)+1 ... */
911 alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
913 /* spread the dest */
914 dst_wide = vmovl_u8(dst_raw_2);
916 /* alpha mul the dest */
917 dst_wide = vmulq_u16 (dst_wide, alpha_wide);
918 dst_cooked = vshrn_n_u16(dst_wide, 8);
920 /* sum -- ignoring any byte lane overflows */
921 dst_final_2 = vadd_u8(src_raw_2, dst_cooked);
923 vst1_u32(dst, vreinterpret_u32_u8(dst_final));
924 vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));
929 /* if 2 of the next pixels aren't between 1 and 254
930 it might make sense to go to the optimized loops */
931 if((src[0] <= ALPHA_TRANS && src[1] <= ALPHA_TRANS) || (src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ))
934 } while(src < src_end);
939 if(src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ)
// --- transparent-run mode ---
946 /*In this state, we know the current alpha is 0 and
947 we optimize for the next alpha also being zero. */
948 src_temp = src; //so we don't have to increment dst every time
// 4x unrolled scan: break out at the first non-transparent pixel.
950 if(*(++src) > ALPHA_TRANS)
952 if(*(++src) > ALPHA_TRANS)
954 if(*(++src) > ALPHA_TRANS)
956 if(*(++src) > ALPHA_TRANS)
958 } while(src < src_end);
// Catch dst up by however many pixels were skipped.
960 dst += (src - src_temp);
962 /* no longer alpha 0, so determine where to go next. */
965 if(*src >= ALPHA_OPAQ)
// --- opaque-run mode: straight copies while 4 consecutive pixels opaque ---
// AND-ing four pixels >= 0xFF000000 iff all four alpha bytes are 0xFF.
971 while((src[0] & src[1] & src[2] & src[3]) >= ALPHA_OPAQ) {
983 if(*src >= ALPHA_OPAQ) { *dst++ = *src++;
984 if(*src >= ALPHA_OPAQ) { *dst++ = *src++;
985 if(*src >= ALPHA_OPAQ) { *dst++ = *src++; }
991 if(*src <= ALPHA_TRANS)
997 /* do any residual iterations */
998 src_end += UNROLL + 1; //goto the real end
999 while(src != src_end) {
1001 if( *src >= ALPHA_OPAQ ) {
1005 *dst = SkPMSrcOver(*src, *dst);
1016 /* Neon version of S32_Blend_BlitRow32()
1017 * portable version is in src/core/SkBlitRow_D32.cpp
// Blends 8888 src onto 8888 dst with a constant global alpha (<= 255):
// result = (src*src_scale + dst*dst_scale) >> 8 per byte lane.
// Main loop handles 2 pixels (one d-register) at a time; a lane-load path
// handles the final odd pixel.
1019 void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
1020 const SkPMColor* SK_RESTRICT src,
1021 int count, U8CPU alpha) {
1022 SkASSERT(alpha <= 255);
// scales sum to 256, so the per-lane result fits after >>8.
1028 uint16_t src_scale = SkAlpha255To256(alpha);
1029 uint16_t dst_scale = 256 - src_scale;
1031 while (count >= 2) {
1032 uint8x8_t vsrc, vdst, vres;
1033 uint16x8_t vsrc_wide, vdst_wide;
1035 /* These commented prefetches are a big win for count
1036 * values > 64 on an A9 (Pandaboard) but hurt by 10% for count = 4.
1037 * They also hurt a little (<5%) on an A15
1039 //__builtin_prefetch(src+32);
1040 //__builtin_prefetch(dst+32);
1043 vsrc = vreinterpret_u8_u32(vld1_u32(src));
1044 vdst = vreinterpret_u8_u32(vld1_u32(dst));
1047 vsrc_wide = vmovl_u8(vsrc);
1048 vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));
// vmull does widen+multiply in one step; dst_scale <= 255 fits in a u8 here.
1051 vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));
1054 vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);
1057 vst1_u32(dst, vreinterpret_u32_u8(vres));
// Odd trailing pixel: lane 0 load/store so no out-of-bounds access occurs.
1065 uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
1066 uint16x8_t vsrc_wide, vdst_wide;
1069 vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
1070 vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));
1073 vsrc_wide = vmovl_u8(vsrc);
1074 vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));
1075 vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));
1076 vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);
1079 vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
// Blends premultiplied 8888 src onto 8888 dst with both per-pixel src alpha
// and a constant global alpha (<= 255):
// dst_scale = 256 - (src_alpha * alpha256) >> 8, per pixel.
// Odd pixels are handled first via lane loads, then the main loop does
// 2 pixels per iteration.
1084 void S32A_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
1085 const SkPMColor* SK_RESTRICT src,
1086 int count, U8CPU alpha) {
1088 SkASSERT(255 >= alpha);
1094 unsigned alpha256 = SkAlpha255To256(alpha);
1096 // First deal with odd counts
1098 uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
1099 uint16x8_t vdst_wide, vsrc_wide;
1103 vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
1104 vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));
// Scalar dst_scale for the single pixel: byte 3 is the alpha lane.
1107 dst_scale = vget_lane_u8(vsrc, 3);
1108 dst_scale *= alpha256;
1110 dst_scale = 256 - dst_scale;
1113 vsrc_wide = vmovl_u8(vsrc);
1114 vsrc_wide = vmulq_n_u16(vsrc_wide, alpha256);
1117 vdst_wide = vmovl_u8(vdst);
1118 vdst_wide = vmulq_n_u16(vdst_wide, dst_scale);
1121 vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);
1123 vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
// vtbl mask that broadcasts each pixel's alpha byte across its 4 lanes.
1130 uint8x8_t alpha_mask;
1131 static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
1132 alpha_mask = vld1_u8(alpha_mask_setup);
1136 uint8x8_t vsrc, vdst, vres, vsrc_alphas;
1137 uint16x8_t vdst_wide, vsrc_wide, vsrc_scale, vdst_scale;
1139 __builtin_prefetch(src+32);
1140 __builtin_prefetch(dst+32);
1143 vsrc = vreinterpret_u8_u32(vld1_u32(src));
1144 vdst = vreinterpret_u8_u32(vld1_u32(dst));
1146 // Prepare src_scale
1147 vsrc_scale = vdupq_n_u16(alpha256);
// Vector dst_scale: 256 - (a * alpha256) >> 8, per lane.
1150 vsrc_alphas = vtbl1_u8(vsrc, alpha_mask);
1151 vdst_scale = vmovl_u8(vsrc_alphas);
1152 vdst_scale *= vsrc_scale;
1153 vdst_scale = vshrq_n_u16(vdst_scale, 8);
1154 vdst_scale = vsubq_u16(vdupq_n_u16(256), vdst_scale);
1157 vsrc_wide = vmovl_u8(vsrc);
1158 vsrc_wide *= vsrc_scale;
1161 vdst_wide = vmovl_u8(vdst);
1162 vdst_wide *= vdst_scale;
1165 vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);
1167 vst1_u32(dst, vreinterpret_u32_u8(vres));
1176 ///////////////////////////////////////////////////////////////////////////////
1178 #undef DEBUG_OPAQUE_DITHER
1180 #if defined(DEBUG_OPAQUE_DITHER)
// Debug-only helper (DEBUG_OPAQUE_DITHER): hex-dumps len bytes at p with a
// label. Uses a static buffer, so not reentrant — fine for its single-
// threaded debug use.
1181 static void showme8(char *str, void *p, int len)
1183 static char buf[256];
1186 char *pc = (char*) p;
1187 sprintf(buf,"%8s:", str);
1188 for(i=0;i<len;i++) {
1189 sprintf(tbuf, " %02x", pc[i]);
1192 SkDebugf("%s\n", buf);
// Debug-only helper: like showme8 but dumps 16-bit words; len is passed in
// bytes and converted to a word count. Shares showme8's static-buffer
// non-reentrancy caveat.
1194 static void showme16(char *str, void *p, int len)
1196 static char buf[256];
1199 uint16_t *pc = (uint16_t*) p;
1200 sprintf(buf,"%8s:", str);
1201 len = (len / sizeof(uint16_t)); /* passed as bytes */
1202 for(i=0;i<len;i++) {
1203 sprintf(tbuf, " %04x", pc[i]);
1206 SkDebugf("%s\n", buf);
1209 #endif // #ifdef SK_CPU_ARM32
// Source-over blends premultiplied 8888 src onto RGB565 dst with ordered
// dithering, UNROLL pixels per NEON iteration (8, given the 8-lane vectors
// and vld1q_u16 of dst — TODO confirm UNROLL's definition, outside this
// view). The dither amount is scaled by each pixel's alpha. An optional
// DEBUG_OPAQUE_DITHER path recomputes each batch in scalar code and
// compares. Scalar tail mirrors the portable implementation.
1211 void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst,
1212 const SkPMColor* SK_RESTRICT src,
1213 int count, U8CPU alpha, int x, int y) {
1214 SkASSERT(255 == alpha);
1218 if (count >= UNROLL) {
1220 #if defined(DEBUG_OPAQUE_DITHER)
1221 uint16_t tmpbuf[UNROLL];
1226 uint16_t in_dst[UNROLL];
// Select the dither row for this y and offset for this x.
1232 const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
1233 dbase = vld1_u8(dstart);
1237 uint8x8_t sr, sg, sb, sa, d;
1238 uint16x8_t dst8, scale8, alpha8;
1239 uint16x8_t dst_r, dst_g, dst_b;
1241 #if defined(DEBUG_OPAQUE_DITHER)
1242 // calculate 8 elements worth into a temp buffer
1246 SkPMColor* my_src = (SkPMColor*)src;
1247 uint16_t* my_dst = dst;
1250 DITHER_565_SCAN(my_y);
1251 for(i = 0; i < UNROLL; i++) {
1252 SkPMColor c = *my_src++;
1255 unsigned a = SkGetPackedA32(c);
// Reference scalar math the NEON results are checked against below.
1257 int d = SkAlphaMul(DITHER_VALUE(my_x), SkAlpha255To256(a));
1258 tdv[i] = DITHER_VALUE(my_x);
1260 tap[i] = SkAlpha255To256(a);
1263 unsigned sr = SkGetPackedR32(c);
1264 unsigned sg = SkGetPackedG32(c);
1265 unsigned sb = SkGetPackedB32(c);
1266 sr = SkDITHER_R32_FOR_565(sr, d);
1267 sg = SkDITHER_G32_FOR_565(sg, d);
1268 sb = SkDITHER_B32_FOR_565(sb, d);
1270 uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
1271 uint32_t dst_expanded = SkExpand_rgb_16(*my_dst);
1272 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
1273 // now src and dst expanded are in g:11 r:10 x:1 b:10
1274 tmpbuf[i] = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
1277 tmpbuf[i] = *my_dst;
1278 ta[i] = tdv[i] = td[i] = 0xbeef;
1280 in_dst[i] = *my_dst;
// ARM64: helper loads 8 pixels (all four channels) and advances src.
1288 vsrc = sk_vld4_u8_arm64_4(src);
// ARM32: pin vld4 destination to d0-d3.
1291 register uint8x8_t d0 asm("d0");
1292 register uint8x8_t d1 asm("d1");
1293 register uint8x8_t d2 asm("d2");
1294 register uint8x8_t d3 asm("d3");
1296 asm ("vld4.8 {d0-d3},[%[src]]! "
1297 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+r" (src)
1306 sa = vsrc.val[NEON_A];
1307 sr = vsrc.val[NEON_R];
1308 sg = vsrc.val[NEON_G];
1309 sb = vsrc.val[NEON_B];
1311 /* calculate 'd', which will be 0..7
1312 * dbase[] is 0..7; alpha is 0..256; 16 bits suffice
// d = (dbase * (sa + 1)) >> 8, i.e. dither scaled by SkAlpha255To256(sa).
1314 alpha8 = vmovl_u8(dbase);
1315 alpha8 = vmlal_u8(alpha8, sa, dbase);
1316 d = vshrn_n_u16(alpha8, 8); // narrowing too
1318 // sr = sr - (sr>>5) + d
1319 /* watching for 8-bit overflow. d is 0..7; risky range of
1320 * sr is >248; and then (sr>>5) is 7 so it offsets 'd';
1321 * safe as long as we do ((sr-sr>>5) + d)
1323 sr = vsub_u8(sr, vshr_n_u8(sr, 5));
1324 sr = vadd_u8(sr, d);
1326 // sb = sb - (sb>>5) + d
1327 sb = vsub_u8(sb, vshr_n_u8(sb, 5));
1328 sb = vadd_u8(sb, d);
1330 // sg = sg - (sg>>6) + d>>1; similar logic for overflows
1331 sg = vsub_u8(sg, vshr_n_u8(sg, 6));
1332 sg = vadd_u8(sg, vshr_n_u8(d,1));
1334 // need to pick up 8 dst's -- at 16 bits each, 128 bits
1335 dst8 = vld1q_u16(dst);
1336 dst_b = vandq_u16(dst8, vdupq_n_u16(SK_B16_MASK));
1337 dst_g = vshrq_n_u16(vshlq_n_u16(dst8, SK_R16_BITS), SK_R16_BITS + SK_B16_BITS);
1338 dst_r = vshrq_n_u16(dst8, SK_R16_SHIFT); // clearing hi bits
// scale = (256 - sa) >> 3: dst weight in the expanded fixed-point domain.
1341 scale8 = vsubw_u8(vdupq_n_u16(256), sa);
1343 // combine the addq and mul, save 3 insns
1344 scale8 = vshrq_n_u16(scale8, 3);
1345 dst_b = vmlaq_u16(vshll_n_u8(sb,2), dst_b, scale8);
1346 dst_g = vmlaq_u16(vshll_n_u8(sg,3), dst_g, scale8);
1347 dst_r = vmlaq_u16(vshll_n_u8(sr,2), dst_r, scale8);
// Collapse the expanded channels back to packed 565.
1350 dst8 = vshrq_n_u16(dst_b, 5);
1351 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_g, 5), 5);
1352 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_r,5), 11);
1354 vst1q_u16(dst, dst8);
1356 #if defined(DEBUG_OPAQUE_DITHER)
1357 // verify my 8 elements match the temp buffer
1360 static int invocation;
1362 for (i = 0; i < UNROLL; i++) {
1363 if (tmpbuf[i] != dst[i]) {
1368 SkDebugf("BAD S32A_D565_Opaque_Dither_neon(); invocation %d offset %d\n",
1369 invocation, offset);
1370 SkDebugf(" alpha 0x%x\n", alpha);
1371 for (i = 0; i < UNROLL; i++)
1372 SkDebugf("%2d: %s %04x w %04x id %04x s %08x d %04x %04x %04x %04x\n",
1373 i, ((tmpbuf[i] != dst[i])?"BAD":"got"), dst[i], tmpbuf[i],
1374 in_dst[i], src[i-8], td[i], tdv[i], tap[i], ta[i]);
1376 showme16("alpha8", &alpha8, sizeof(alpha8));
1377 showme16("scale8", &scale8, sizeof(scale8));
1378 showme8("d", &d, sizeof(d));
1379 showme16("dst8", &dst8, sizeof(dst8));
1380 showme16("dst_b", &dst_b, sizeof(dst_b));
1381 showme16("dst_g", &dst_g, sizeof(dst_g));
1382 showme16("dst_r", &dst_r, sizeof(dst_r));
1383 showme8("sb", &sb, sizeof(sb));
1384 showme8("sg", &sg, sizeof(sg));
1385 showme8("sr", &sr, sizeof(sr));
1395 // skip x += UNROLL, since it's unchanged mod-4
1396 } while (count >= UNROLL);
// Scalar tail: same expand/blend/compact math as the debug reference above.
1404 SkPMColor c = *src++;
1407 unsigned a = SkGetPackedA32(c);
1409 // dither and alpha are just temporary variables to work-around
1411 unsigned dither = DITHER_VALUE(x);
1412 unsigned alpha = SkAlpha255To256(a);
1413 int d = SkAlphaMul(dither, alpha);
1415 unsigned sr = SkGetPackedR32(c);
1416 unsigned sg = SkGetPackedG32(c);
1417 unsigned sb = SkGetPackedB32(c);
1418 sr = SkDITHER_R32_FOR_565(sr, d);
1419 sg = SkDITHER_G32_FOR_565(sg, d);
1420 sb = SkDITHER_B32_FOR_565(sb, d);
1422 uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
1423 uint32_t dst_expanded = SkExpand_rgb_16(*dst);
1424 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
1425 // now src and dst expanded are in g:11 r:10 x:1 b:10
1426 *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
1430 } while (--count != 0);
1434 ///////////////////////////////////////////////////////////////////////////////
1436 #undef DEBUG_S32_OPAQUE_DITHER
// Blit 'count' opaque 32-bit premultiplied (SkPMColor) pixels into an RGB565
// destination, applying an ordered dither.  'alpha' must be 255 (asserted);
// (x, y) select the starting cell of the dither matrix.  Eight pixels are
// processed per NEON iteration, with a scalar loop for the remainder.
// NOTE(review): this view elides several lines (declarations of d/vsrc/dst8,
// loop braces, the #if/#else arms around the loads) -- confirm structure
// against the full file before editing.
1438 void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst,
1439 const SkPMColor* SK_RESTRICT src,
1440 int count, U8CPU alpha, int x, int y) {
1441 SkASSERT(255 == alpha);
1444 if (count >= UNROLL) {
// Row stride into gDitherMatrix_Neon is 12 -- presumably the 4-entry dither
// row replicated so an (x&3) offset still yields 8 in-bounds values for the
// 8-pixel unroll; confirm against the matrix definition.
1446 const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
1447 d = vld1_u8(dstart);
1449 while (count >= UNROLL) {
1450 uint8x8_t sr, sg, sb;
1451 uint16x8_t dr, dg, db;
// Deinterleave 8 ARGB pixels into per-channel 8x8-bit vectors:
// arm64 builds use the sk_vld4 helper, arm32 uses inline asm with the
// outputs pinned to d0-d3 to match the vld4.8 destination registers.
1456 vsrc = sk_vld4_u8_arm64_3(src);
1459 register uint8x8_t d0 asm("d0");
1460 register uint8x8_t d1 asm("d1");
1461 register uint8x8_t d2 asm("d2");
1462 register uint8x8_t d3 asm("d3");
// The '!' writeback post-increments src by the 32 bytes (8 pixels) read.
1465 "vld4.8 {d0-d3},[%[src]]! "
1466 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
1474 sr = vsrc.val[NEON_R];
1475 sg = vsrc.val[NEON_G];
1476 sb = vsrc.val[NEON_B];
1478 /* XXX: if we want to prefetch, hide it in the above asm()
1479 * using the gcc __builtin_prefetch(), the prefetch will
1480 * fall to the bottom of the loop -- it won't stick up
1481 * at the top of the loop, just after the vld4.
// Dither each channel as v - (v>>top_bits) + d, widening with vaddl_u8 so
// the add lands in 16 bits and cannot wrap before the 565 pack below.
1484 // sr = sr - (sr>>5) + d
1485 sr = vsub_u8(sr, vshr_n_u8(sr, 5));
1486 dr = vaddl_u8(sr, d);
1488 // sb = sb - (sb>>5) + d
1489 sb = vsub_u8(sb, vshr_n_u8(sb, 5));
1490 db = vaddl_u8(sb, d);
1492 // sg = sg - (sg>>6) + d>>1; similar logic for overflows
1493 sg = vsub_u8(sg, vshr_n_u8(sg, 6));
1494 dg = vaddl_u8(sg, vshr_n_u8(d, 1));
1496 // pack high bits of each into 565 format (rgb, b is lsb)
// vsli inserts the shifted green/red fields at bit offsets 5 and 11
// without disturbing the lower fields already in dst8.
1497 dst8 = vshrq_n_u16(db, 3);
1498 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dg, 2), 5);
1499 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dr, 3), 11);
1502 vst1q_u16(dst, dst8);
// Debug-only self check: recompute the 8 pixels with the scalar dither
// path (SkDitherRGB32To565) and log any mismatch.
1504 #if defined(DEBUG_S32_OPAQUE_DITHER)
1505 // always good to know if we generated good results
1507 int i, myx = x, myy = y;
1508 DITHER_565_SCAN(myy);
1509 for (i=0;i<UNROLL;i++) {
1510 // the '!' in the asm block above post-incremented src by the 8 pixels it reads.
1511 SkPMColor c = src[i-8];
1512 unsigned dither = DITHER_VALUE(myx);
1513 uint16_t val = SkDitherRGB32To565(c, dither);
1514 if (val != dst[i]) {
1515 SkDebugf("RBE: src %08x dither %02x, want %04x got %04x dbas[i] %02x\n",
1516 c, dither, val, dst[i], dstart[i]);
1524 // we don't need to increment src as the asm above has already done it
1526 x += UNROLL; // probably superfluous
// Scalar tail: dither and convert the remaining (count % UNROLL) pixels
// one at a time.
1535 SkPMColor c = *src++;
1537 SkASSERT(SkGetPackedA32(c) == 255);
1539 unsigned dither = DITHER_VALUE(x);
1540 *dst++ = SkDitherRGB32To565(c, dither);
1542 } while (--count != 0);
// Blend a constant color over 'count' src pixels into dst:
//   dst[i] = color + (src[i] * (256 - SkAlpha255To256(alpha(color)))) >> 8
// with fast paths for a fully transparent and a fully opaque color.
// NOTE(review): the 'color' parameter line and several guards/braces are
// elided in this view (the memcpy below is presumably behind a
// '0 == color' check) -- confirm against the full file.
1546 void Color32_arm_neon(SkPMColor* dst, const SkPMColor* src, int count,
// Transparent color: result is just the source pixels.
1554 memcpy(dst, src, count * sizeof(SkPMColor));
1559 unsigned colorA = SkGetPackedA32(color);
// Opaque color: result is the color itself, regardless of src.
1560 if (255 == colorA) {
1561 sk_memset32(dst, color, count);
// General case: per-pixel scale of src by (256 - alpha), plus the color.
1565 unsigned scale = 256 - SkAlpha255To256(colorA);
1571 vcolor = vdupq_n_u32(color);
1573 // scale numerical interval [0-255], so load as 8 bits
1574 vscale = vdup_n_u8(scale);
1577 // load src color, 8 pixels, 4 64 bit registers
1578 // (and increment src).
// Older arm32 GCC needs inline asm for the post-incrementing 4-register
// load; everything else uses plain vld1_u32 intrinsics.
1580 #if defined(SK_CPU_ARM32) && ((__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6)))
1582 "vld1.32 %h[vsrc], [%[src]]!"
1583 : [vsrc] "=w" (vsrc), [src] "+r" (src)
1586 #else // 64bit targets and Clang
1587 vsrc.val[0] = vld1_u32(src);
1588 vsrc.val[1] = vld1_u32(src+2);
1589 vsrc.val[2] = vld1_u32(src+4);
1590 vsrc.val[3] = vld1_u32(src+6);
1594 // multiply long by scale, 64 bits at a time,
1595 // destination into a 128 bit register.
// Treat each 64-bit lane of pixels as 8 bytes; vmull_u8 widens the
// byte * scale products to 16 bits.
1597 vtmp.val[0] = vmull_u8(vreinterpret_u8_u32(vsrc.val[0]), vscale);
1598 vtmp.val[1] = vmull_u8(vreinterpret_u8_u32(vsrc.val[1]), vscale);
1599 vtmp.val[2] = vmull_u8(vreinterpret_u8_u32(vsrc.val[2]), vscale);
1600 vtmp.val[3] = vmull_u8(vreinterpret_u8_u32(vsrc.val[3]), vscale);
1602 // shift the 128 bit registers, containing the 16
1603 // bit scaled values back to 8 bits, narrowing the
1604 // results to 64 bit registers.
1606 vres.val[0] = vcombine_u8(
1607 vshrn_n_u16(vtmp.val[0], 8),
1608 vshrn_n_u16(vtmp.val[1], 8));
1609 vres.val[1] = vcombine_u8(
1610 vshrn_n_u16(vtmp.val[2], 8),
1611 vshrn_n_u16(vtmp.val[3], 8));
1613 // adding back the color, using 128 bit registers.
// Per-byte add of the color cannot carry across channels here because the
// scaled src is premultiplied -- NOTE(review): relies on premultiplied
// invariants established outside this view; confirm.
1615 vdst.val[0] = vreinterpretq_u32_u8(vres.val[0] +
1616 vreinterpretq_u8_u32(vcolor));
1617 vdst.val[1] = vreinterpretq_u32_u8(vres.val[1] +
1618 vreinterpretq_u8_u32(vcolor));
1620 // store back the 8 calculated pixels (2 128 bit
1621 // registers), and increment dst.
1622 #if defined(SK_CPU_ARM32) && ((__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6)))
1624 "vst1.32 %h[vdst], [%[dst]]!"
1629 #else // 64bit targets and Clang
1630 vst1q_u32(dst, vdst.val[0]);
1631 vst1q_u32(dst+4, vdst.val[1]);
1636 } while (count >= 8);
// Scalar tail for the remaining (count % 8) pixels.
1640 *dst = color + SkAlphaMulQ(*src, scale);
1647 ///////////////////////////////////////////////////////////////////////////////
// Dispatch table of NEON 565 blitters.  NOTE(review): entry order is defined
// by SkBlitRow's flag combinations (blend/src-alpha/dither) -- confirm against
// SkBlitRow.h.  NULL entries deliberately fall back to the portable code; the
// linked Skia issues explain why those slots were disabled.
1649 const SkBlitRow::Proc sk_blitrow_platform_565_procs_arm_neon[] = {
1651 S32_D565_Opaque_neon,
1652 S32_D565_Blend_neon,
1653 S32A_D565_Opaque_neon,
1655 S32A_D565_Blend_neon,
1657 NULL, // https://code.google.com/p/skia/issues/detail?id=2845
1658 // https://code.google.com/p/skia/issues/detail?id=2797
1662 S32_D565_Opaque_Dither_neon,
1663 S32_D565_Blend_Dither_neon,
1664 S32A_D565_Opaque_Dither_neon,
1665 NULL, // S32A_D565_Blend_Dither
1668 const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = {
1669 NULL, // S32_Opaque,
1670 S32_Blend_BlitRow32_neon, // S32_Blend,
1672 * We have two choices for S32A_Opaque procs. The one reads the src alpha
1673 * value and attempts to optimize accordingly. The optimization is
1674 * sensitive to the source content and is not a win in all cases. For
1675 * example, if there are a lot of transitions between the alpha states,
1676 * the performance will almost certainly be worse. However, for many
1677 * common cases the performance is equivalent or better than the standard
1678 * case where we do not inspect the src alpha.
1680 #if SK_A32_SHIFT == 24
1681 // This proc assumes the alpha value occupies bits 24-32 of each SkPMColor
1682 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque,
1684 S32A_Opaque_BlitRow32_neon, // S32A_Opaque,
1687 S32A_Blend_BlitRow32_neon // S32A_Blend