/*
 * Copyright 2012 The Android Open Source Project
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */
#include "SkBlitRow_opts_arm_neon.h"

#include "SkBlitMask.h"
#include "SkBlitRow.h"
#include "SkColorPriv.h"
#include "SkDither.h"
#include "SkMathPriv.h"
#include "SkUtils.h"

#include "SkCachePreload_arm.h"
#include "SkColor_opts_neon.h"

#include <arm_neon.h>
void S32_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
                          const SkPMColor* SK_RESTRICT src, int count,
                          U8CPU alpha, int /*x*/, int /*y*/) {
    SkASSERT(255 == alpha);
    while (count >= 8) {
        uint8x8x4_t vsrc = vld4_u8((uint8_t*)src);          // 8 pixels, deinterleaved
        uint16x8_t vdst = SkPixel32ToPixel16_neon8(vsrc);   // convert to 565
        vst1q_u16(dst, vdst);
        // Prepare next iteration
        src += 8; dst += 8; count -= 8;
    }
    while (count-- > 0) {                                   // leftovers
        SkPMColor c = *src++;
        *dst++ = SkPixel32ToPixel16_ToU16(c);
    }
}
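/* For reference, the scalar arithmetic behind the 32 -> 565 conversion above
 * (a sketch only; the real helpers live in SkColorPriv.h / SkColor_opts_neon.h):
 *
 *   uint16_t to565(uint8_t r, uint8_t g, uint8_t b) {
 *       return (uint16_t)(((r >> 3) << 11) | ((g >> 2) << 5) | (b >> 3));
 *   }
 */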
void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
                           const SkPMColor* SK_RESTRICT src, int count,
                           U8CPU alpha, int /*x*/, int /*y*/) {
    SkASSERT(255 == alpha);

    if (count >= 8) {
        uint16_t* SK_RESTRICT keep_dst = 0;

        asm volatile (
64 "ands ip, %[count], #7 \n\t"
65 "vmov.u8 d31, #1<<7 \n\t"
66 "vld1.16 {q12}, [%[dst]] \n\t"
67 "vld4.8 {d0-d3}, [%[src]] \n\t"
68 // Thumb does not support the standard ARM conditional
69 // instructions but instead requires the 'it' instruction
70 // to signal conditional execution
73 "mov %[keep_dst], %[dst] \n\t"
75 "add %[src], %[src], ip, LSL#2 \n\t"
76 "add %[dst], %[dst], ip, LSL#1 \n\t"
77 "subs %[count], %[count], ip \n\t"
82 "vld1.16 {q12}, [%[dst]]! \n\t"
83 "vld4.8 {d0-d3}, [%[src]]! \n\t"
84 "vst1.16 {q10}, [%[keep_dst]] \n\t"
85 "sub %[keep_dst], %[dst], #8*2 \n\t"
86 "subs %[count], %[count], #8 \n\t"
88 "pld [%[dst],#32] \n\t"
89 // expand 0565 q12 to 8888 {d4-d7}
90 "vmovn.u16 d4, q12 \n\t"
91 "vshr.u16 q11, q12, #5 \n\t"
92 "vshr.u16 q10, q12, #6+5 \n\t"
93 "vmovn.u16 d5, q11 \n\t"
94 "vmovn.u16 d6, q10 \n\t"
95 "vshl.u8 d4, d4, #3 \n\t"
96 "vshl.u8 d5, d5, #2 \n\t"
97 "vshl.u8 d6, d6, #3 \n\t"
99 "vmovl.u8 q14, d31 \n\t"
100 "vmovl.u8 q13, d31 \n\t"
101 "vmovl.u8 q12, d31 \n\t"
                      // duplicate in 4/2/1 & 8pix versions
                      "vmvn.8     d30, d3                     \n\t"
                      "vmlal.u8   q14, d30, d6                \n\t"
                      "vmlal.u8   q13, d30, d5                \n\t"
                      "vmlal.u8   q12, d30, d4                \n\t"
                      "vshr.u16   q8, q14, #5                 \n\t"
                      "vshr.u16   q9, q13, #6                 \n\t"
                      "vaddhn.u16 d6, q14, q8                 \n\t"
                      "vshr.u16   q8, q12, #5                 \n\t"
                      "vaddhn.u16 d5, q13, q9                 \n\t"
                      "vqadd.u8   d6, d6, d0                  \n\t" // moved up
                      "vaddhn.u16 d4, q12, q8                 \n\t"
                      // intentionally don't calculate alpha

                      "vqadd.u8   d5, d5, d1                  \n\t"
                      "vqadd.u8   d4, d4, d2                  \n\t"
                      // pack 8888 {d4-d6} to 0565 q10
                      "vshll.u8   q10, d6, #8                 \n\t"
                      "vshll.u8   q3, d5, #8                  \n\t"
                      "vshll.u8   q2, d4, #8                  \n\t"
                      "vsri.u16   q10, q3, #5                 \n\t"
                      "vsri.u16   q10, q2, #11                \n\t"

                      "bne        2b                          \n\t"

                      "vst1.16    {q10}, [%[keep_dst]]        \n\t"
                      : [count] "+r" (count)
                      : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src)
                      : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
                        "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
                        "d30","d31"
                      );
    }
    else
    {   // handle count < 8
        uint16_t* SK_RESTRICT keep_dst = 0;

        asm volatile (
                      "vmov.u8    d31, #1<<7                  \n\t"
                      "mov        %[keep_dst], %[dst]         \n\t"
147 "tst %[count], #4 \n\t"
149 "vld1.16 {d25}, [%[dst]]! \n\t"
150 "vld1.32 {q1}, [%[src]]! \n\t"
153 "tst %[count], #2 \n\t"
155 "vld1.32 {d24[1]}, [%[dst]]! \n\t"
156 "vld1.32 {d1}, [%[src]]! \n\t"
159 "tst %[count], #1 \n\t"
161 "vld1.16 {d24[1]}, [%[dst]]! \n\t"
162 "vld1.32 {d0[1]}, [%[src]]! \n\t"
                      // unzips achieve the same as a vld4 operation
                      "vuzp.u16   q0, q1                      \n\t"
                      "vuzp.u8    d0, d1                      \n\t"
                      "vuzp.u8    d2, d3                      \n\t"
                      // expand 0565 q12 to 8888 {d4-d7}
                      "vmovn.u16  d4, q12                     \n\t"
                      "vshr.u16   q11, q12, #5                \n\t"
                      "vshr.u16   q10, q12, #6+5              \n\t"
                      "vmovn.u16  d5, q11                     \n\t"
                      "vmovn.u16  d6, q10                     \n\t"
                      "vshl.u8    d4, d4, #3                  \n\t"
                      "vshl.u8    d5, d5, #2                  \n\t"
                      "vshl.u8    d6, d6, #3                  \n\t"
179 "vmovl.u8 q14, d31 \n\t"
180 "vmovl.u8 q13, d31 \n\t"
181 "vmovl.u8 q12, d31 \n\t"
                      // duplicate in 4/2/1 & 8pix versions
                      "vmvn.8     d30, d3                     \n\t"
                      "vmlal.u8   q14, d30, d6                \n\t"
                      "vmlal.u8   q13, d30, d5                \n\t"
                      "vmlal.u8   q12, d30, d4                \n\t"
                      "vshr.u16   q8, q14, #5                 \n\t"
                      "vshr.u16   q9, q13, #6                 \n\t"
                      "vaddhn.u16 d6, q14, q8                 \n\t"
                      "vshr.u16   q8, q12, #5                 \n\t"
                      "vaddhn.u16 d5, q13, q9                 \n\t"
                      "vqadd.u8   d6, d6, d0                  \n\t" // moved up
                      "vaddhn.u16 d4, q12, q8                 \n\t"
                      // intentionally don't calculate alpha

                      "vqadd.u8   d5, d5, d1                  \n\t"
                      "vqadd.u8   d4, d4, d2                  \n\t"
                      // pack 8888 {d4-d6} to 0565 q10
                      "vshll.u8   q10, d6, #8                 \n\t"
                      "vshll.u8   q3, d5, #8                  \n\t"
                      "vshll.u8   q2, d4, #8                  \n\t"
                      "vsri.u16   q10, q3, #5                 \n\t"
                      "vsri.u16   q10, q2, #11                \n\t"
209 "tst %[count], #4 \n\t"
211 "vst1.16 {d21}, [%[keep_dst]]! \n\t"
214 "tst %[count], #2 \n\t"
216 "vst1.32 {d20[1]}, [%[keep_dst]]! \n\t"
219 "tst %[count], #1 \n\t"
221 "vst1.16 {d20[1]}, [%[keep_dst]]! \n\t"
                      : [count] "+r" (count)
                      : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src)
                      : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
                        "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
                        "d30","d31"
                      );
    }
}
static inline uint16x8_t SkDiv255Round_neon8(uint16x8_t prod) {
    prod += vdupq_n_u16(128);
    prod += vshrq_n_u16(prod, 8);
    return vshrq_n_u16(prod, 8);
}
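/* The scalar identity being vectorized, valid for 0 <= x <= 255*255:
 *
 *   unsigned div255_round(unsigned x) {
 *       x += 128;
 *       return (x + (x >> 8)) >> 8;   // rounded x/255, e.g. 255*255 -> 255
 *   }
 */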
void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
                          const SkPMColor* SK_RESTRICT src, int count,
                          U8CPU alpha, int /*x*/, int /*y*/) {
    SkASSERT(255 > alpha);

    /* This code implements a Neon version of S32A_D565_Blend. The results have
     * a few mismatches compared to the original code. These mismatches never
     * exceed 1.
     */

    if (count >= 8) {
        uint16x8_t valpha_max, vmask_blue;
        uint8x8_t valpha;

        valpha_max = vmovq_n_u16(255);
        valpha = vdup_n_u8(alpha);
        vmask_blue = vmovq_n_u16(SK_B16_MASK);
        do {
            uint16x8_t vdst, vdst_r, vdst_g, vdst_b;
            uint16x8_t vres_a, vres_r, vres_g, vres_b;
            uint8x8x4_t vsrc;

            // load pixels
            vdst = vld1q_u16(dst);
#if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6))
            asm (
                "vld4.u8 %h[vsrc], [%[src]]!"
                : [vsrc] "=w" (vsrc), [src] "+&r" (src)
            );
#else
            register uint8x8_t d0 asm("d0");
            register uint8x8_t d1 asm("d1");
            register uint8x8_t d2 asm("d2");
            register uint8x8_t d3 asm("d3");

            asm volatile (
                "vld4.u8 {d0-d3},[%[src]]!;"
                : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3),
                  [src] "+&r" (src)
            );
            vsrc.val[0] = d0;
            vsrc.val[1] = d1;
            vsrc.val[2] = d2;
            vsrc.val[3] = d3;
#endif
            vdst_g = vshlq_n_u16(vdst, SK_R16_BITS);                 // shift green to top of lanes
            vdst_b = vdst & vmask_blue;                              // extract blue
            vdst_r = vshrq_n_u16(vdst, SK_R16_SHIFT);                // extract red
            vdst_g = vshrq_n_u16(vdst_g, SK_R16_BITS + SK_B16_BITS); // extract green

            vsrc.val[NEON_R] = vshr_n_u8(vsrc.val[NEON_R], 8 - SK_R16_BITS);
            vsrc.val[NEON_G] = vshr_n_u8(vsrc.val[NEON_G], 8 - SK_G16_BITS);
            vsrc.val[NEON_B] = vshr_n_u8(vsrc.val[NEON_B], 8 - SK_B16_BITS);
            // calc src * src_scale
            vres_a = vmull_u8(vsrc.val[NEON_A], valpha);
            vres_r = vmull_u8(vsrc.val[NEON_R], valpha);
            vres_g = vmull_u8(vsrc.val[NEON_G], valpha);
            vres_b = vmull_u8(vsrc.val[NEON_B], valpha);

            vres_a = SkDiv255Round_neon8(vres_a);
            vres_a = valpha_max - vres_a; // 255 - (sa * src_scale) / 255

            // add dst * dst_scale to previous result
            vres_r = vmlaq_u16(vres_r, vdst_r, vres_a);
            vres_g = vmlaq_u16(vres_g, vdst_g, vres_a);
            vres_b = vmlaq_u16(vres_b, vdst_b, vres_a);
#ifdef S32A_D565_BLEND_EXACT
            // It is possible to get exact results with this but it is slow,
            // even slower than C code in some cases
            vres_r = SkDiv255Round_neon8(vres_r);
            vres_g = SkDiv255Round_neon8(vres_g);
            vres_b = SkDiv255Round_neon8(vres_b);
#else
            vres_r = vrshrq_n_u16(vres_r, 8);
            vres_g = vrshrq_n_u16(vres_g, 8);
            vres_b = vrshrq_n_u16(vres_b, 8);
#endif

            vres_b = vsliq_n_u16(vres_b, vres_g, SK_G16_SHIFT); // insert green into blue
            vres_b = vsliq_n_u16(vres_b, vres_r, SK_R16_SHIFT); // insert red into green/blue

            // store
            vst1q_u16(dst, vres_b);

            dst += 8;
            count -= 8;
        } while (count >= 8);
    }
    // leftovers
    while (count-- > 0) {
        SkPMColor sc = *src++;
        if (sc) {
            uint16_t dc = *dst;
            unsigned dst_scale = 255 - SkMulDiv255Round(SkGetPackedA32(sc), alpha);
            unsigned dr = SkMulS16(SkPacked32ToR16(sc), alpha) + SkMulS16(SkGetPackedR16(dc), dst_scale);
            unsigned dg = SkMulS16(SkPacked32ToG16(sc), alpha) + SkMulS16(SkGetPackedG16(dc), dst_scale);
            unsigned db = SkMulS16(SkPacked32ToB16(sc), alpha) + SkMulS16(SkGetPackedB16(dc), dst_scale);
            *dst = SkPackRGB16(SkDiv255Round(dr), SkDiv255Round(dg), SkDiv255Round(db));
        }
        dst++;
    }
}
/* dither matrix for Neon, derived from gDitherMatrix_3Bit_16.
 * each dither value is spaced out into byte lanes, and repeated
 * to allow an 8-byte load from offsets 0, 1, 2 or 3 from the
 * start of each row.
 */
static const uint8_t gDitherMatrix_Neon[48] = {
    0, 4, 1, 5, 0, 4, 1, 5, 0, 4, 1, 5,
    6, 2, 7, 3, 6, 2, 7, 3, 6, 2, 7, 3,
    1, 5, 0, 4, 1, 5, 0, 4, 1, 5, 0, 4,
    7, 3, 6, 2, 7, 3, 6, 2, 7, 3, 6, 2,
};
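/* Example: with x & 3 == 2, the 8-byte window loaded from row 0 is
 * {1,5,0,4,1,5,0,4} -- the 4-entry dither row rotated so that lane 0
 * lines up with destination column x. */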
void S32_D565_Blend_Dither_neon(uint16_t *dst, const SkPMColor *src,
                                int count, U8CPU alpha, int x, int y)
{
    SkASSERT(255 > alpha);

    // rescale alpha to range 1 - 256
    int scale = SkAlpha255To256(alpha);

    if (count >= 8) {
        /* select row and offset for dither array */
        const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];

        uint8x8_t vdither = vld1_u8(dstart);         // load dither values
        uint8x8_t vdither_g = vshr_n_u8(vdither, 1); // calc. green dither values

        int16x8_t vscale = vdupq_n_s16(scale);       // duplicate scale into neon reg
        uint16x8_t vmask_b = vdupq_n_u16(0x1F);      // set up blue mask
        do {
            uint8x8_t vsrc_r, vsrc_g, vsrc_b;
            uint8x8_t vsrc565_r, vsrc565_g, vsrc565_b;
            uint16x8_t vsrc_dit_r, vsrc_dit_g, vsrc_dit_b;
            uint16x8_t vsrc_res_r, vsrc_res_g, vsrc_res_b;
            uint16x8_t vdst;
            uint16x8_t vdst_r, vdst_g, vdst_b;
            int16x8_t vres_r, vres_g, vres_b;
            int8x8_t vres8_r, vres8_g, vres8_b;
            // Load source and add dither
            {
                register uint8x8_t d0 asm("d0");
                register uint8x8_t d1 asm("d1");
                register uint8x8_t d2 asm("d2");
                register uint8x8_t d3 asm("d3");

                asm (
                    "vld4.8 {d0-d3},[%[src]]!  /* r=%P0 g=%P1 b=%P2 a=%P3 */"
                    : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
                );
                vsrc_g = d1;
#if SK_PMCOLOR_BYTE_ORDER(B,G,R,A)
                vsrc_r = d2; vsrc_b = d0;
#elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A)
                vsrc_r = d0; vsrc_b = d2;
#endif
            }
            vsrc565_g = vshr_n_u8(vsrc_g, 6); // calc. green >> 6
            vsrc565_r = vshr_n_u8(vsrc_r, 5); // calc. red >> 5
            vsrc565_b = vshr_n_u8(vsrc_b, 5); // calc. blue >> 5

            vsrc_dit_g = vaddl_u8(vsrc_g, vdither_g); // add in dither to green and widen
            vsrc_dit_r = vaddl_u8(vsrc_r, vdither);   // add in dither to red and widen
            vsrc_dit_b = vaddl_u8(vsrc_b, vdither);   // add in dither to blue and widen

            vsrc_dit_r = vsubw_u8(vsrc_dit_r, vsrc565_r); // sub shifted red from result
            vsrc_dit_g = vsubw_u8(vsrc_dit_g, vsrc565_g); // sub shifted green from result
            vsrc_dit_b = vsubw_u8(vsrc_dit_b, vsrc565_b); // sub shifted blue from result

            vsrc_res_r = vshrq_n_u16(vsrc_dit_r, 3);
            vsrc_res_g = vshrq_n_u16(vsrc_dit_g, 2);
            vsrc_res_b = vshrq_n_u16(vsrc_dit_b, 3);
            // Load dst and unpack
            vdst = vld1q_u16(dst);
            vdst_g = vshrq_n_u16(vdst, 5);                   // shift down to get green
            vdst_r = vshrq_n_u16(vshlq_n_u16(vdst, 5), 5+5); // double shift to extract red
            vdst_b = vandq_u16(vdst, vmask_b);               // mask to get blue
            // subtract dst from src and widen
            vres_r = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_r), vreinterpretq_s16_u16(vdst_r));
            vres_g = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_g), vreinterpretq_s16_u16(vdst_g));
            vres_b = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_b), vreinterpretq_s16_u16(vdst_b));

            // multiply diffs by scale and shift
            vres_r = vmulq_s16(vres_r, vscale);
            vres_g = vmulq_s16(vres_g, vscale);
            vres_b = vmulq_s16(vres_b, vscale);

            vres8_r = vshrn_n_s16(vres_r, 8);
            vres8_g = vshrn_n_s16(vres_g, 8);
            vres8_b = vshrn_n_s16(vres_b, 8);
            // add dst to result
            vres_r = vaddw_s8(vreinterpretq_s16_u16(vdst_r), vres8_r);
            vres_g = vaddw_s8(vreinterpretq_s16_u16(vdst_g), vres8_g);
            vres_b = vaddw_s8(vreinterpretq_s16_u16(vdst_b), vres8_b);

            // put result into 565 format
            vres_b = vsliq_n_s16(vres_b, vres_g, 5);   // shift up green and insert into blue
            vres_b = vsliq_n_s16(vres_b, vres_r, 6+5); // shift up red and insert into blue

            // store
            vst1q_u16(dst, vreinterpretq_u16_s16(vres_b));

            dst += 8;
            count -= 8;
            x += 8;
        } while (count >= 8);
    }
    // residuals
    if (count > 0) {
        int scale = SkAlpha255To256(alpha);
        DITHER_565_SCAN(y);
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);

            int dither = DITHER_VALUE(x);
            int sr = SkGetPackedR32(c);
            int sg = SkGetPackedG32(c);
            int sb = SkGetPackedB32(c);
            sr = SkDITHER_R32To565(sr, dither);
            sg = SkDITHER_G32To565(sg, dither);
            sb = SkDITHER_B32To565(sb, dither);

            uint16_t d = *dst;
            *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), scale),
                                 SkAlphaBlend(sg, SkGetPackedG16(d), scale),
                                 SkAlphaBlend(sb, SkGetPackedB16(d), scale));
            DITHER_INC_X(x);
        } while (--count != 0);
    }
}
void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
                                const SkPMColor* SK_RESTRICT src,
                                int count, U8CPU alpha) {

    SkASSERT(255 == alpha);

    if (count > 0) {
        uint8x8_t alpha_mask;

        static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
        alpha_mask = vld1_u8(alpha_mask_setup);
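        /* vtbl1_u8 with indices {3,3,3,3,7,7,7,7} replicates the alpha byte
         * of each of the two packed pixels across its four lanes: for input
         * bytes {b0..b7} the result is {b3,b3,b3,b3,b7,b7,b7,b7}. This assumes
         * the alpha byte sits at offset 3 (SK_A32_SHIFT == 24). */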
        /* do the NEON unrolled code */
#define UNROLL 4
        while (count >= UNROLL) {
            uint8x8_t src_raw, dst_raw, dst_final;
            uint8x8_t src_raw_2, dst_raw_2, dst_final_2;

            /* The two prefetches below may make the code slightly
             * slower for small values of count but are worth having
             * in the general case.
             */
            __builtin_prefetch(src+32);
            __builtin_prefetch(dst+32);
            /* get the source */
            src_raw = vreinterpret_u8_u32(vld1_u32(src));
#if UNROLL > 2
            src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));
#endif

            /* get and hold the dst too */
            dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
#if UNROLL > 2
            dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));
#endif
            /* 1st and 2nd pixels of the unrolling */
            {
                uint16x8_t dst_wide;
                uint8x8_t dst_cooked;
                uint8x8_t alpha_narrow;
                uint16x8_t alpha_wide;

                /* get the alphas spread out properly */
                alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
                alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);

                /* spread the dest */
                dst_wide = vmovl_u8(dst_raw);

                /* alpha mul the dest */
                dst_wide = vmulq_u16(dst_wide, alpha_wide);
                dst_cooked = vshrn_n_u16(dst_wide, 8);

                /* sum -- ignoring any byte lane overflows */
                dst_final = vadd_u8(src_raw, dst_cooked);
            }
            /* 3rd and 4th pixels of the unrolling */
            {
                uint16x8_t dst_wide;
                uint8x8_t dst_cooked;
                uint8x8_t alpha_narrow;
                uint16x8_t alpha_wide;

                alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
                alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);

                /* spread the dest */
                dst_wide = vmovl_u8(dst_raw_2);

                /* alpha mul the dest */
                dst_wide = vmulq_u16(dst_wide, alpha_wide);
                dst_cooked = vshrn_n_u16(dst_wide, 8);

                /* sum -- ignoring any byte lane overflows */
                dst_final_2 = vadd_u8(src_raw_2, dst_cooked);
            }
            vst1_u32(dst, vreinterpret_u32_u8(dst_final));
#if UNROLL > 2
            vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));
#endif

            src += UNROLL;
            dst += UNROLL;
            count -= UNROLL;
        }
#undef UNROLL

        /* do any residual iterations */
        while (--count >= 0) {
            *dst = SkPMSrcOver(*src, *dst);
            src += 1;
            dst += 1;
        }
    }
}
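/* The NEON loop above mirrors the scalar SkPMSrcOver used for the residuals:
 * per byte lane, dst = src + ((dst * (256 - src_alpha)) >> 8), where the
 * 256 - alpha comes from the vsubw_u8(vdupq_n_u16(256), ...) step. */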
void S32A_Opaque_BlitRow32_neon_src_alpha(SkPMColor* SK_RESTRICT dst,
                                          const SkPMColor* SK_RESTRICT src,
                                          int count, U8CPU alpha) {
    SkASSERT(255 == alpha);

    if (count <= 0)
        return;

    /* Use these to check if src is transparent or opaque */
    const unsigned int ALPHA_OPAQ  = 0xFF000000;
    const unsigned int ALPHA_TRANS = 0x00FFFFFF;
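    /* With SK_A32_SHIFT == 24 a whole-pixel compare tests alpha directly:
     * p >= 0xFF000000 iff alpha == 0xFF, and p <= 0x00FFFFFF iff alpha == 0;
     * e.g. 0xFF123456 >= ALPHA_OPAQ, while 0x80FFFFFF matches neither. */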
#define UNROLL 4
    const SkPMColor* SK_RESTRICT src_end = src + count - (UNROLL + 1);
    const SkPMColor* SK_RESTRICT src_temp = src;
    /* set up the NEON variables */
    uint8x8_t alpha_mask;
    static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
    alpha_mask = vld1_u8(alpha_mask_setup);

    uint8x8_t src_raw, dst_raw, dst_final;
    uint8x8_t src_raw_2, dst_raw_2, dst_final_2;
    uint8x8_t dst_cooked;
    uint16x8_t dst_wide;
    uint8x8_t alpha_narrow;
    uint16x8_t alpha_wide;
    /* choose the first processing type */
    if( src >= src_end)
        goto TAIL;
    if(*src <= ALPHA_TRANS)
        goto ALPHA_0;
    if(*src >= ALPHA_OPAQ)
        goto ALPHA_255;
    /*fall-thru*/

ALPHA_1_TO_254:
    do {

        /* get the source */
        src_raw = vreinterpret_u8_u32(vld1_u32(src));
        src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));
        /* get and hold the dst too */
        dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
        dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));
        /* get the alphas spread out properly */
        alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
        /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
        /* we collapsed (255-a)+1 ... */
        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);

        /* spread the dest */
        dst_wide = vmovl_u8(dst_raw);

        /* alpha mul the dest */
        dst_wide = vmulq_u16(dst_wide, alpha_wide);
        dst_cooked = vshrn_n_u16(dst_wide, 8);

        /* sum -- ignoring any byte lane overflows */
        dst_final = vadd_u8(src_raw, dst_cooked);
        alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
        /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
        /* we collapsed (255-a)+1 ... */
        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);

        /* spread the dest */
        dst_wide = vmovl_u8(dst_raw_2);

        /* alpha mul the dest */
        dst_wide = vmulq_u16(dst_wide, alpha_wide);
        dst_cooked = vshrn_n_u16(dst_wide, 8);

        /* sum -- ignoring any byte lane overflows */
        dst_final_2 = vadd_u8(src_raw_2, dst_cooked);

        vst1_u32(dst, vreinterpret_u32_u8(dst_final));
        vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));

        src += UNROLL;
        dst += UNROLL;
        /* if two of the next pixels aren't between 1 and 254, it may make
           sense to switch to one of the specialized loops */
        if((src[0] <= ALPHA_TRANS && src[1] <= ALPHA_TRANS) || (src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ))
            break;

    } while(src < src_end);
    if(src >= src_end) goto TAIL;
    if(src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ)
        goto ALPHA_255;
    /*fall-thru*/

ALPHA_0:
    /* In this state, we know the current alpha is 0 and
       we optimize for the next alpha also being zero. */
    src_temp = src;  // so we don't have to increment dst every time
    do {
        if(*(++src) > ALPHA_TRANS) break;
        if(*(++src) > ALPHA_TRANS) break;
        if(*(++src) > ALPHA_TRANS) break;
        if(*(++src) > ALPHA_TRANS) break;
    } while(src < src_end);

    dst += (src - src_temp);
    /* no longer alpha 0, so determine where to go next. */
    if( src >= src_end) goto TAIL;
    if(*src >= ALPHA_OPAQ)
        goto ALPHA_255;
    /*fall-thru*/

ALPHA_255:
    while((src[0] & src[1] & src[2] & src[3]) >= ALPHA_OPAQ) {
        dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3];
        src += UNROLL;
        dst += UNROLL;
        if(src >= src_end) goto TAIL;
    }

    /* copy up to three more opaque pixels */
    if(*src >= ALPHA_OPAQ) { *dst++ = *src++;
        if(*src >= ALPHA_OPAQ) { *dst++ = *src++;
            if(*src >= ALPHA_OPAQ) { *dst++ = *src++; }
        }
    }

    if(src >= src_end) goto TAIL;
    if(*src <= ALPHA_TRANS)
        goto ALPHA_0;
    /*fall-thru*/
    goto ALPHA_1_TO_254;
TAIL:
    /* do any residual iterations */
    src_end += UNROLL + 1;  // go to the real end
    while(src != src_end) {
        if( *src != 0 ) {
            if( *src >= ALPHA_OPAQ ) {
                *dst = *src;
            } else {
                *dst = SkPMSrcOver(*src, *dst);
            }
        }
        src++;
        dst++;
    }

#undef UNROLL
    return;
}
/* Neon version of S32_Blend_BlitRow32()
 * portable version is in src/core/SkBlitRow_D32.cpp
 */
void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
                              const SkPMColor* SK_RESTRICT src,
                              int count, U8CPU alpha) {
    SkASSERT(alpha <= 255);

    uint16_t src_scale = SkAlpha255To256(alpha);
    uint16_t dst_scale = 256 - src_scale;

    if (count >= 2) {
        /* run them N at a time through the NEON unit */
        /* note that each 1 is 4 bytes, each treated exactly the same,
         * so we can work under that guise. We *do* know that the src&dst
         * will be 32-bit aligned quantities, so we can specify that on
         * the load/store ops and do a neon 'reinterpret' to get us to
         * byte-sized (pun intended) pieces that we widen/multiply/shift.
         * we're limited at 128 bits in the wide ops, which is 8x16bits
         * or a pair of 32 bit src/dsts.
         */
        /* we *could* manually unroll this loop so that we load 128 bits
         * (as a pair of 64s) from each of src and dst, processing them
         * in pieces. This might give us a little better management of
         * the memory latency, but my initial attempts here did not
         * produce an instruction stream that looked all that nice.
         */
#define UNROLL 2
        while (count >= UNROLL) {
            uint8x8_t src_raw, dst_raw, dst_final;
            uint16x8_t src_wide, dst_wide;

            /* get 64 bits of src, widen it, multiply by src_scale */
            src_raw = vreinterpret_u8_u32(vld1_u32(src));
            src_wide = vmovl_u8(src_raw);
            /* gcc hoists vdupq_n_u16(), better than using vmulq_n_u16() */
            src_wide = vmulq_u16(src_wide, vdupq_n_u16(src_scale));

            dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
            dst_wide = vmovl_u8(dst_raw);

            /* combine add with dst multiply into mul-accumulate */
            dst_wide = vmlaq_u16(src_wide, dst_wide, vdupq_n_u16(dst_scale));

            dst_final = vshrn_n_u16(dst_wide, 8);
            vst1_u32(dst, vreinterpret_u32_u8(dst_final));

            src += UNROLL;
            dst += UNROLL;
            count -= UNROLL;
        }
#undef UNROLL
        /* RBE: well, i don't like how gcc manages src/dst across the above
         * loop it's constantly calculating src+bias, dst+bias and it only
         * adjusts the real ones when we leave the loop. Not sure why
         * it's "hoisting down" (hoisting implies above in my lexicon ;))
         * the adjustments to src/dst/count, but it does...
         * (might be SSA-style internal logic...)
         */

        /* the unrolled loop can leave at most one trailing pixel */
        if (count == 1) {
            *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
        }
    } else if (count > 0) {
        /* too few pixels for the NEON path; blend them one at a time */
        do {
            *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
            src += 1;
            dst += 1;
        } while (--count > 0);
    }
}
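/* Per pixel, both paths above compute the same thing; as a scalar sketch
 * (SkAlphaMulQ scales all four byte lanes of a packed pixel):
 *
 *   result = (src * src_scale + dst * dst_scale) >> 8   per byte lane,
 *
 * with src_scale = SkAlpha255To256(alpha) and dst_scale = 256 - src_scale.
 */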
void S32A_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
                               const SkPMColor* SK_RESTRICT src,
                               int count, U8CPU alpha) {

    SkASSERT(255 >= alpha);

    if (count <= 0) {
        return;
    }

    unsigned alpha256 = SkAlpha255To256(alpha);
    // First deal with odd counts
    if (count & 1) {
        uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
        uint16x8_t vdst_wide, vsrc_wide;
        unsigned dst_scale;

        // Load
        vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
        vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));

        // Calc dst_scale
        dst_scale = vget_lane_u8(vsrc, 3);
        dst_scale *= alpha256;
        dst_scale >>= 8;
        dst_scale = 256 - dst_scale;

        // Process src
        vsrc_wide = vmovl_u8(vsrc);
        vsrc_wide = vmulq_n_u16(vsrc_wide, alpha256);

        // Process dst
        vdst_wide = vmovl_u8(vdst);
        vdst_wide = vmulq_n_u16(vdst_wide, dst_scale);

        // Combine
        vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);

        vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
        src++;
        dst++;
        count--;
    }
    if (count) {
        uint8x8_t alpha_mask;
        static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
        alpha_mask = vld1_u8(alpha_mask_setup);

        do {
            uint8x8_t vsrc, vdst, vres, vsrc_alphas;
            uint16x8_t vdst_wide, vsrc_wide, vsrc_scale, vdst_scale;

            __builtin_prefetch(src+32);
            __builtin_prefetch(dst+32);

            // Load
            vsrc = vreinterpret_u8_u32(vld1_u32(src));
            vdst = vreinterpret_u8_u32(vld1_u32(dst));

            // Prepare src_scale
            vsrc_scale = vdupq_n_u16(alpha256);

            // Calc dst_scale
            vsrc_alphas = vtbl1_u8(vsrc, alpha_mask);
            vdst_scale = vmovl_u8(vsrc_alphas);
            vdst_scale *= vsrc_scale;
            vdst_scale = vshrq_n_u16(vdst_scale, 8);
            vdst_scale = vsubq_u16(vdupq_n_u16(256), vdst_scale);
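            // Scalar view of the five lines above, per pixel (a sketch):
            //   dst_scale = 256 - ((src_alpha * alpha256) >> 8)
            // i.e. the same 255 -> 256 range trick as src_scale, but driven
            // by each pixel's own alpha.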
            // Process src
            vsrc_wide = vmovl_u8(vsrc);
            vsrc_wide *= vsrc_scale;

            // Process dst
            vdst_wide = vmovl_u8(vdst);
            vdst_wide *= vdst_scale;

            // Combine
            vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);

            vst1_u32(dst, vreinterpret_u32_u8(vres));

            src += 2;
            dst += 2;
            count -= 2;
        } while (count);
    }
}
///////////////////////////////////////////////////////////////////////////////

#undef DEBUG_OPAQUE_DITHER

#if defined(DEBUG_OPAQUE_DITHER)
static void showme8(char *str, void *p, int len)
{
    static char buf[256];
    char tbuf[32];
    int i;
    char *pc = (char*) p;
    sprintf(buf, "%8s:", str);
    for (i = 0; i < len; i++) {
        sprintf(tbuf, " %02x", pc[i]);
        strcat(buf, tbuf);
    }
    SkDebugf("%s\n", buf);
}

static void showme16(char *str, void *p, int len)
{
    static char buf[256];
    char tbuf[32];
    int i;
    uint16_t *pc = (uint16_t*) p;
    sprintf(buf, "%8s:", str);
    len = (len / sizeof(uint16_t));    /* passed as bytes */
    for (i = 0; i < len; i++) {
        sprintf(tbuf, " %04x", pc[i]);
        strcat(buf, tbuf);
    }
    SkDebugf("%s\n", buf);
}
#endif
void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst,
                                   const SkPMColor* SK_RESTRICT src,
                                   int count, U8CPU alpha, int x, int y) {
    SkASSERT(255 == alpha);

#define UNROLL 8

    if (count >= UNROLL) {
        uint8x8_t dbase;

#if defined(DEBUG_OPAQUE_DITHER)
        uint16_t tmpbuf[UNROLL];
        int td[UNROLL];
        int tdv[UNROLL];
        int ta[UNROLL];
        int tap[UNROLL];
        uint16_t in_dst[UNROLL];
        int offset = 0;
#endif

        const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
        dbase = vld1_u8(dstart);
        do {
            uint8x8_t sr, sg, sb, sa, d;
            uint16x8_t dst8, scale8, alpha8;
            uint16x8_t dst_r, dst_g, dst_b;
#if defined(DEBUG_OPAQUE_DITHER)
            /* calculate 8 elements worth into a temp buffer */
            {
                int i, my_y = y, my_x = x;
                SkPMColor* my_src = (SkPMColor*)src;
                uint16_t* my_dst = dst;

                DITHER_565_SCAN(my_y);
                for (i = 0; i < UNROLL; i++) {
                    SkPMColor c = *my_src++;
                    SkPMColorAssert(c);
                    if (c) {
                        unsigned a = SkGetPackedA32(c);

                        int d = SkAlphaMul(DITHER_VALUE(my_x), SkAlpha255To256(a));
                        tdv[i] = DITHER_VALUE(my_x);
                        ta[i] = a;
                        tap[i] = SkAlpha255To256(a);
                        td[i] = d;

                        unsigned sr = SkGetPackedR32(c);
                        unsigned sg = SkGetPackedG32(c);
                        unsigned sb = SkGetPackedB32(c);
                        sr = SkDITHER_R32_FOR_565(sr, d);
                        sg = SkDITHER_G32_FOR_565(sg, d);
                        sb = SkDITHER_B32_FOR_565(sb, d);

                        uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
                        uint32_t dst_expanded = SkExpand_rgb_16(*my_dst);
                        dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
                        // now src and dst expanded are in g:11 r:10 x:1 b:10
                        tmpbuf[i] = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
                    } else {
                        tmpbuf[i] = *my_dst;
                        ta[i] = tdv[i] = td[i] = 0xbeef;
                    }
                    in_dst[i] = *my_dst;
                    my_dst += 1;
                    DITHER_INC_X(my_x);
                }
            }
#endif
            /* source is in ABGR */
            {
                register uint8x8_t d0 asm("d0");
                register uint8x8_t d1 asm("d1");
                register uint8x8_t d2 asm("d2");
                register uint8x8_t d3 asm("d3");

                asm ("vld4.8 {d0-d3},[%4]  /* r=%P0 g=%P1 b=%P2 a=%P3 */"
                     : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3)
                     : "r" (src)
                    );
                sr = d0; sg = d1; sb = d2; sa = d3;
            }
            /* calculate 'd', which will be 0..7 */
            /* dbase[] is 0..7; alpha is 0..256; 16 bits suffice */
#if defined(SK_BUILD_FOR_ANDROID)
            /* SkAlpha255To256() semantic a+1 vs a+a>>7 */
            alpha8 = vaddw_u8(vmovl_u8(sa), vdup_n_u8(1));
#else
            alpha8 = vaddw_u8(vmovl_u8(sa), vshr_n_u8(sa, 7));
#endif
            alpha8 = vmulq_u16(alpha8, vmovl_u8(dbase));
            d = vshrn_n_u16(alpha8, 8);    /* narrowing too */
            /* sr = sr - (sr>>5) + d */
            /* watching for 8-bit overflow.  d is 0..7; risky range of
             * sr is >248; and then (sr>>5) is 7 so it offsets 'd';
             * safe as long as we do ((sr-sr>>5) + d) */
            sr = vsub_u8(sr, vshr_n_u8(sr, 5));
            sr = vadd_u8(sr, d);

            /* sb = sb - (sb>>5) + d */
            sb = vsub_u8(sb, vshr_n_u8(sb, 5));
            sb = vadd_u8(sb, d);

            /* sg = sg - (sg>>6) + d>>1; similar logic for overflows */
            sg = vsub_u8(sg, vshr_n_u8(sg, 6));
            sg = vadd_u8(sg, vshr_n_u8(d, 1));
            /* need to pick up 8 dst's -- at 16 bits each, 128 bits */
            dst8 = vld1q_u16(dst);
            dst_b = vandq_u16(dst8, vdupq_n_u16(0x001F));
            dst_g = vandq_u16(vshrq_n_u16(dst8, 5), vdupq_n_u16(0x003F));
            dst_r = vshrq_n_u16(dst8, 11);    /* clearing hi bits */
            /* blend */
#if 1
            /* SkAlpha255To256() semantic a+1 vs a+a>>7 */
            /* originally 255-sa + 1 */
            scale8 = vsubw_u8(vdupq_n_u16(256), sa);
#else
            scale8 = vsubw_u8(vdupq_n_u16(255), sa);
            scale8 = vaddq_u16(scale8, vshrq_n_u16(scale8, 7));
#endif
#if 1
            /* combine the addq and mul, save 3 insns */
            scale8 = vshrq_n_u16(scale8, 3);
            dst_b = vmlaq_u16(vshll_n_u8(sb, 2), dst_b, scale8);
            dst_g = vmlaq_u16(vshll_n_u8(sg, 3), dst_g, scale8);
            dst_r = vmlaq_u16(vshll_n_u8(sr, 2), dst_r, scale8);
#else
            /* known correct, but +3 insns over above */
            scale8 = vshrq_n_u16(scale8, 3);
            dst_b = vmulq_u16(dst_b, scale8);
            dst_g = vmulq_u16(dst_g, scale8);
            dst_r = vmulq_u16(dst_r, scale8);

            /* NB: vshll widens, need to preserve those bits */
            dst_b = vaddq_u16(dst_b, vshll_n_u8(sb, 2));
            dst_g = vaddq_u16(dst_g, vshll_n_u8(sg, 3));
            dst_r = vaddq_u16(dst_r, vshll_n_u8(sr, 2));
#endif
            /* repack to store */
            dst8 = vandq_u16(vshrq_n_u16(dst_b, 5), vdupq_n_u16(0x001F));
            dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_g, 5), 5);
            dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_r, 5), 11);

            vst1q_u16(dst, dst8);
#if defined(DEBUG_OPAQUE_DITHER)
            /* verify my 8 elements match the temp buffer */
            {
                int i, bad = 0;
                static int invocation;

                for (i = 0; i < UNROLL; i++) {
                    if (tmpbuf[i] != dst[i]) bad = 1;
                }
                if (bad) {
                    SkDebugf("BAD S32A_D565_Opaque_Dither_neon(); invocation %d offset %d\n",
                             invocation, offset);
                    SkDebugf("  alpha 0x%x\n", alpha);
                    for (i = 0; i < UNROLL; i++)
                        SkDebugf("%2d: %s %04x w %04x id %04x s %08x d %04x %04x %04x %04x\n",
                                 i, ((tmpbuf[i] != dst[i]) ? "BAD" : "got"),
                                 dst[i], tmpbuf[i], in_dst[i], src[i], td[i], tdv[i], tap[i], ta[i]);

                    showme16("alpha8", &alpha8, sizeof(alpha8));
                    showme16("scale8", &scale8, sizeof(scale8));
                    showme8("d", &d, sizeof(d));
                    showme16("dst8", &dst8, sizeof(dst8));
                    showme16("dst_b", &dst_b, sizeof(dst_b));
                    showme16("dst_g", &dst_g, sizeof(dst_g));
                    showme16("dst_r", &dst_r, sizeof(dst_r));
                    showme8("sb", &sb, sizeof(sb));
                    showme8("sg", &sg, sizeof(sg));
                    showme8("sr", &sr, sizeof(sr));
                }
                offset += UNROLL;
                invocation++;
            }
#endif

            dst += UNROLL;
            src += UNROLL;
            count -= UNROLL;
            /* skip x += UNROLL, since it's unchanged mod-4 */
        } while (count >= UNROLL);
    }
#undef UNROLL
    // residuals
    if (count > 0) {
        DITHER_565_SCAN(y);
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);
            if (c) {
                unsigned a = SkGetPackedA32(c);

                // dither and alpha are just temporary variables to work-around
                // an ICE in debug.
                unsigned dither = DITHER_VALUE(x);
                unsigned alpha = SkAlpha255To256(a);
                int d = SkAlphaMul(dither, alpha);

                unsigned sr = SkGetPackedR32(c);
                unsigned sg = SkGetPackedG32(c);
                unsigned sb = SkGetPackedB32(c);
                sr = SkDITHER_R32_FOR_565(sr, d);
                sg = SkDITHER_G32_FOR_565(sg, d);
                sb = SkDITHER_B32_FOR_565(sb, d);

                uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
                uint32_t dst_expanded = SkExpand_rgb_16(*dst);
                dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
                // now src and dst expanded are in g:11 r:10 x:1 b:10
                *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
            }
            dst += 1;
            DITHER_INC_X(x);
        } while (--count != 0);
    }
}
///////////////////////////////////////////////////////////////////////////////

#undef DEBUG_S32_OPAQUE_DITHER
void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst,
                                 const SkPMColor* SK_RESTRICT src,
                                 int count, U8CPU alpha, int x, int y) {
    SkASSERT(255 == alpha);

#define UNROLL 8
    if (count >= UNROLL) {
        uint8x8_t d;
        const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
        d = vld1_u8(dstart);
        while (count >= UNROLL) {
            uint8x8_t sr, sg, sb;
            uint16x8_t dr, dg, db;
            uint16x8_t dst8;
            {
                register uint8x8_t d0 asm("d0");
                register uint8x8_t d1 asm("d1");
                register uint8x8_t d2 asm("d2");
                register uint8x8_t d3 asm("d3");

                asm (
                    "vld4.8 {d0-d3},[%[src]]!  /* r=%P0 g=%P1 b=%P2 a=%P3 */"
                    : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
                );
#if SK_PMCOLOR_BYTE_ORDER(B,G,R,A)
                sr = d2; sg = d1; sb = d0;
#elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A)
                sr = d0; sg = d1; sb = d2;
#endif
            }
            /* XXX: if we want to prefetch, hide it in the above asm()
             * using the gcc __builtin_prefetch(), the prefetch will
             * fall to the bottom of the loop -- it won't stick up
             * at the top of the loop, just after the vld4.
             */
            // sr = sr - (sr>>5) + d
            sr = vsub_u8(sr, vshr_n_u8(sr, 5));
            dr = vaddl_u8(sr, d);

            // sb = sb - (sb>>5) + d
            sb = vsub_u8(sb, vshr_n_u8(sb, 5));
            db = vaddl_u8(sb, d);

            // sg = sg - (sg>>6) + d>>1; similar logic for overflows
            sg = vsub_u8(sg, vshr_n_u8(sg, 6));
            dg = vaddl_u8(sg, vshr_n_u8(d, 1));

            // pack high bits of each into 565 format (rgb, b is lsb)
            dst8 = vshrq_n_u16(db, 3);
            dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dg, 2), 5);
            dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dr, 3), 11);
            // store it
            vst1q_u16(dst, dst8);

#if defined(DEBUG_S32_OPAQUE_DITHER)
            // always good to know if we generated good results
            {
                int i, myx = x, myy = y;
                DITHER_565_SCAN(myy);
                for (i = 0; i < UNROLL; i++) {
                    // the '!' in the asm block above post-incremented src by the 8 pixels it reads.
                    SkPMColor c = src[i-8];
                    unsigned dither = DITHER_VALUE(myx);
                    uint16_t val = SkDitherRGB32To565(c, dither);
                    if (val != dst[i]) {
                        SkDebugf("RBE: src %08x dither %02x, want %04x got %04x dbas[i] %02x\n",
                                 c, dither, val, dst[i], dstart[i]);
                    }
                    DITHER_INC_X(myx);
                }
            }
#endif

            dst += UNROLL;
            // we don't need to increment src as the asm above has already done it
            count -= UNROLL;
            x += UNROLL;    // probably superfluous
        }
    }
#undef UNROLL

    // residuals
    if (count > 0) {
        DITHER_565_SCAN(y);
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);
            SkASSERT(SkGetPackedA32(c) == 255);

            unsigned dither = DITHER_VALUE(x);
            *dst++ = SkDitherRGB32To565(c, dither);
            DITHER_INC_X(x);
        } while (--count != 0);
    }
}
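/* Scalar form of the dither step used above, per pixel (illustrative):
 *
 *   uint16_t dither565(unsigned r8, unsigned g8, unsigned b8, unsigned d) {
 *       unsigned r = (r8 - (r8 >> 5) + d) >> 3;         // d is 0..7
 *       unsigned g = (g8 - (g8 >> 6) + (d >> 1)) >> 2;
 *       unsigned b = (b8 - (b8 >> 5) + d) >> 3;
 *       return (uint16_t)((r << 11) | (g << 5) | b);
 *   }
 */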
void Color32_arm_neon(SkPMColor* dst, const SkPMColor* src, int count,
                      SkPMColor color) {
    if (count <= 0) {
        return;
    }

    if (0 == color) {
        if (src != dst) {
            memcpy(dst, src, count * sizeof(SkPMColor));
        }
        return;
    }

    unsigned colorA = SkGetPackedA32(color);
    if (255 == colorA) {
        sk_memset32(dst, color, count);
    } else {
        unsigned scale = 256 - SkAlpha255To256(colorA);
        if (count >= 8) {
            // At the end of this assembly, count will have been decremented
            // to a negative value. That is, if count mod 8 = x, it will be
            // -8 + x coming out.
            asm volatile (
                // load the color into all lanes of q0
                "vdup.32    q0, %[color]                \n\t"

                // scale is in the interval [0, 255], so load it as 8 bits
                "vdup.8     d2, %[scale]                \n\t"

                "subs       %[count], %[count], #8      \n\t"
1357 "Loop_Color32: \n\t"
1359 // load src color, 8 pixels, 4 64 bit registers
1360 // (and increment src).
1361 "vld1.32 {d4-d7}, [%[src]]! \n\t"
1365 // multiply long by scale, 64 bits at a time,
1366 // destination into a 128 bit register.
1367 "vmull.u8 q4, d4, d2 \n\t"
1368 "vmull.u8 q5, d5, d2 \n\t"
1369 "vmull.u8 q6, d6, d2 \n\t"
1370 "vmull.u8 q7, d7, d2 \n\t"
1372 // shift the 128 bit registers, containing the 16
1373 // bit scaled values back to 8 bits, narrowing the
1374 // results to 64 bit registers.
1375 "vshrn.i16 d8, q4, #8 \n\t"
1376 "vshrn.i16 d9, q5, #8 \n\t"
1377 "vshrn.i16 d10, q6, #8 \n\t"
1378 "vshrn.i16 d11, q7, #8 \n\t"
1380 // adding back the color, using 128 bit registers.
1381 "vadd.i8 q6, q4, q0 \n\t"
1382 "vadd.i8 q7, q5, q0 \n\t"
1384 // store back the 8 calculated pixels (2 128 bit
1385 // registers), and increment dst.
1386 "vst1.32 {d12-d15}, [%[dst]]! \n\t"
1388 "subs %[count], %[count], #8 \n\t"
1389 "bge Loop_Color32 \n\t"
                : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count)
                : [color] "r" (color), [scale] "r" (scale)
                : "cc", "memory",
                  "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
                  "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15"
                );
            // At this point, if we went through the inline assembly, count is
            // a negative value:
            // if the value is -8, there is no pixel left to process.
            // if the value is -7, there is one pixel left to process.
            // ...
            // And'ing it with 7 will give us the number of pixels
            // left to process.
            count = count & 0x7;
        }
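        // Worked example of the count arithmetic above: count == 13 enters
        // the loop at 13 - 8 = 5, runs once more to 5 - 8 = -3, then exits;
        // -3 & 0x7 == 5, exactly the number of pixels left for the loop below.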
        while (count > 0) {
            *dst = color + SkAlphaMulQ(*src, scale);
            src += 1;
            dst += 1;
            count--;
        }
    }
}
///////////////////////////////////////////////////////////////////////////////

const SkBlitRow::Proc sk_blitrow_platform_565_procs_arm_neon[] = {
    // no dither
    // NOTE: For the S32_D565_Blend function below, we don't have a special
    // version that assumes that each source pixel is opaque. But our
    // S32A version is still faster than the default, so use it.
    S32_D565_Opaque_neon,
    S32A_D565_Blend_neon,           // really S32_D565_Blend
    S32A_D565_Opaque_neon,
    S32A_D565_Blend_neon,

    // dither
    S32_D565_Opaque_Dither_neon,
    S32_D565_Blend_Dither_neon,
    S32A_D565_Opaque_Dither_neon,
    NULL,                           // S32A_D565_Blend_Dither
};
const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = {
    NULL,                           // S32_Opaque,
    S32_Blend_BlitRow32_neon,       // S32_Blend,
    /*
     * We have two choices for S32A_Opaque procs. One reads the src alpha
     * value and attempts to optimize accordingly. The optimization is
     * sensitive to the source content and is not a win in all cases. For
     * example, if there are a lot of transitions between the alpha states,
     * the performance will almost certainly be worse. However, for many
     * common cases the performance is equivalent or better than the standard
     * case where we do not inspect the src alpha.
     */
#if SK_A32_SHIFT == 24
    // This proc assumes the alpha value occupies bits 24-31 of each SkPMColor
    S32A_Opaque_BlitRow32_neon_src_alpha,   // S32A_Opaque,
#else
    S32A_Opaque_BlitRow32_neon,     // S32A_Opaque,
#endif
    S32A_Blend_BlitRow32_neon       // S32A_Blend
};