2 * Copyright 2012 The Android Open Source Project
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
8 #include "SkBlitRow_opts_arm_neon.h"
10 #include "SkBlitMask.h"
11 #include "SkBlitRow.h"
12 #include "SkColorPriv.h"
14 #include "SkMathPriv.h"
17 #include "SkColor_opts_neon.h"
// Blit a row of opaque 32-bit premultiplied (SkPMColor) pixels into an
// RGB565 destination, 8 pixels per NEON iteration with a scalar tail.
// NOTE(review): this listing is corrupt — every line carries a stray decimal
// prefix from a numbered paste, and interior lines (loop framing, the vector
// store, and the tail-loop bookkeeping) have been elided. Restore from the
// upstream file before compiling.
20 void S32_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
21 const SkPMColor* SK_RESTRICT src, int count,
22 U8CPU alpha, int /*x*/, int /*y*/) {
// "Opaque" variant: the caller guarantees full coverage, so alpha is unused
// beyond this assert.
23 SkASSERT(255 == alpha);
// Vector path: vld4 de-interleaves 8 pixels into separate r/g/b/a lanes...
30 vsrc = vld4_u8((uint8_t*)src);
// ...which SkPixel32ToPixel16_neon8 packs down to eight 565 values at once.
33 vdst = SkPixel32ToPixel16_neon8(vsrc);
38 // Prepare next iteration
// Scalar tail: convert the remaining (<8) pixels one at a time.
48 *dst = SkPixel32ToPixel16_ToU16(c);
// Blend a row of 32-bit premultiplied pixels onto an RGB565 destination with
// a constant global alpha (0 < alpha < 255): dst' = dst + ((src - dst) *
// scale) >> 8, per channel, where scale = SkAlpha255To256(alpha).
// NOTE(review): garbled numbered-paste listing — stray decimal prefixes on
// every line and elided interior lines (loop framing, asm statement opener,
// pointer increments, tail-loop header). Restore from upstream before use.
54 void S32_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
55 const SkPMColor* SK_RESTRICT src, int count,
56 U8CPU alpha, int /*x*/, int /*y*/) {
57 SkASSERT(255 > alpha);
59 uint16x8_t vmask_blue, vscale;
// scale in [1, 256] so that >>8 after the multiply divides by ~alpha/255.
62 vscale = vdupq_n_u16(SkAlpha255To256(alpha));
63 vmask_blue = vmovq_n_u16(0x1F);
66 uint16x8_t vdst, vdst_r, vdst_g, vdst_b;
67 uint16x8_t vres_r, vres_g, vres_b;
68 uint8x8_t vsrc_r, vsrc_g, vsrc_b;
// Pin the vld4 outputs to d0-d3 so the inline asm's register list matches.
72 register uint8x8_t d0 asm("d0");
73 register uint8x8_t d1 asm("d1");
74 register uint8x8_t d2 asm("d2");
75 register uint8x8_t d3 asm("d3");
// De-interleaving load of 8 pixels; "!" post-increments src.
78 "vld4.8 {d0-d3},[%[src]]!"
79 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
// Map lanes to channels according to the build's SkPMColor byte order.
83 #if SK_PMCOLOR_BYTE_ORDER(B,G,R,A)
84 vsrc_r = d2; vsrc_b = d0;
85 #elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A)
86 vsrc_r = d0; vsrc_b = d2;
90 // Load and unpack dst
91 vdst = vld1q_u16(dst);
92 vdst_g = vshlq_n_u16(vdst, 5); // shift green to top of lanes
93 vdst_b = vandq_u16(vdst, vmask_blue); // extract blue
94 vdst_r = vshrq_n_u16(vdst, 6+5); // extract red
95 vdst_g = vshrq_n_u16(vdst_g, 5+5); // extract green
// Reduce 8-bit src channels to 5/6/5 bit widths to match dst.
98 vsrc_r = vshr_n_u8(vsrc_r, 3); // shift red to 565 range
99 vsrc_g = vshr_n_u8(vsrc_g, 2); // shift green to 565 range
100 vsrc_b = vshr_n_u8(vsrc_b, 3); // shift blue to 565 range
// (src - dst) widened to 16 bits; may go "negative" but the arithmetic
// wraps consistently so the later multiply/shift still lands correctly.
103 vres_r = vmovl_u8(vsrc_r) - vdst_r;
104 vres_g = vmovl_u8(vsrc_g) - vdst_g;
105 vres_b = vmovl_u8(vsrc_b) - vdst_b;
// dst + ((src - dst) * scale) >> 8 — the dst addend is applied via vsli
// below on the already-shifted result (elided lines likely add vdst_*).
107 vres_r = vshrq_n_u16(vres_r * vscale, 8);
108 vres_g = vshrq_n_u16(vres_g * vscale, 8);
109 vres_b = vshrq_n_u16(vres_b * vscale, 8);
// Recombine channels into 565: blue in bits 0-4, green 5-10, red 11-15.
116 vres_b = vsliq_n_u16(vres_b, vres_g, 5); // insert green into blue
117 vres_b = vsliq_n_u16(vres_b, vres_r, 6+5); // insert red into green/blue
120 vst1q_u16(dst, vres_b);
// Scalar tail for the remaining (<8) pixels, same math via SkAlphaBlend.
125 int scale = SkAlpha255To256(alpha);
127 SkPMColor c = *src++;
130 *dst++ = SkPackRGB16(
131 SkAlphaBlend(SkPacked32ToR16(c), SkGetPackedR16(d), scale),
132 SkAlphaBlend(SkPacked32ToG16(c), SkGetPackedG16(d), scale),
133 SkAlphaBlend(SkPacked32ToB16(c), SkGetPackedB16(d), scale));
134 } while (--count != 0);
// SrcOver-blend a row of 32-bit premultiplied pixels (with per-pixel alpha)
// onto RGB565, written almost entirely in ARM inline assembly: a main loop
// that processes 8 pixels per iteration, then a second asm block for the
// count < 8 remainder.
// NOTE(review): garbled numbered-paste listing — stray decimal prefixes and
// elided lines (asm block openers, branch labels, conditional 'it' blocks,
// closing braces). The asm bodies below are fragments; restore from upstream.
138 void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
139 const SkPMColor* SK_RESTRICT src, int count,
140 U8CPU alpha, int /*x*/, int /*y*/) {
141 SkASSERT(255 == alpha);
// keep_dst trails one iteration behind dst so stores can be software-pipelined.
144 uint16_t* SK_RESTRICT keep_dst = 0;
// ip = count % 8; pre-load the first batch before entering the loop.
147 "ands ip, %[count], #7 \n\t"
// d31 = 0x80 in every lane: the rounding bias for the /255 approximation.
148 "vmov.u8 d31, #1<<7 \n\t"
149 "vld1.16 {q12}, [%[dst]] \n\t"
150 "vld4.8 {d0-d3}, [%[src]] \n\t"
151 // Thumb does not support the standard ARM conditional
152 // instructions but instead requires the 'it' instruction
153 // to signal conditional execution
156 "mov %[keep_dst], %[dst] \n\t"
// Skip the first (count % 8) pixels; they are handled by the tail block.
158 "add %[src], %[src], ip, LSL#2 \n\t"
159 "add %[dst], %[dst], ip, LSL#1 \n\t"
160 "subs %[count], %[count], ip \n\t"
// Main loop: load next batch, store previous result (pipelined).
165 "vld1.16 {q12}, [%[dst]]! \n\t"
166 "vld4.8 {d0-d3}, [%[src]]! \n\t"
167 "vst1.16 {q10}, [%[keep_dst]] \n\t"
168 "sub %[keep_dst], %[dst], #8*2 \n\t"
169 "subs %[count], %[count], #8 \n\t"
171 "pld [%[dst],#32] \n\t"
172 // expand 0565 q12 to 8888 {d4-d7}
173 "vmovn.u16 d4, q12 \n\t"
174 "vshr.u16 q11, q12, #5 \n\t"
175 "vshr.u16 q10, q12, #6+5 \n\t"
176 "vmovn.u16 d5, q11 \n\t"
177 "vmovn.u16 d6, q10 \n\t"
178 "vshl.u8 d4, d4, #3 \n\t"
179 "vshl.u8 d5, d5, #2 \n\t"
180 "vshl.u8 d6, d6, #3 \n\t"
// Seed the accumulators with the 0x80 rounding bias, widened to 16 bits.
182 "vmovl.u8 q14, d31 \n\t"
183 "vmovl.u8 q13, d31 \n\t"
184 "vmovl.u8 q12, d31 \n\t"
186 // duplicate in 4/2/1 & 8pix vsns
// d30 = 255 - src_alpha; accumulate dst * (255 - sa) per channel.
187 "vmvn.8 d30, d3 \n\t"
188 "vmlal.u8 q14, d30, d6 \n\t"
189 "vmlal.u8 q13, d30, d5 \n\t"
190 "vmlal.u8 q12, d30, d4 \n\t"
// vaddhn(x, x>>...) realizes the rounded /255: ((x + bias) + (x >> 8)) >> 8.
191 "vshr.u16 q8, q14, #5 \n\t"
192 "vshr.u16 q9, q13, #6 \n\t"
193 "vaddhn.u16 d6, q14, q8 \n\t"
194 "vshr.u16 q8, q12, #5 \n\t"
195 "vaddhn.u16 d5, q13, q9 \n\t"
// Saturating add of the premultiplied src channel completes SrcOver.
196 "vqadd.u8 d6, d6, d0 \n\t" // moved up
197 "vaddhn.u16 d4, q12, q8 \n\t"
198 // intentionally don't calculate alpha
201 "vqadd.u8 d5, d5, d1 \n\t"
202 "vqadd.u8 d4, d4, d2 \n\t"
204 // pack 8888 {d4-d6} to 0565 q10
205 "vshll.u8 q10, d6, #8 \n\t"
206 "vshll.u8 q3, d5, #8 \n\t"
207 "vshll.u8 q2, d4, #8 \n\t"
208 "vsri.u16 q10, q3, #5 \n\t"
209 "vsri.u16 q10, q2, #11 \n\t"
// Flush the final pipelined batch.
214 "vst1.16 {q10}, [%[keep_dst]] \n\t"
215 : [count] "+r" (count)
216 : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src)
217 : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
218 "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
// Tail block: the same blend for the count % 8 leading pixels, loading and
// storing 4/2/1 pixels at a time guided by the bits of count.
223 { // handle count < 8
224 uint16_t* SK_RESTRICT keep_dst = 0;
227 "vmov.u8 d31, #1<<7 \n\t"
228 "mov %[keep_dst], %[dst] \n\t"
230 "tst %[count], #4 \n\t"
232 "vld1.16 {d25}, [%[dst]]! \n\t"
233 "vld1.32 {q1}, [%[src]]! \n\t"
236 "tst %[count], #2 \n\t"
238 "vld1.32 {d24[1]}, [%[dst]]! \n\t"
239 "vld1.32 {d1}, [%[src]]! \n\t"
242 "tst %[count], #1 \n\t"
244 "vld1.16 {d24[1]}, [%[dst]]! \n\t"
245 "vld1.32 {d0[1]}, [%[src]]! \n\t"
248 // unzips achieve the same as a vld4 operation
249 "vuzpq.u16 q0, q1 \n\t"
250 "vuzp.u8 d0, d1 \n\t"
251 "vuzp.u8 d2, d3 \n\t"
252 // expand 0565 q12 to 8888 {d4-d7}
253 "vmovn.u16 d4, q12 \n\t"
254 "vshr.u16 q11, q12, #5 \n\t"
255 "vshr.u16 q10, q12, #6+5 \n\t"
256 "vmovn.u16 d5, q11 \n\t"
257 "vmovn.u16 d6, q10 \n\t"
258 "vshl.u8 d4, d4, #3 \n\t"
259 "vshl.u8 d5, d5, #2 \n\t"
260 "vshl.u8 d6, d6, #3 \n\t"
262 "vmovl.u8 q14, d31 \n\t"
263 "vmovl.u8 q13, d31 \n\t"
264 "vmovl.u8 q12, d31 \n\t"
266 // duplicate in 4/2/1 & 8pix vsns
267 "vmvn.8 d30, d3 \n\t"
268 "vmlal.u8 q14, d30, d6 \n\t"
269 "vmlal.u8 q13, d30, d5 \n\t"
270 "vmlal.u8 q12, d30, d4 \n\t"
271 "vshr.u16 q8, q14, #5 \n\t"
272 "vshr.u16 q9, q13, #6 \n\t"
273 "vaddhn.u16 d6, q14, q8 \n\t"
274 "vshr.u16 q8, q12, #5 \n\t"
275 "vaddhn.u16 d5, q13, q9 \n\t"
276 "vqadd.u8 d6, d6, d0 \n\t" // moved up
277 "vaddhn.u16 d4, q12, q8 \n\t"
278 // intentionally don't calculate alpha
281 "vqadd.u8 d5, d5, d1 \n\t"
282 "vqadd.u8 d4, d4, d2 \n\t"
284 // pack 8888 {d4-d6} to 0565 q10
285 "vshll.u8 q10, d6, #8 \n\t"
286 "vshll.u8 q3, d5, #8 \n\t"
287 "vshll.u8 q2, d4, #8 \n\t"
288 "vsri.u16 q10, q3, #5 \n\t"
289 "vsri.u16 q10, q2, #11 \n\t"
// Store back 4/2/1 pixels, mirroring the conditional loads above.
292 "tst %[count], #4 \n\t"
294 "vst1.16 {d21}, [%[keep_dst]]! \n\t"
297 "tst %[count], #2 \n\t"
299 "vst1.32 {d20[1]}, [%[keep_dst]]! \n\t"
302 "tst %[count], #1 \n\t"
304 "vst1.16 {d20[1]}, [%[keep_dst]]! \n\t"
307 : [count] "+r" (count)
308 : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src)
309 : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
310 "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
316 static inline uint16x8_t SkDiv255Round_neon8(uint16x8_t prod) {
317 prod += vdupq_n_u16(128);
318 prod += vshrq_n_u16(prod, 8);
319 return vshrq_n_u16(prod, 8);
// Blend a row of 32-bit premultiplied pixels (per-pixel alpha) onto RGB565
// with an additional constant global alpha (< 255). Per channel:
// dst' = (src * alpha + dst * (255 - sa * alpha / 255)) / 255, with the
// final /255 approximated by a rounded >>8 unless S32A_D565_BLEND_EXACT.
// NOTE(review): garbled numbered-paste listing — stray decimal prefixes and
// elided lines (loop framing, vsrc/valpha declarations, asm openers and
// operand tails, #endif lines, tail-loop framing). Restore from upstream.
322 void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
323 const SkPMColor* SK_RESTRICT src, int count,
324 U8CPU alpha, int /*x*/, int /*y*/) {
325 SkASSERT(255 > alpha);
327 /* This code implements a Neon version of S32A_D565_Blend. The results have
328 * a few mismatches compared to the original code. These mismatches never
333 uint16x8_t valpha_max, vmask_blue;
337 valpha_max = vmovq_n_u16(255);
338 valpha = vdup_n_u8(alpha);
339 vmask_blue = vmovq_n_u16(SK_B16_MASK);
342 uint16x8_t vdst, vdst_r, vdst_g, vdst_b;
343 uint16x8_t vres_a, vres_r, vres_g, vres_b;
347 vdst = vld1q_u16(dst);
// GCC > 4.6 accepts the %h multi-register operand modifier for a uint8x8x4_t;
// older compilers fall back to explicitly pinned d0-d3 registers.
348 #if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6))
350 "vld4.u8 %h[vsrc], [%[src]]!"
351 : [vsrc] "=w" (vsrc), [src] "+&r" (src)
355 register uint8x8_t d0 asm("d0");
356 register uint8x8_t d1 asm("d1");
357 register uint8x8_t d2 asm("d2");
358 register uint8x8_t d3 asm("d3");
361 "vld4.u8 {d0-d3},[%[src]]!;"
362 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3),
// Unpack the 565 destination into separate 16-bit channel vectors.
374 vdst_g = vshlq_n_u16(vdst, SK_R16_BITS); // shift green to top of lanes
375 vdst_b = vdst & vmask_blue; // extract blue
376 vdst_r = vshrq_n_u16(vdst, SK_R16_SHIFT); // extract red
377 vdst_g = vshrq_n_u16(vdst_g, SK_R16_BITS + SK_B16_BITS); // extract green
// Truncate src channels from 8 bits to their 565 widths.
380 vsrc.val[NEON_R] = vshr_n_u8(vsrc.val[NEON_R], 8 - SK_R16_BITS);
381 vsrc.val[NEON_G] = vshr_n_u8(vsrc.val[NEON_G], 8 - SK_G16_BITS);
382 vsrc.val[NEON_B] = vshr_n_u8(vsrc.val[NEON_B], 8 - SK_B16_BITS);
384 // calc src * src_scale
385 vres_a = vmull_u8(vsrc.val[NEON_A], valpha);
386 vres_r = vmull_u8(vsrc.val[NEON_R], valpha);
387 vres_g = vmull_u8(vsrc.val[NEON_G], valpha);
388 vres_b = vmull_u8(vsrc.val[NEON_B], valpha);
// dst_scale = 255 - round(sa * alpha / 255).
391 vres_a = SkDiv255Round_neon8(vres_a);
392 vres_a = valpha_max - vres_a; // 255 - (sa * src_scale) / 255
394 // add dst * dst_scale to previous result
395 vres_r = vmlaq_u16(vres_r, vdst_r, vres_a);
396 vres_g = vmlaq_u16(vres_g, vdst_g, vres_a);
397 vres_b = vmlaq_u16(vres_b, vdst_b, vres_a);
399 #ifdef S32A_D565_BLEND_EXACT
400 // It is possible to get exact results with this but it is slow,
401 // even slower than C code in some cases
402 vres_r = SkDiv255Round_neon8(vres_r);
403 vres_g = SkDiv255Round_neon8(vres_g);
404 vres_b = SkDiv255Round_neon8(vres_b);
// Default path: rounded >>8 approximates /255 (source of the documented
// small mismatches vs. the scalar code).
406 vres_r = vrshrq_n_u16(vres_r, 8);
407 vres_g = vrshrq_n_u16(vres_g, 8);
408 vres_b = vrshrq_n_u16(vres_b, 8);
// Repack channels into 565.
411 vres_b = vsliq_n_u16(vres_b, vres_g, SK_G16_SHIFT); // insert green into blue
412 vres_b = vsliq_n_u16(vres_b, vres_r, SK_R16_SHIFT); // insert red into green/blue
415 vst1q_u16(dst, vres_b);
418 } while (count >= 8);
// Scalar tail: exact SkDiv255Round math for the remaining pixels.
422 while (count-- > 0) {
423 SkPMColor sc = *src++;
426 unsigned dst_scale = 255 - SkMulDiv255Round(SkGetPackedA32(sc), alpha);
427 unsigned dr = SkMulS16(SkPacked32ToR16(sc), alpha) + SkMulS16(SkGetPackedR16(dc), dst_scale);
428 unsigned dg = SkMulS16(SkPacked32ToG16(sc), alpha) + SkMulS16(SkGetPackedG16(dc), dst_scale);
429 unsigned db = SkMulS16(SkPacked32ToB16(sc), alpha) + SkMulS16(SkGetPackedB16(dc), dst_scale);
430 *dst = SkPackRGB16(SkDiv255Round(dr), SkDiv255Round(dg), SkDiv255Round(db));
436 /* dither matrix for Neon, derived from gDitherMatrix_3Bit_16.
437 * each dither value is spaced out into byte lanes, and repeated
438 * to allow an 8-byte load from offsets 0, 1, 2 or 3 from the
 * start of each row. */
// 4x4 ordered-dither matrix (values 0..7), with each 4-entry row repeated
// three times so that an 8-byte vld1 starting at column offset (x & 3)
// yields valid dither values for 8 consecutive pixels.
//
// Review note: the pasted listing carried stray line-number prefixes and had
// lost the closing "};" — all 48 initializers were visible, so this
// reconstructs the array verbatim with clean formatting.
static const uint8_t gDitherMatrix_Neon[48] = {
    0, 4, 1, 5, 0, 4, 1, 5, 0, 4, 1, 5,
    6, 2, 7, 3, 6, 2, 7, 3, 6, 2, 7, 3,
    1, 5, 0, 4, 1, 5, 0, 4, 1, 5, 0, 4,
    7, 3, 6, 2, 7, 3, 6, 2, 7, 3, 6, 2,
};
// Blend a row of opaque 32-bit pixels onto RGB565 with a constant global
// alpha AND ordered dithering (gDitherMatrix_Neon row selected by y, column
// phase by x). 8 pixels per NEON iteration, scalar DITHER_VALUE tail.
// NOTE(review): garbled numbered-paste listing — stray decimal prefixes and
// elided lines (loop framing, vsrc_g assignment, asm tails, x-advance,
// tail-loop header). Restore from upstream before compiling.
449 void S32_D565_Blend_Dither_neon(uint16_t *dst, const SkPMColor *src,
450 int count, U8CPU alpha, int x, int y)
453 SkASSERT(255 > alpha);
455 // rescale alpha to range 1 - 256
456 int scale = SkAlpha255To256(alpha);
459 /* select row and offset for dither array */
460 const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
462 uint8x8_t vdither = vld1_u8(dstart); // load dither values
// Green gets half the dither amplitude since it has 6 bits instead of 5.
463 uint8x8_t vdither_g = vshr_n_u8(vdither, 1); // calc. green dither values
465 int16x8_t vscale = vdupq_n_s16(scale); // duplicate scale into neon reg
466 uint16x8_t vmask_b = vdupq_n_u16(0x1F); // set up blue mask
470 uint8x8_t vsrc_r, vsrc_g, vsrc_b;
471 uint8x8_t vsrc565_r, vsrc565_g, vsrc565_b;
472 uint16x8_t vsrc_dit_r, vsrc_dit_g, vsrc_dit_b;
473 uint16x8_t vsrc_res_r, vsrc_res_g, vsrc_res_b;
475 uint16x8_t vdst_r, vdst_g, vdst_b;
476 int16x8_t vres_r, vres_g, vres_b;
477 int8x8_t vres8_r, vres8_g, vres8_b;
479 // Load source and add dither
481 register uint8x8_t d0 asm("d0");
482 register uint8x8_t d1 asm("d1");
483 register uint8x8_t d2 asm("d2");
484 register uint8x8_t d3 asm("d3");
487 "vld4.8 {d0-d3},[%[src]]! /* r=%P0 g=%P1 b=%P2 a=%P3 */"
488 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
492 #if SK_PMCOLOR_BYTE_ORDER(B,G,R,A)
493 vsrc_r = d2; vsrc_b = d0;
494 #elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A)
495 vsrc_r = d0; vsrc_b = d2;
// Dither formula per channel: ((c + dither - (c >> hi_bits)) >> to565).
499 vsrc565_g = vshr_n_u8(vsrc_g, 6); // calc. green >> 6
500 vsrc565_r = vshr_n_u8(vsrc_r, 5); // calc. red >> 5
501 vsrc565_b = vshr_n_u8(vsrc_b, 5); // calc. blue >> 5
503 vsrc_dit_g = vaddl_u8(vsrc_g, vdither_g); // add in dither to green and widen
504 vsrc_dit_r = vaddl_u8(vsrc_r, vdither); // add in dither to red and widen
505 vsrc_dit_b = vaddl_u8(vsrc_b, vdither); // add in dither to blue and widen
507 vsrc_dit_r = vsubw_u8(vsrc_dit_r, vsrc565_r); // sub shifted red from result
508 vsrc_dit_g = vsubw_u8(vsrc_dit_g, vsrc565_g); // sub shifted green from result
509 vsrc_dit_b = vsubw_u8(vsrc_dit_b, vsrc565_b); // sub shifted blue from result
511 vsrc_res_r = vshrq_n_u16(vsrc_dit_r, 3);
512 vsrc_res_g = vshrq_n_u16(vsrc_dit_g, 2);
513 vsrc_res_b = vshrq_n_u16(vsrc_dit_b, 3);
515 // Load dst and unpack
516 vdst = vld1q_u16(dst);
517 vdst_g = vshrq_n_u16(vdst, 5); // shift down to get green
518 vdst_r = vshrq_n_u16(vshlq_n_u16(vdst, 5), 5+5); // double shift to extract red
519 vdst_b = vandq_u16(vdst, vmask_b); // mask to get blue
521 // subtract dst from src and widen
522 vres_r = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_r), vreinterpretq_s16_u16(vdst_r));
523 vres_g = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_g), vreinterpretq_s16_u16(vdst_g));
524 vres_b = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_b), vreinterpretq_s16_u16(vdst_b));
526 // multiply diffs by scale and shift
527 vres_r = vmulq_s16(vres_r, vscale);
528 vres_g = vmulq_s16(vres_g, vscale);
529 vres_b = vmulq_s16(vres_b, vscale);
// Signed narrowing shift keeps the (src - dst) * scale delta sign-correct.
531 vres8_r = vshrn_n_s16(vres_r, 8);
532 vres8_g = vshrn_n_s16(vres_g, 8);
533 vres8_b = vshrn_n_s16(vres_b, 8);
// dst + delta completes the blend: dst + ((src - dst) * scale >> 8).
536 vres_r = vaddw_s8(vreinterpretq_s16_u16(vdst_r), vres8_r);
537 vres_g = vaddw_s8(vreinterpretq_s16_u16(vdst_g), vres8_g);
538 vres_b = vaddw_s8(vreinterpretq_s16_u16(vdst_b), vres8_b);
540 // put result into 565 format
541 vres_b = vsliq_n_s16(vres_b, vres_g, 5); // shift up green and insert into blue
542 vres_b = vsliq_n_s16(vres_b, vres_r, 6+5); // shift up red and insert into blue
545 vst1q_u16(dst, vreinterpretq_u16_s16(vres_b));
551 } while (count >= 8);
// Scalar tail: same dither + blend per pixel via DITHER_VALUE/SkAlphaBlend.
556 int scale = SkAlpha255To256(alpha);
559 SkPMColor c = *src++;
562 int dither = DITHER_VALUE(x);
563 int sr = SkGetPackedR32(c);
564 int sg = SkGetPackedG32(c);
565 int sb = SkGetPackedB32(c);
566 sr = SkDITHER_R32To565(sr, dither);
567 sg = SkDITHER_G32To565(sg, dither);
568 sb = SkDITHER_B32To565(sb, dither);
571 *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), scale),
572 SkAlphaBlend(sg, SkGetPackedG16(d), scale),
573 SkAlphaBlend(sb, SkGetPackedB16(d), scale));
575 } while (--count != 0);
// SrcOver-blend a row of 32-bit premultiplied pixels onto a 32-bit dst:
// dst' = src + dst * (256 - sa) >> 8, vectorized 4 pixels (2 x 2) per
// iteration with a scalar SkPMSrcOver tail.
// NOTE(review): garbled numbered-paste listing — stray decimal prefixes and
// elided lines (UNROLL definition, dst_wide declarations, inner-scope braces,
// pointer advances). Restore from upstream before compiling.
579 void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
580 const SkPMColor* SK_RESTRICT src,
581 int count, U8CPU alpha) {
583 SkASSERT(255 == alpha);
// vtbl mask replicating each pixel's alpha byte (index 3 / 7) across its
// four lanes.
587 uint8x8_t alpha_mask;
589 static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
590 alpha_mask = vld1_u8(alpha_mask_setup);
592 /* do the NEON unrolled code */
594 while (count >= UNROLL) {
595 uint8x8_t src_raw, dst_raw, dst_final;
596 uint8x8_t src_raw_2, dst_raw_2, dst_final_2;
598 /* The two prefetches below may make the code slighlty
599 * slower for small values of count but are worth having
600 * in the general case.
602 __builtin_prefetch(src+32);
603 __builtin_prefetch(dst+32);
// Two pixels per 64-bit load; _2 variants hold the next pair.
606 src_raw = vreinterpret_u8_u32(vld1_u32(src));
608 src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));
611 /* get and hold the dst too */
612 dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
614 dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));
617 /* 1st and 2nd bits of the unrolling */
619 uint8x8_t dst_cooked;
621 uint8x8_t alpha_narrow;
622 uint16x8_t alpha_wide;
624 /* get the alphas spread out properly */
625 alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
// 256 - sa: the +1 reflects SkAlpha255To256 so that sa == 255 scales by 1.
626 alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
628 /* spread the dest */
629 dst_wide = vmovl_u8(dst_raw);
631 /* alpha mul the dest */
632 dst_wide = vmulq_u16 (dst_wide, alpha_wide);
633 dst_cooked = vshrn_n_u16(dst_wide, 8);
635 /* sum -- ignoring any byte lane overflows */
636 dst_final = vadd_u8(src_raw, dst_cooked);
640 /* the 3rd and 4th bits of our unrolling */
642 uint8x8_t dst_cooked;
644 uint8x8_t alpha_narrow;
645 uint16x8_t alpha_wide;
647 alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
648 alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
650 /* spread the dest */
651 dst_wide = vmovl_u8(dst_raw_2);
653 /* alpha mul the dest */
654 dst_wide = vmulq_u16 (dst_wide, alpha_wide);
655 dst_cooked = vshrn_n_u16(dst_wide, 8);
657 /* sum -- ignoring any byte lane overflows */
658 dst_final_2 = vadd_u8(src_raw_2, dst_cooked);
662 vst1_u32(dst, vreinterpret_u32_u8(dst_final));
664 vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));
673 /* do any residual iterations */
674 while (--count >= 0) {
675 *dst = SkPMSrcOver(*src, *dst);
// SrcOver variant that inspects the source alpha and switches between three
// specialized loops: general NEON blend, "all transparent" (skip, no write),
// and "all opaque" (plain copy). A win when the source has long runs of
// fully transparent or fully opaque pixels; can lose when alpha states
// alternate often (see the comment at the dispatch table below).
// Requires SK_A32_SHIFT == 24: the ALPHA_OPAQ/ALPHA_TRANS compares read the
// alpha from the top byte of each packed SkPMColor.
// NOTE(review): garbled numbered-paste listing — stray decimal prefixes and
// elided lines (UNROLL, goto labels, dst_wide declaration, loop/brace
// framing for the state machine). Restore from upstream before compiling.
682 void S32A_Opaque_BlitRow32_neon_src_alpha(SkPMColor* SK_RESTRICT dst,
683 const SkPMColor* SK_RESTRICT src,
684 int count, U8CPU alpha) {
685 SkASSERT(255 == alpha);
690 /* Use these to check if src is transparent or opaque */
// p >= ALPHA_OPAQ  <=> alpha byte == 0xFF; p <= ALPHA_TRANS <=> alpha == 0.
691 const unsigned int ALPHA_OPAQ = 0xFF000000;
692 const unsigned int ALPHA_TRANS = 0x00FFFFFF;
// Stop the specialized loops early enough that 4-pixel lookahead is safe.
695 const SkPMColor* SK_RESTRICT src_end = src + count - (UNROLL + 1);
696 const SkPMColor* SK_RESTRICT src_temp = src;
698 /* set up the NEON variables */
699 uint8x8_t alpha_mask;
700 static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
701 alpha_mask = vld1_u8(alpha_mask_setup);
703 uint8x8_t src_raw, dst_raw, dst_final;
704 uint8x8_t src_raw_2, dst_raw_2, dst_final_2;
705 uint8x8_t dst_cooked;
707 uint8x8_t alpha_narrow;
708 uint16x8_t alpha_wide;
710 /* choose the first processing type */
713 if(*src <= ALPHA_TRANS)
715 if(*src >= ALPHA_OPAQ)
// --- General NEON blend loop (same math as S32A_Opaque_BlitRow32_neon) ---
723 src_raw = vreinterpret_u8_u32(vld1_u32(src));
724 src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));
726 /* get and hold the dst too */
727 dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
728 dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));
731 /* get the alphas spread out properly */
732 alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
733 /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
734 /* we collapsed (255-a)+1 ... */
735 alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
737 /* spread the dest */
738 dst_wide = vmovl_u8(dst_raw);
740 /* alpha mul the dest */
741 dst_wide = vmulq_u16 (dst_wide, alpha_wide);
742 dst_cooked = vshrn_n_u16(dst_wide, 8);
744 /* sum -- ignoring any byte lane overflows */
745 dst_final = vadd_u8(src_raw, dst_cooked);
747 alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
748 /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
749 /* we collapsed (255-a)+1 ... */
750 alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
752 /* spread the dest */
753 dst_wide = vmovl_u8(dst_raw_2);
755 /* alpha mul the dest */
756 dst_wide = vmulq_u16 (dst_wide, alpha_wide);
757 dst_cooked = vshrn_n_u16(dst_wide, 8);
759 /* sum -- ignoring any byte lane overflows */
760 dst_final_2 = vadd_u8(src_raw_2, dst_cooked);
762 vst1_u32(dst, vreinterpret_u32_u8(dst_final));
763 vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));
768 /* if 2 of the next pixels aren't between 1 and 254
769 it might make sense to go to the optimized loops */
770 if((src[0] <= ALPHA_TRANS && src[1] <= ALPHA_TRANS) || (src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ))
773 } while(src < src_end);
778 if(src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ)
// --- Transparent run: advance src, touch dst only once at the end ---
785 /*In this state, we know the current alpha is 0 and
786 we optimize for the next alpha also being zero. */
787 src_temp = src; //so we don't have to increment dst every time
789 if(*(++src) > ALPHA_TRANS)
791 if(*(++src) > ALPHA_TRANS)
793 if(*(++src) > ALPHA_TRANS)
795 if(*(++src) > ALPHA_TRANS)
797 } while(src < src_end);
799 dst += (src - src_temp);
801 /* no longer alpha 0, so determine where to go next. */
804 if(*src >= ALPHA_OPAQ)
// --- Opaque run: 4-wide check (AND of four pixels keeps the top byte 0xFF
// only if all four are opaque), then straight copies ---
810 while((src[0] & src[1] & src[2] & src[3]) >= ALPHA_OPAQ) {
822 if(*src >= ALPHA_OPAQ) { *dst++ = *src++;
823 if(*src >= ALPHA_OPAQ) { *dst++ = *src++;
824 if(*src >= ALPHA_OPAQ) { *dst++ = *src++; }
830 if(*src <= ALPHA_TRANS)
836 /* do any residual iterations */
837 src_end += UNROLL + 1; //goto the real end
838 while(src != src_end) {
840 if( *src >= ALPHA_OPAQ ) {
844 *dst = SkPMSrcOver(*src, *dst);
855 /* Neon version of S32_Blend_BlitRow32()
856 * portable version is in src/core/SkBlitRow_D32.cpp */
// Blend a row of 32-bit pixels onto a 32-bit dst with a constant alpha:
// dst' = (src * src_scale + dst * dst_scale) >> 8, two pixels per NEON
// iteration plus a single-pixel lane-load path for the remainder.
// NOTE(review): garbled numbered-paste listing — stray decimal prefixes and
// elided lines (loop framing, pointer advances, closing braces). Restore
// from upstream before compiling.
858 void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
859 const SkPMColor* SK_RESTRICT src,
860 int count, U8CPU alpha) {
861 SkASSERT(alpha <= 255);
// src_scale in [1, 256]; dst_scale = 256 - src_scale so the two >>8 halves
// sum to (nearly) the full pixel.
867 uint16_t src_scale = SkAlpha255To256(alpha);
868 uint16_t dst_scale = 256 - src_scale;
871 uint8x8_t vsrc, vdst, vres;
872 uint16x8_t vsrc_wide, vdst_wide;
874 /* These commented prefetches are a big win for count
875 * values > 64 on an A9 (Pandaboard) but hurt by 10% for count = 4.
876 * They also hurt a little (<5%) on an A15
878 //__builtin_prefetch(src+32);
879 //__builtin_prefetch(dst+32);
// Two pixels per 64-bit load.
882 vsrc = vreinterpret_u8_u32(vld1_u32(src));
883 vdst = vreinterpret_u8_u32(vld1_u32(dst));
886 vsrc_wide = vmovl_u8(vsrc);
887 vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));
// vmull fuses the widen + multiply for the dst side.
890 vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));
893 vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);
896 vst1_u32(dst, vreinterpret_u32_u8(vres));
// Odd-count remainder: load/store a single pixel through lane 0.
904 uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
905 uint16x8_t vsrc_wide, vdst_wide;
908 vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
909 vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));
912 vsrc_wide = vmovl_u8(vsrc);
913 vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));
914 vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));
915 vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);
918 vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
// Blend a row of 32-bit premultiplied pixels (per-pixel alpha) onto a 32-bit
// dst with an additional constant alpha: per pixel,
//   src_scale = SkAlpha255To256(alpha),
//   dst_scale = 256 - (sa * src_scale >> 8),
//   dst' = (src * src_scale + dst * dst_scale) >> 8.
// Handles an odd leading pixel via lane loads, then two pixels per NEON
// iteration.
// NOTE(review): garbled numbered-paste listing — stray decimal prefixes and
// elided lines (dst_scale declaration, loop framing, pointer advances,
// closing braces). Restore from upstream before compiling.
922 void S32A_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
923 const SkPMColor* SK_RESTRICT src,
924 int count, U8CPU alpha) {
926 SkASSERT(255 >= alpha);
932 unsigned alpha256 = SkAlpha255To256(alpha);
934 // First deal with odd counts
936 uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
937 uint16x8_t vdst_wide, vsrc_wide;
// Single pixel through lane 0 of a zeroed vector.
941 vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
942 vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));
// Lane 3 holds this pixel's alpha byte.
// NOTE(review): index 3 assumes SK_A32_SHIFT == 24 — confirm against the
// guard at the dispatch table / build config.
945 dst_scale = vget_lane_u8(vsrc, 3);
946 dst_scale *= alpha256;
948 dst_scale = 256 - dst_scale;
951 vsrc_wide = vmovl_u8(vsrc);
952 vsrc_wide = vmulq_n_u16(vsrc_wide, alpha256);
955 vdst_wide = vmovl_u8(vdst);
956 vdst_wide = vmulq_n_u16(vdst_wide, dst_scale);
959 vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);
961 vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
// Main loop: replicate each pixel's alpha across its lanes via vtbl.
968 uint8x8_t alpha_mask;
969 static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
970 alpha_mask = vld1_u8(alpha_mask_setup);
974 uint8x8_t vsrc, vdst, vres, vsrc_alphas;
975 uint16x8_t vdst_wide, vsrc_wide, vsrc_scale, vdst_scale;
977 __builtin_prefetch(src+32);
978 __builtin_prefetch(dst+32);
981 vsrc = vreinterpret_u8_u32(vld1_u32(src));
982 vdst = vreinterpret_u8_u32(vld1_u32(dst));
985 vsrc_scale = vdupq_n_u16(alpha256);
// Per-lane dst_scale = 256 - (sa * alpha256 >> 8).
988 vsrc_alphas = vtbl1_u8(vsrc, alpha_mask);
989 vdst_scale = vmovl_u8(vsrc_alphas);
990 vdst_scale *= vsrc_scale;
991 vdst_scale = vshrq_n_u16(vdst_scale, 8);
992 vdst_scale = vsubq_u16(vdupq_n_u16(256), vdst_scale);
995 vsrc_wide = vmovl_u8(vsrc);
996 vsrc_wide *= vsrc_scale;
999 vdst_wide = vmovl_u8(vdst);
1000 vdst_wide *= vdst_scale;
1003 vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);
1005 vst1_u32(dst, vreinterpret_u32_u8(vres));
1014 ///////////////////////////////////////////////////////////////////////////////
1016 #undef DEBUG_OPAQUE_DITHER
1018 #if defined(DEBUG_OPAQUE_DITHER)
// Debug-only (DEBUG_OPAQUE_DITHER) helper: dump `len` bytes at `p` as hex
// with a label, via SkDebugf.
// NOTE(review): garbled numbered-paste listing — stray decimal prefixes and
// elided lines (tbuf/i declarations, strcat into buf, braces). Not
// thread-safe: writes into a function-local static buffer.
1019 static void showme8(char *str, void *p, int len)
1021 static char buf[256];
1024 char *pc = (char*) p;
1025 sprintf(buf,"%8s:", str);
1026 for(i=0;i<len;i++) {
1027 sprintf(tbuf, " %02x", pc[i]);
1030 SkDebugf("%s\n", buf);
// Debug-only (DEBUG_OPAQUE_DITHER) helper: dump a byte region as 16-bit hex
// words with a label, via SkDebugf. `len` arrives in bytes and is converted
// to a word count below. Same static-buffer caveats as showme8.
// NOTE(review): garbled numbered-paste listing — elided lines as in showme8.
1032 static void showme16(char *str, void *p, int len)
1034 static char buf[256];
1037 uint16_t *pc = (uint16_t*) p;
1038 sprintf(buf,"%8s:", str);
1039 len = (len / sizeof(uint16_t)); /* passed as bytes */
1040 for(i=0;i<len;i++) {
1041 sprintf(tbuf, " %04x", pc[i]);
1044 SkDebugf("%s\n", buf);
// SrcOver-blend a row of 32-bit premultiplied pixels (per-pixel alpha) onto
// RGB565 with ordered dithering, 8 pixels per NEON iteration, scalar tail.
// Includes an optional DEBUG_OPAQUE_DITHER self-check that recomputes each
// batch with the scalar algorithm and diffs the results.
// NOTE(review): garbled numbered-paste listing — stray decimal prefixes and
// elided lines (UNROLL, loop framing, dbase declaration, debug-buffer
// declarations, #endif lines, closing braces). Restore from upstream.
1048 void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst,
1049 const SkPMColor* SK_RESTRICT src,
1050 int count, U8CPU alpha, int x, int y) {
1051 SkASSERT(255 == alpha);
1055 if (count >= UNROLL) {
1057 #if defined(DEBUG_OPAQUE_DITHER)
1058 uint16_t tmpbuf[UNROLL];
1063 uint16_t in_dst[UNROLL];
// Row of the dither matrix for this y; column phase from x.
1069 const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
1070 dbase = vld1_u8(dstart);
1073 uint8x8_t sr, sg, sb, sa, d;
1074 uint16x8_t dst8, scale8, alpha8;
1075 uint16x8_t dst_r, dst_g, dst_b;
1077 #if defined(DEBUG_OPAQUE_DITHER)
1078 // calculate 8 elements worth into a temp buffer
1082 SkPMColor* my_src = (SkPMColor*)src;
1083 uint16_t* my_dst = dst;
1086 DITHER_565_SCAN(my_y);
1087 for(i = 0; i < UNROLL; i++) {
1088 SkPMColor c = *my_src++;
1091 unsigned a = SkGetPackedA32(c);
1093 int d = SkAlphaMul(DITHER_VALUE(my_x), SkAlpha255To256(a));
1094 tdv[i] = DITHER_VALUE(my_x);
1096 tap[i] = SkAlpha255To256(a);
1099 unsigned sr = SkGetPackedR32(c);
1100 unsigned sg = SkGetPackedG32(c);
1101 unsigned sb = SkGetPackedB32(c);
1102 sr = SkDITHER_R32_FOR_565(sr, d);
1103 sg = SkDITHER_G32_FOR_565(sg, d);
1104 sb = SkDITHER_B32_FOR_565(sb, d);
1106 uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
1107 uint32_t dst_expanded = SkExpand_rgb_16(*my_dst);
1108 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
1109 // now src and dst expanded are in g:11 r:10 x:1 b:10
1110 tmpbuf[i] = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
1113 tmpbuf[i] = *my_dst;
1114 ta[i] = tdv[i] = td[i] = 0xbeef;
1116 in_dst[i] = *my_dst;
// --- NEON path: de-interleaved load of 8 src pixels into d0-d3 ---
1125 register uint8x8_t d0 asm("d0");
1126 register uint8x8_t d1 asm("d1");
1127 register uint8x8_t d2 asm("d2");
1128 register uint8x8_t d3 asm("d3");
1130 asm ("vld4.8 {d0-d3},[%[src]]! /* r=%P0 g=%P1 b=%P2 a=%P3 */"
1131 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+r" (src)
1134 #if SK_PMCOLOR_BYTE_ORDER(B,G,R,A)
1135 sr = d2; sg = d1; sb = d0; sa = d3;
1136 #elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A)
1137 sr = d0; sg = d1; sb = d2; sa = d3;
1141 /* calculate 'd', which will be 0..7
1142 * dbase[] is 0..7; alpha is 0..256; 16 bits suffice
// d = (dbase * (sa + 1)) >> 8, i.e. per-pixel dither scaled by alpha.
1144 alpha8 = vmovl_u8(dbase);
1145 alpha8 = vmlal_u8(alpha8, sa, dbase);
1146 d = vshrn_n_u16(alpha8, 8); // narrowing too
1148 // sr = sr - (sr>>5) + d
1149 /* watching for 8-bit overflow. d is 0..7; risky range of
1150 * sr is >248; and then (sr>>5) is 7 so it offsets 'd';
1151 * safe as long as we do ((sr-sr>>5) + d)
1153 sr = vsub_u8(sr, vshr_n_u8(sr, 5));
1154 sr = vadd_u8(sr, d);
1156 // sb = sb - (sb>>5) + d
1157 sb = vsub_u8(sb, vshr_n_u8(sb, 5));
1158 sb = vadd_u8(sb, d);
1160 // sg = sg - (sg>>6) + d>>1; similar logic for overflows
1161 sg = vsub_u8(sg, vshr_n_u8(sg, 6));
1162 sg = vadd_u8(sg, vshr_n_u8(d,1));
1164 // need to pick up 8 dst's -- at 16 bits each, 128 bits
1165 dst8 = vld1q_u16(dst);
1166 dst_b = vandq_u16(dst8, vdupq_n_u16(SK_B16_MASK));
1167 dst_g = vshrq_n_u16(vshlq_n_u16(dst8, SK_R16_BITS), SK_R16_BITS + SK_B16_BITS);
1168 dst_r = vshrq_n_u16(dst8, SK_R16_SHIFT); // clearing hi bits
// scale = 256 - sa, then pre-shifted by 3 to fold into the channel math.
1171 scale8 = vsubw_u8(vdupq_n_u16(256), sa);
1173 // combine the addq and mul, save 3 insns
1174 scale8 = vshrq_n_u16(scale8, 3);
1175 dst_b = vmlaq_u16(vshll_n_u8(sb,2), dst_b, scale8);
1176 dst_g = vmlaq_u16(vshll_n_u8(sg,3), dst_g, scale8);
1177 dst_r = vmlaq_u16(vshll_n_u8(sr,2), dst_r, scale8);
// Repack to 565: blue bits 0-4, green 5-10, red 11-15.
1180 dst8 = vshrq_n_u16(dst_b, 5);
1181 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_g, 5), 5);
1182 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_r,5), 11);
1184 vst1q_u16(dst, dst8);
1186 #if defined(DEBUG_OPAQUE_DITHER)
1187 // verify my 8 elements match the temp buffer
1190 static int invocation;
1192 for (i = 0; i < UNROLL; i++) {
1193 if (tmpbuf[i] != dst[i]) {
1198 SkDebugf("BAD S32A_D565_Opaque_Dither_neon(); invocation %d offset %d\n",
1199 invocation, offset);
1200 SkDebugf(" alpha 0x%x\n", alpha);
1201 for (i = 0; i < UNROLL; i++)
1202 SkDebugf("%2d: %s %04x w %04x id %04x s %08x d %04x %04x %04x %04x\n",
1203 i, ((tmpbuf[i] != dst[i])?"BAD":"got"), dst[i], tmpbuf[i],
1204 in_dst[i], src[i-8], td[i], tdv[i], tap[i], ta[i]);
1206 showme16("alpha8", &alpha8, sizeof(alpha8));
1207 showme16("scale8", &scale8, sizeof(scale8));
1208 showme8("d", &d, sizeof(d));
1209 showme16("dst8", &dst8, sizeof(dst8));
1210 showme16("dst_b", &dst_b, sizeof(dst_b));
1211 showme16("dst_g", &dst_g, sizeof(dst_g));
1212 showme16("dst_r", &dst_r, sizeof(dst_r));
1213 showme8("sb", &sb, sizeof(sb));
1214 showme8("sg", &sg, sizeof(sg));
1215 showme8("sr", &sr, sizeof(sr));
1225 // skip x += UNROLL, since it's unchanged mod-4
1226 } while (count >= UNROLL);
// Scalar tail: same expand/dither/compact math, one pixel at a time.
1234 SkPMColor c = *src++;
1237 unsigned a = SkGetPackedA32(c);
1239 // dither and alpha are just temporary variables to work-around
1241 unsigned dither = DITHER_VALUE(x);
1242 unsigned alpha = SkAlpha255To256(a);
1243 int d = SkAlphaMul(dither, alpha);
1245 unsigned sr = SkGetPackedR32(c);
1246 unsigned sg = SkGetPackedG32(c);
1247 unsigned sb = SkGetPackedB32(c);
1248 sr = SkDITHER_R32_FOR_565(sr, d);
1249 sg = SkDITHER_G32_FOR_565(sg, d);
1250 sb = SkDITHER_B32_FOR_565(sb, d);
1252 uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
1253 uint32_t dst_expanded = SkExpand_rgb_16(*dst);
1254 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
1255 // now src and dst expanded are in g:11 r:10 x:1 b:10
1256 *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
1260 } while (--count != 0);
1264 ///////////////////////////////////////////////////////////////////////////////
1266 #undef DEBUG_S32_OPAQUE_DITHER
// Convert a row of opaque 32-bit pixels to RGB565 with ordered dithering,
// 8 pixels per NEON iteration (SkDitherRGB32To565 in the scalar tail).
// Includes an optional DEBUG_S32_OPAQUE_DITHER per-batch self-check.
// NOTE(review): garbled numbered-paste listing — stray decimal prefixes and
// elided lines (d/dst8 declarations, channel assignments after the byte-order
// #if, loop bookkeeping, closing braces). Restore from upstream.
1268 void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst,
1269 const SkPMColor* SK_RESTRICT src,
1270 int count, U8CPU alpha, int x, int y) {
1271 SkASSERT(255 == alpha);
1274 if (count >= UNROLL) {
// Dither row for this y; column phase from x.
1276 const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
1277 d = vld1_u8(dstart);
1279 while (count >= UNROLL) {
1280 uint8x8_t sr, sg, sb;
1281 uint16x8_t dr, dg, db;
1285 register uint8x8_t d0 asm("d0");
1286 register uint8x8_t d1 asm("d1");
1287 register uint8x8_t d2 asm("d2");
1288 register uint8x8_t d3 asm("d3");
// De-interleaving load of 8 pixels, post-incrementing src.
1291 "vld4.8 {d0-d3},[%[src]]! /* r=%P0 g=%P1 b=%P2 a=%P3 */"
1292 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
1296 #if SK_PMCOLOR_BYTE_ORDER(B,G,R,A)
1298 #elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A)
1302 /* XXX: if we want to prefetch, hide it in the above asm()
1303 * using the gcc __builtin_prefetch(), the prefetch will
1304 * fall to the bottom of the loop -- it won't stick up
1305 * at the top of the loop, just after the vld4.
1308 // sr = sr - (sr>>5) + d
// vaddl widens to 16 bits so the +d cannot overflow a byte lane here.
1309 sr = vsub_u8(sr, vshr_n_u8(sr, 5));
1310 dr = vaddl_u8(sr, d);
1312 // sb = sb - (sb>>5) + d
1313 sb = vsub_u8(sb, vshr_n_u8(sb, 5));
1314 db = vaddl_u8(sb, d);
1316 // sg = sg - (sg>>6) + d>>1; similar logic for overflows
1317 sg = vsub_u8(sg, vshr_n_u8(sg, 6));
1318 dg = vaddl_u8(sg, vshr_n_u8(d, 1));
1320 // pack high bits of each into 565 format (rgb, b is lsb)
1321 dst8 = vshrq_n_u16(db, 3);
1322 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dg, 2), 5);
1323 dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dr, 3), 11);
1326 vst1q_u16(dst, dst8);
1328 #if defined(DEBUG_S32_OPAQUE_DITHER)
1329 // always good to know if we generated good results
1331 int i, myx = x, myy = y;
1332 DITHER_565_SCAN(myy);
1333 for (i=0;i<UNROLL;i++) {
1334 // the '!' in the asm block above post-incremented src by the 8 pixels it reads.
1335 SkPMColor c = src[i-8];
1336 unsigned dither = DITHER_VALUE(myx);
1337 uint16_t val = SkDitherRGB32To565(c, dither);
1338 if (val != dst[i]) {
1339 SkDebugf("RBE: src %08x dither %02x, want %04x got %04x dbas[i] %02x\n",
1340 c, dither, val, dst[i], dstart[i]);
1348 // we don't need to increment src as the asm above has already done it
1350 x += UNROLL; // probably superfluous
// Scalar tail for the remaining (<8) pixels.
1359 SkPMColor c = *src++;
1361 SkASSERT(SkGetPackedA32(c) == 255);
1363 unsigned dither = DITHER_VALUE(x);
1364 *dst++ = SkDitherRGB32To565(c, dither);
1366 } while (--count != 0);
// Apply a constant premultiplied color over a row of 32-bit pixels:
// dst = color + src * (256 - SkAlpha255To256(colorA)) >> 8, with fast paths
// for fully transparent (memcpy) and fully opaque (memset) colors, and an
// 8-pixel NEON loop for the general case.
// NOTE(review): garbled numbered-paste listing — stray decimal prefixes and
// elided lines (the `color` parameter line, early-out branch, vector
// declarations, loop framing, #endif lines, closing braces). Restore from
// upstream before compiling.
1370 void Color32_arm_neon(SkPMColor* dst, const SkPMColor* src, int count,
// Transparent color: the result is just the source row.
1378 memcpy(dst, src, count * sizeof(SkPMColor));
1383 unsigned colorA = SkGetPackedA32(color);
1384 if (255 == colorA) {
// Opaque color: the result is the color everywhere.
1385 sk_memset32(dst, color, count);
// General case: scale = 256 - (colorA + 1) applied to every src channel.
1389 unsigned scale = 256 - SkAlpha255To256(colorA);
1395 vcolor = vdupq_n_u32(color);
1397 // scale numerical interval [0-255], so load as 8 bits
1398 vscale = vdup_n_u8(scale);
1401 // load src color, 8 pixels, 4 64 bit registers
1402 // (and increment src).
// GCC > 4.6: %h multi-register operand loads all four d-regs in one asm;
// otherwise four explicit vld1 calls.
1404 #if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6))
1406 "vld1.32 %h[vsrc], [%[src]]!"
1407 : [vsrc] "=w" (vsrc), [src] "+r" (src)
1410 #else // (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6))
1411 vsrc.val[0] = vld1_u32(src);
1412 vsrc.val[1] = vld1_u32(src+2);
1413 vsrc.val[2] = vld1_u32(src+4);
1414 vsrc.val[3] = vld1_u32(src+6);
1418 // multiply long by scale, 64 bits at a time,
1419 // destination into a 128 bit register.
1421 vtmp.val[0] = vmull_u8(vreinterpret_u8_u32(vsrc.val[0]), vscale);
1422 vtmp.val[1] = vmull_u8(vreinterpret_u8_u32(vsrc.val[1]), vscale);
1423 vtmp.val[2] = vmull_u8(vreinterpret_u8_u32(vsrc.val[2]), vscale);
1424 vtmp.val[3] = vmull_u8(vreinterpret_u8_u32(vsrc.val[3]), vscale);
1426 // shift the 128 bit registers, containing the 16
1427 // bit scaled values back to 8 bits, narrowing the
1428 // results to 64 bit registers.
1430 vres.val[0] = vcombine_u8(
1431 vshrn_n_u16(vtmp.val[0], 8),
1432 vshrn_n_u16(vtmp.val[1], 8));
1433 vres.val[1] = vcombine_u8(
1434 vshrn_n_u16(vtmp.val[2], 8),
1435 vshrn_n_u16(vtmp.val[3], 8));
1437 // adding back the color, using 128 bit registers.
1439 vdst.val[0] = vreinterpretq_u32_u8(vres.val[0] +
1440 vreinterpretq_u8_u32(vcolor));
1441 vdst.val[1] = vreinterpretq_u32_u8(vres.val[1] +
1442 vreinterpretq_u8_u32(vcolor));
1444 // store back the 8 calculated pixels (2 128 bit
1445 // registers), and increment dst.
1446 #if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6))
1448 "vst1.32 %h[vdst], [%[dst]]!"
1453 #else // (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6))
1454 vst1q_u32(dst, vdst.val[0]);
1455 vst1q_u32(dst+4, vdst.val[1]);
1460 } while (count >= 8);
// Scalar tail: same formula via SkAlphaMulQ.
1464 *dst = color + SkAlphaMulQ(*src, scale);
1471 ///////////////////////////////////////////////////////////////////////////////
// Dispatch table for 565-destination blits, indexed by SkBlitRow flags
// (per-pixel alpha, global alpha, dither). A NULL entry falls back to the
// portable implementation.
// NOTE(review): garbled numbered-paste listing — interior comment lines and
// the closing "};" have been elided. Restore from upstream.
1473 const SkBlitRow::Proc sk_blitrow_platform_565_procs_arm_neon[] = {
1475 S32_D565_Opaque_neon,
1476 S32_D565_Blend_neon,
1477 S32A_D565_Opaque_neon,
1478 S32A_D565_Blend_neon,
1481 S32_D565_Opaque_Dither_neon,
1482 S32_D565_Blend_Dither_neon,
1483 S32A_D565_Opaque_Dither_neon,
1484 NULL, // S32A_D565_Blend_Dither
// Dispatch table for 32-bit-destination blits, indexed by SkBlitRow flags.
// A NULL entry falls back to the portable implementation.
// NOTE(review): garbled numbered-paste listing — comment-block delimiters,
// #else/#endif lines, and the closing "};" have been elided. Restore from
// upstream.
1487 const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = {
1488 NULL, // S32_Opaque,
1489 S32_Blend_BlitRow32_neon, // S32_Blend,
1491 * We have two choices for S32A_Opaque procs. The one reads the src alpha
1492 * value and attempts to optimize accordingly. The optimization is
1493 * sensitive to the source content and is not a win in all cases. For
1494 * example, if there are a lot of transitions between the alpha states,
1495 * the performance will almost certainly be worse. However, for many
1496 * common cases the performance is equivalent or better than the standard
1497 * case where we do not inspect the src alpha.
// src_alpha variant only when alpha sits in the top byte (see its compares).
1499 #if SK_A32_SHIFT == 24
1500 // This proc assumes the alpha value occupies bits 24-32 of each SkPMColor
1501 S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque,
1503 S32A_Opaque_BlitRow32_neon, // S32A_Opaque,
1505 S32A_Blend_BlitRow32_neon // S32A_Blend