1 #include "SkXfermode.h"
2 #include "SkXfermode_proccoeff.h"
3 #include "SkColorPriv.h"
6 #include "SkColor_opts_neon.h"
7 #include "SkXfermode_opts_arm_neon.h"
9 #define SkAlphaMulAlpha(a, b) SkMulDiv255Round(a, b)
12 ////////////////////////////////////////////////////////////////////////////////
13 // NEONized skia functions
14 ////////////////////////////////////////////////////////////////////////////////
// 8-lane NEON form of SkAlphaMulAlpha / SkMulDiv255Round: widen color*alpha
// to u16, add 128, add the value shifted right by 8 (the usual fast
// approximation of /255 with rounding), then shift-narrow back to u8.
// NOTE(review): the declarations of `tmp`/`ret`, the return statement and the
// closing brace fall in gaps of this chunk (source numbering jumps 16->20,
// 24->29) — confirm against the full file.
16 static inline uint8x8_t SkAlphaMulAlpha_neon8(uint8x8_t color, uint8x8_t alpha) {
20 tmp = vmull_u8(color, alpha);
21 tmp = vaddq_u16(tmp, vdupq_n_u16(128));
22 tmp = vaddq_u16(tmp, vshrq_n_u16(tmp, 8));
24 ret = vshrn_n_u16(tmp, 8);
// Same rounded (color*alpha)/255 as SkAlphaMulAlpha_neon8, but keeps the
// result widened in a uint16x8_t instead of narrowing back to u8 — useful
// when the caller still needs headroom for further arithmetic.
// NOTE(review): declaration of `ret` and the return/closing lines are not
// visible in this chunk (numbering gaps 29->32 and 36->41).
29 static inline uint16x8_t SkAlphaMulAlpha_neon8_16(uint8x8_t color, uint8x8_t alpha) {
32 ret = vmull_u8(color, alpha);
33 ret = vaddq_u16(ret, vdupq_n_u16(128));
34 ret = vaddq_u16(ret, vshrq_n_u16(ret, 8));
36 ret = vshrq_n_u16(ret, 8);
// Rounded /255 of eight 32-bit products split across two int32x4_t halves:
// narrows both halves to one uint16x8_t, applies the +128 / +(x>>8) / >>8
// rounding sequence, and narrows to u8.
// NOTE(review): both the A64 path (vmovn_high_u32, lines 45-46) and the A32
// path (vcombine of two vmovn_u32, lines 48-49) are visible here; the
// #ifdef/#else/#endif lines selecting between them fall in numbering gaps
// of this chunk — only one assignment is compiled in the real file.
41 static inline uint8x8_t SkDiv255Round_neon8_32_8(int32x4_t p1, int32x4_t p2) {
45 tmp = vmovn_high_u32(vmovn_u32(vreinterpretq_u32_s32(p1)),
46 vreinterpretq_u32_s32(p2));
48 tmp = vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(p1)),
49 vmovn_u32(vreinterpretq_u32_s32(p2)));
// Standard rounded divide-by-255: (x + 128 + ((x + 128) >> 8)) >> 8.
52 tmp += vdupq_n_u16(128);
53 tmp += vshrq_n_u16(tmp, 8);
55 return vshrn_n_u16(tmp, 8);
// Rounded /255 on a widened uint16x8_t product, keeping the result in u16:
// prod = (prod + 128 + ((prod + 128) >> 8)) >> 8.
58 static inline uint16x8_t SkDiv255Round_neon8_16_16(uint16x8_t prod) {
59 prod += vdupq_n_u16(128);
60 prod += vshrq_n_u16(prod, 8);
62 return vshrq_n_u16(prod, 8);
// NEON analogue of clamp_div255round: for eight 32-bit values (two int32x4_t
// halves), produce 0 where val <= 0, 255 where val >= 255*255, and the
// rounded val/255 otherwise, as a uint8x8_t.
// NOTE(review): declarations of `cmp16`/`ret`, the insertion of 0 for the
// first comparison, the final return, and the #ifdef lines separating the
// A64 (vmovn_high_u32) and A32 (vcombine) narrowing variants all fall in
// numbering gaps of this chunk.
65 static inline uint8x8_t clamp_div255round_simd8_32(int32x4_t val1, int32x4_t val2) {
67 uint32x4_t cmp1, cmp2;
69 uint8x8_t cmp8, cmp8_1;
// Compare val <= 0 on both halves, then narrow the lane masks 32->16->8 bits.
72 cmp1 = vcleq_s32(val1, vdupq_n_s32(0));
73 cmp2 = vcleq_s32(val2, vdupq_n_s32(0));
75 cmp16 = vmovn_high_u32(vmovn_u32(cmp1), cmp2);
77 cmp16 = vcombine_u16(vmovn_u32(cmp1), vmovn_u32(cmp2));
79 cmp8_1 = vmovn_u16(cmp16);
// Compare val >= 255*255 (the saturation threshold before /255).
85 cmp1 = vcgeq_s32(val1, vdupq_n_s32(255*255));
86 cmp2 = vcgeq_s32(val2, vdupq_n_s32(255*255));
88 cmp16 = vmovn_high_u32(vmovn_u32(cmp1), cmp2);
90 cmp16 = vcombine_u16(vmovn_u32(cmp1), vmovn_u32(cmp2));
92 cmp8 = vmovn_u16(cmp16);
94 // Insert 255 where true
95 ret = vbsl_u8(cmp8, vdup_n_u8(255), ret);
// Unclamped lanes get the rounded val/255.
98 uint8x8_t div = SkDiv255Round_neon8_32_8(val1, val2);
100 // Insert where false and previous test false
101 cmp8 = cmp8 | cmp8_1;
102 ret = vbsl_u8(cmp8, ret, div);
104 // Return the final combination
108 ////////////////////////////////////////////////////////////////////////////////
110 ////////////////////////////////////////////////////////////////////////////////
112 // kSrcATop_Mode, //!< [Da, Sc * Da + (1 - Sa) * Dc]
// Single-pixel SrcATop: result = [Da, Sc*Da + (1-Sa)*Dc]. Computes
// (src*da)/255 and (dst*(255-sa))/255 per channel via the rounded shift
// sequence, adds them (see note), and forces the alpha lane to da.
// NOTE(review): the vdup of `da` into vda, the vsrc+=vdst combination step
// and the trailing brace fall in numbering gaps of this chunk.
113 SkPMColor srcatop_modeproc_neon(SkPMColor src, SkPMColor dst) {
114 unsigned sa = SkGetPackedA32(src);
115 unsigned da = SkGetPackedA32(dst);
116 unsigned isa = 255 - sa;
118 uint8x8_t vda, visa, vsrc, vdst;
121 visa = vdup_n_u8(isa);
// Widen-multiply each byte of src by da, and of dst by (255 - sa).
123 uint16x8_t vsrc_wide, vdst_wide;
124 vsrc_wide = vmull_u8(vda, vreinterpret_u8_u32(vdup_n_u32(src)));
125 vdst_wide = vmull_u8(visa, vreinterpret_u8_u32(vdup_n_u32(dst)));
// Rounded /255 on both widened products.
127 vsrc_wide += vdupq_n_u16(128);
128 vsrc_wide += vshrq_n_u16(vsrc_wide, 8);
130 vdst_wide += vdupq_n_u16(128);
131 vdst_wide += vshrq_n_u16(vdst_wide, 8);
133 vsrc = vshrn_n_u16(vsrc_wide, 8);
134 vdst = vshrn_n_u16(vdst_wide, 8);
// Overwrite the alpha byte (lane 3) with Da per the SrcATop definition.
137 vsrc = vset_lane_u8(da, vsrc, 3);
139 return vget_lane_u32(vreinterpret_u32_u8(vsrc), 0);
142 // kDstATop_Mode, //!< [Sa, Sa * Dc + Sc * (1 - Da)]
// Single-pixel DstATop: result = [Sa, Sa*Dc + Sc*(1-Da)]. Mirror image of
// srcatop_modeproc_neon with the roles of the alphas swapped; the alpha
// lane is forced to sa.
// NOTE(review): the vdup of `sa` into vsa and the vsrc+=vdst combination
// step fall in numbering gaps of this chunk.
143 SkPMColor dstatop_modeproc_neon(SkPMColor src, SkPMColor dst) {
144 unsigned sa = SkGetPackedA32(src);
145 unsigned da = SkGetPackedA32(dst);
146 unsigned ida = 255 - da;
148 uint8x8_t vsa, vida, vsrc, vdst;
151 vida = vdup_n_u8(ida);
// src*(255-da) and dst*sa, widened to u16.
153 uint16x8_t vsrc_wide, vdst_wide;
154 vsrc_wide = vmull_u8(vida, vreinterpret_u8_u32(vdup_n_u32(src)));
155 vdst_wide = vmull_u8(vsa, vreinterpret_u8_u32(vdup_n_u32(dst)));
// Rounded /255 on both widened products.
157 vsrc_wide += vdupq_n_u16(128);
158 vsrc_wide += vshrq_n_u16(vsrc_wide, 8);
160 vdst_wide += vdupq_n_u16(128);
161 vdst_wide += vshrq_n_u16(vdst_wide, 8);
163 vsrc = vshrn_n_u16(vsrc_wide, 8);
164 vdst = vshrn_n_u16(vdst_wide, 8);
// Overwrite the alpha byte (lane 3) with Sa per the DstATop definition.
167 vsrc = vset_lane_u8(sa, vsrc, 3);
169 return vget_lane_u32(vreinterpret_u32_u8(vsrc), 0);
172 // kXor_Mode [Sa + Da - 2 * Sa * Da, Sc * (1 - Da) + (1 - Sa) * Dc]
// Single-pixel Xor: [Sa + Da - 2*Sa*Da, Sc*(1-Da) + (1-Sa)*Dc]. The alpha
// result is computed in scalar code (ret_alpha), the color channels in NEON,
// and the alpha lane is then patched in with vset_lane_u8.
// NOTE(review): the vsrc+=vdst combination step before the lane set falls in
// a numbering gap of this chunk.
173 SkPMColor xor_modeproc_neon(SkPMColor src, SkPMColor dst) {
174 unsigned sa = SkGetPackedA32(src);
175 unsigned da = SkGetPackedA32(dst);
// Scalar alpha: sa + da - 2*SkAlphaMulAlpha(sa, da).
176 unsigned ret_alpha = sa + da - (SkAlphaMulAlpha(sa, da) << 1);
177 unsigned isa = 255 - sa;
178 unsigned ida = 255 - da;
180 uint8x8_t vsrc, vdst, visa, vida;
181 uint16x8_t vsrc_wide, vdst_wide;
183 visa = vdup_n_u8(isa);
184 vida = vdup_n_u8(ida);
185 vsrc = vreinterpret_u8_u32(vdup_n_u32(src));
186 vdst = vreinterpret_u8_u32(vdup_n_u32(dst));
// src*(255-da) and dst*(255-sa), widened to u16.
188 vsrc_wide = vmull_u8(vsrc, vida);
189 vdst_wide = vmull_u8(vdst, visa);
// Rounded /255 on both widened products.
191 vsrc_wide += vdupq_n_u16(128);
192 vsrc_wide += vshrq_n_u16(vsrc_wide, 8);
194 vdst_wide += vdupq_n_u16(128);
195 vdst_wide += vshrq_n_u16(vdst_wide, 8);
197 vsrc = vshrn_n_u16(vsrc_wide, 8);
198 vdst = vshrn_n_u16(vdst_wide, 8);
// Replace the alpha byte (lane 3) with the scalar-computed xor alpha.
202 vsrc = vset_lane_u8(ret_alpha, vsrc, 3);
204 return vget_lane_u32(vreinterpret_u32_u8(vsrc), 0);
// Single-pixel Plus mode: per-byte saturating add of src and dst
// (vqadd_u8 clamps each channel at 255).
208 SkPMColor plus_modeproc_neon(SkPMColor src, SkPMColor dst) {
209 uint8x8_t vsrc, vdst;
210 vsrc = vreinterpret_u8_u32(vdup_n_u32(src));
211 vdst = vreinterpret_u8_u32(vdup_n_u32(dst));
212 vsrc = vqadd_u8(vsrc, vdst);
214 return vget_lane_u32(vreinterpret_u32_u8(vsrc), 0);
// Single-pixel Modulate mode: per-byte rounded (src*dst)/255 using the
// widen-multiply + (+128, +>>8, >>8) sequence.
218 SkPMColor modulate_modeproc_neon(SkPMColor src, SkPMColor dst) {
219 uint8x8_t vsrc, vdst, vres;
220 uint16x8_t vres_wide;
222 vsrc = vreinterpret_u8_u32(vdup_n_u32(src));
223 vdst = vreinterpret_u8_u32(vdup_n_u32(dst));
225 vres_wide = vmull_u8(vsrc, vdst);
// Rounded /255.
227 vres_wide += vdupq_n_u16(128);
228 vres_wide += vshrq_n_u16(vres_wide, 8);
230 vres = vshrn_n_u16(vres_wide, 8);
232 return vget_lane_u32(vreinterpret_u32_u8(vres), 0);
235 ////////////////////////////////////////////////////////////////////////////////
236 // 8 pixels modeprocs
237 ////////////////////////////////////////////////////////////////////////////////
// 8-pixel DstOver: ret = dst + src * (256 - Da) / 256 on every channel,
// using the planar uint8x8x4_t layout (one vector per A/R/G/B plane).
// NOTE(review): the declaration of `ret` and the return/closing lines fall
// in numbering gaps of this chunk.
239 uint8x8x4_t dstover_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) {
241 uint16x8_t src_scale;
// scale = 256 - Da, widened to u16 per lane.
243 src_scale = vsubw_u8(vdupq_n_u16(256), dst.val[NEON_A]);
245 ret.val[NEON_A] = dst.val[NEON_A] + SkAlphaMul_neon8(src.val[NEON_A], src_scale);
246 ret.val[NEON_R] = dst.val[NEON_R] + SkAlphaMul_neon8(src.val[NEON_R], src_scale);
247 ret.val[NEON_G] = dst.val[NEON_G] + SkAlphaMul_neon8(src.val[NEON_G], src_scale);
248 ret.val[NEON_B] = dst.val[NEON_B] + SkAlphaMul_neon8(src.val[NEON_B], src_scale);
// 8-pixel SrcIn: every src channel scaled by Da (converted 255->256 range
// via SkAlpha255To256_neon8 so the multiply is a simple shift-out).
// NOTE(review): declarations of `ret`/`scale` and the return fall in
// numbering gaps of this chunk.
253 uint8x8x4_t srcin_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) {
257 scale = SkAlpha255To256_neon8(dst.val[NEON_A]);
259 ret.val[NEON_A] = SkAlphaMul_neon8(src.val[NEON_A], scale);
260 ret.val[NEON_R] = SkAlphaMul_neon8(src.val[NEON_R], scale);
261 ret.val[NEON_G] = SkAlphaMul_neon8(src.val[NEON_G], scale);
262 ret.val[NEON_B] = SkAlphaMul_neon8(src.val[NEON_B], scale);
// 8-pixel DstIn: dst scaled by Sa (as a 0..256 scale) across all planes.
267 uint8x8x4_t dstin_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) {
271 scale = SkAlpha255To256_neon8(src.val[NEON_A]);
273 ret = SkAlphaMulQ_neon8(dst, scale);
// 8-pixel SrcOut: src scaled by (256 - Da) across all planes.
278 uint8x8x4_t srcout_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) {
280 uint16x8_t scale = vsubw_u8(vdupq_n_u16(256), dst.val[NEON_A]);
282 ret = SkAlphaMulQ_neon8(src, scale);
// 8-pixel DstOut: dst scaled by (256 - Sa) across all planes.
287 uint8x8x4_t dstout_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) {
289 uint16x8_t scale = vsubw_u8(vdupq_n_u16(256), src.val[NEON_A]);
291 ret = SkAlphaMulQ_neon8(dst, scale);
// 8-pixel SrcATop: alpha = Da; color = Sc*Da/255 + Dc*(255-Sa)/255,
// both terms via the rounding SkAlphaMulAlpha_neon8 helper.
// NOTE(review): declarations of `ret`/`isa` and the return fall in
// numbering gaps of this chunk.
296 uint8x8x4_t srcatop_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) {
300 isa = vsub_u8(vdup_n_u8(255), src.val[NEON_A]);
302 ret.val[NEON_A] = dst.val[NEON_A];
303 ret.val[NEON_R] = SkAlphaMulAlpha_neon8(src.val[NEON_R], dst.val[NEON_A])
304 + SkAlphaMulAlpha_neon8(dst.val[NEON_R], isa);
305 ret.val[NEON_G] = SkAlphaMulAlpha_neon8(src.val[NEON_G], dst.val[NEON_A])
306 + SkAlphaMulAlpha_neon8(dst.val[NEON_G], isa);
307 ret.val[NEON_B] = SkAlphaMulAlpha_neon8(src.val[NEON_B], dst.val[NEON_A])
308 + SkAlphaMulAlpha_neon8(dst.val[NEON_B], isa);
// 8-pixel DstATop: alpha = Sa; color = Sc*(255-Da)/255 + Dc*Sa/255 —
// the mirror of srcatop_modeproc_neon8.
313 uint8x8x4_t dstatop_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) {
317 ida = vsub_u8(vdup_n_u8(255), dst.val[NEON_A]);
319 ret.val[NEON_A] = src.val[NEON_A];
320 ret.val[NEON_R] = SkAlphaMulAlpha_neon8(src.val[NEON_R], ida)
321 + SkAlphaMulAlpha_neon8(dst.val[NEON_R], src.val[NEON_A]);
322 ret.val[NEON_G] = SkAlphaMulAlpha_neon8(src.val[NEON_G], ida)
323 + SkAlphaMulAlpha_neon8(dst.val[NEON_G], src.val[NEON_A]);
324 ret.val[NEON_B] = SkAlphaMulAlpha_neon8(src.val[NEON_B], ida)
325 + SkAlphaMulAlpha_neon8(dst.val[NEON_B], src.val[NEON_A]);
// 8-pixel Xor: alpha = Sa + Da - 2*Sa*Da/255 computed in u16 to avoid
// overflow; colors = Sc*(255-Da)/255 + Dc*(255-Sa)/255.
330 uint8x8x4_t xor_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) {
333 uint16x8_t tmp_wide, tmp_wide2;
335 isa = vsub_u8(vdup_n_u8(255), src.val[NEON_A]);
336 ida = vsub_u8(vdup_n_u8(255), dst.val[NEON_A]);
// Alpha: (Sa + Da) - 2 * SkAlphaMulAlpha(Sa, Da), widened so the
// intermediate sum can exceed 255 before the subtraction.
339 tmp_wide = vmovl_u8(src.val[NEON_A]);
340 tmp_wide = vaddw_u8(tmp_wide, dst.val[NEON_A]);
341 tmp_wide2 = vshll_n_u8(SkAlphaMulAlpha_neon8(src.val[NEON_A], dst.val[NEON_A]), 1);
342 tmp_wide = vsubq_u16(tmp_wide, tmp_wide2);
343 ret.val[NEON_A] = vmovn_u16(tmp_wide);
// Color channels: same form as srcatop/dstatop with both inverse alphas.
346 ret.val[NEON_R] = SkAlphaMulAlpha_neon8(src.val[NEON_R], ida)
347 + SkAlphaMulAlpha_neon8(dst.val[NEON_R], isa);
348 ret.val[NEON_G] = SkAlphaMulAlpha_neon8(src.val[NEON_G], ida)
349 + SkAlphaMulAlpha_neon8(dst.val[NEON_G], isa);
350 ret.val[NEON_B] = SkAlphaMulAlpha_neon8(src.val[NEON_B], ida)
351 + SkAlphaMulAlpha_neon8(dst.val[NEON_B], isa);
// 8-pixel Plus: saturating per-channel add on each plane.
356 uint8x8x4_t plus_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) {
359 ret.val[NEON_A] = vqadd_u8(src.val[NEON_A], dst.val[NEON_A]);
360 ret.val[NEON_R] = vqadd_u8(src.val[NEON_R], dst.val[NEON_R]);
361 ret.val[NEON_G] = vqadd_u8(src.val[NEON_G], dst.val[NEON_G]);
362 ret.val[NEON_B] = vqadd_u8(src.val[NEON_B], dst.val[NEON_B]);
// 8-pixel Modulate: rounded (src*dst)/255 per channel on each plane.
367 uint8x8x4_t modulate_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) {
370 ret.val[NEON_A] = SkAlphaMulAlpha_neon8(src.val[NEON_A], dst.val[NEON_A]);
371 ret.val[NEON_R] = SkAlphaMulAlpha_neon8(src.val[NEON_R], dst.val[NEON_R]);
372 ret.val[NEON_G] = SkAlphaMulAlpha_neon8(src.val[NEON_G], dst.val[NEON_G]);
373 ret.val[NEON_B] = SkAlphaMulAlpha_neon8(src.val[NEON_B], dst.val[NEON_B]);
// Per-channel screen/srcover formula: a + b - rounded(a*b/255), computed in
// u16 so a + b cannot overflow before the subtraction, then narrowed.
378 static inline uint8x8_t srcover_color(uint8x8_t a, uint8x8_t b) {
381 tmp = vaddl_u8(a, b);
382 tmp -= SkAlphaMulAlpha_neon8_16(a, b);
384 return vmovn_u16(tmp);
// 8-pixel Screen: applies srcover_color (a + b - a*b/255) to every plane.
387 uint8x8x4_t screen_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) {
390 ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]);
391 ret.val[NEON_R] = srcover_color(src.val[NEON_R], dst.val[NEON_R]);
392 ret.val[NEON_G] = srcover_color(src.val[NEON_G], dst.val[NEON_G]);
393 ret.val[NEON_B] = srcover_color(src.val[NEON_B], dst.val[NEON_B]);
// Shared per-channel kernel for Overlay and Hardlight. The two modes use the
// same COM/SUB decomposition (see the comment below); the `overlay` template
// flag selects which operand pair drives the 2*x term and the branch test
// (2*dc <= da for overlay, 2*sc <= sa for hardlight).
// NOTE(review): several #if/#else/#endif pairs selecting between A64
// (vmull_high_u16 / vaddl_high_u16 / vmovl_high_*) and A32 (vget_high +
// non-high intrinsic) variants fall in numbering gaps of this chunk, as do
// declarations such as `dc2`/`sc2`/`cmp` — both variants appear back to
// back below but only one is compiled in the real file.
398 template <bool overlay>
399 static inline uint8x8_t overlay_hardlight_color(uint8x8_t sc, uint8x8_t dc,
400 uint8x8_t sa, uint8x8_t da) {
402 * In the end we're gonna use (rc + tmp) with a different rc
403 * coming from an alternative.
404 * The whole value (rc + tmp) can always be expressed as
405 * VAL = COM - SUB in the if case
406 * VAL = COM + SUB - sa*da in the else case
408 * with COM = 255 * (sc + dc)
409 * and SUB = sc*da + dc*sa - 2*dc*sc
412 // Prepare common subexpressions
413 uint16x8_t const255 = vdupq_n_u16(255);
414 uint16x8_t sc_plus_dc = vaddl_u8(sc, dc);
415 uint16x8_t scda = vmull_u8(sc, da);
416 uint16x8_t dcsa = vmull_u8(dc, sa);
417 uint16x8_t sada = vmull_u8(sa, da);
419 // Prepare non common subexpressions
421 uint32x4_t scdc2_1, scdc2_2;
// Overlay branch: 2*dc*sc built from dc2 = 2*dc.
423 dc2 = vshll_n_u8(dc, 1);
424 scdc2_1 = vmull_u16(vget_low_u16(dc2), vget_low_u16(vmovl_u8(sc)));
426 scdc2_2 = vmull_high_u16(dc2, vmovl_u8(sc));
428 scdc2_2 = vmull_u16(vget_high_u16(dc2), vget_high_u16(vmovl_u8(sc)));
// Hardlight branch: same product built from sc2 = 2*sc.
431 sc2 = vshll_n_u8(sc, 1);
432 scdc2_1 = vmull_u16(vget_low_u16(sc2), vget_low_u16(vmovl_u8(dc)));
434 scdc2_2 = vmull_high_u16(sc2, vmovl_u8(dc));
436 scdc2_2 = vmull_u16(vget_high_u16(sc2), vget_high_u16(vmovl_u8(dc)));
// COM = 255 * (sc + dc), widened to two s32 quads.
441 int32x4_t com1, com2;
442 com1 = vreinterpretq_s32_u32(
443 vmull_u16(vget_low_u16(const255), vget_low_u16(sc_plus_dc)));
444 com2 = vreinterpretq_s32_u32(
446 vmull_high_u16(const255, sc_plus_dc));
448 vmull_u16(vget_high_u16(const255), vget_high_u16(sc_plus_dc)));
// SUB = sc*da + dc*sa - 2*sc*dc, in two s32 quads.
452 int32x4_t sub1, sub2;
453 sub1 = vreinterpretq_s32_u32(vaddl_u16(vget_low_u16(scda), vget_low_u16(dcsa)));
455 sub2 = vreinterpretq_s32_u32(vaddl_high_u16(scda, dcsa));
457 sub2 = vreinterpretq_s32_u32(vaddl_u16(vget_high_u16(scda), vget_high_u16(dcsa)));
459 sub1 = vsubq_s32(sub1, vreinterpretq_s32_u32(scdc2_1));
460 sub2 = vsubq_s32(sub2, vreinterpretq_s32_u32(scdc2_2));
462 // Compare 2*dc <= da
466 cmp = vcleq_u16(dc2, vmovl_u8(da));
// (hardlight variant of the same test: 2*sc <= sa)
468 cmp = vcleq_u16(sc2, vmovl_u8(sa));
472 int32x4_t val1_1, val1_2;
473 int32x4_t val2_1, val2_2;
474 uint32x4_t cmp1, cmp2;
476 // Doing a signed lengthening allows to save a few instructions
477 // thanks to sign extension.
478 cmp1 = vreinterpretq_u32_s32(vmovl_s16(vreinterpret_s16_u16(vget_low_u16(cmp))));
480 cmp2 = vreinterpretq_u32_s32(vmovl_high_s16(vreinterpretq_s16_u16(cmp)));
482 cmp2 = vreinterpretq_u32_s32(vmovl_s16(vreinterpret_s16_u16(vget_high_u16(cmp))));
// "if" case value: COM - SUB.
486 val1_1 = com1 - sub1;
487 val1_2 = com2 - sub2;
489 // Calc COM + SUB - sa*da
490 val2_1 = com1 + sub1;
491 val2_2 = com2 + sub2;
493 val2_1 = vsubq_s32(val2_1, vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(sada))));
495 val2_2 = vsubq_s32(val2_2, vreinterpretq_s32_u32(vmovl_high_u16(sada)));
497 val2_2 = vsubq_s32(val2_2, vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(sada))));
500 // Insert where needed
501 val1_1 = vbslq_s32(cmp1, val1_1, val2_1);
502 val1_2 = vbslq_s32(cmp2, val1_2, val2_2);
504 // Call the clamp_div255round function
505 return clamp_div255round_simd8_32(val1_1, val1_2);
// Overlay = overlay_hardlight_color with the overlay branch selection.
508 static inline uint8x8_t overlay_color(uint8x8_t sc, uint8x8_t dc,
509 uint8x8_t sa, uint8x8_t da) {
510 return overlay_hardlight_color<true>(sc, dc, sa, da);
// 8-pixel Overlay: srcover alpha, overlay_color on R/G/B planes.
513 uint8x8x4_t overlay_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) {
516 ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]);
517 ret.val[NEON_R] = overlay_color(src.val[NEON_R], dst.val[NEON_R],
518 src.val[NEON_A], dst.val[NEON_A]);
519 ret.val[NEON_G] = overlay_color(src.val[NEON_G], dst.val[NEON_G],
520 src.val[NEON_A], dst.val[NEON_A]);
521 ret.val[NEON_B] = overlay_color(src.val[NEON_B], dst.val[NEON_B],
522 src.val[NEON_A], dst.val[NEON_A]);
// Shared per-channel kernel for Lighten and Darken: compares sc*da against
// dc*sa (vcgtq for lighten, vcltq for darken — only one is compiled; the
// #if lines selecting between lines 538/540 fall in a numbering gap) and
// selects between the two symmetric (sc+dc) - round(x/255) candidates.
// NOTE(review): the computation of `tmp2`'s base value (before line 549)
// is also in a gap.
527 template <bool lighten>
528 static inline uint8x8_t lighten_darken_color(uint8x8_t sc, uint8x8_t dc,
529 uint8x8_t sa, uint8x8_t da) {
530 uint16x8_t sd, ds, cmp, tmp, tmp2;
// Widened cross products sc*da and dc*sa.
533 sd = vmull_u8(sc, da);
534 ds = vmull_u8(dc, sa);
538 cmp = vcgtq_u16(sd, ds);
540 cmp = vcltq_u16(sd, ds);
// Candidate values: (sc + dc) minus the rounded /255 of one cross product.
544 tmp = vaddl_u8(sc, dc);
546 tmp -= SkDiv255Round_neon8_16_16(ds);
549 tmp2 -= SkDiv255Round_neon8_16_16(sd);
551 // Insert where needed
552 tmp = vbslq_u16(cmp, tmp, tmp2);
554 return vmovn_u16(tmp);
// Darken = lighten_darken_color with the darken comparison.
557 static inline uint8x8_t darken_color(uint8x8_t sc, uint8x8_t dc,
558 uint8x8_t sa, uint8x8_t da) {
559 return lighten_darken_color<false>(sc, dc, sa, da);
// 8-pixel Darken: srcover alpha, darken_color on R/G/B planes.
562 uint8x8x4_t darken_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) {
565 ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]);
566 ret.val[NEON_R] = darken_color(src.val[NEON_R], dst.val[NEON_R],
567 src.val[NEON_A], dst.val[NEON_A]);
568 ret.val[NEON_G] = darken_color(src.val[NEON_G], dst.val[NEON_G],
569 src.val[NEON_A], dst.val[NEON_A]);
570 ret.val[NEON_B] = darken_color(src.val[NEON_B], dst.val[NEON_B],
571 src.val[NEON_A], dst.val[NEON_A]);
// Lighten = lighten_darken_color with the lighten comparison.
576 static inline uint8x8_t lighten_color(uint8x8_t sc, uint8x8_t dc,
577 uint8x8_t sa, uint8x8_t da) {
578 return lighten_darken_color<true>(sc, dc, sa, da);
// 8-pixel Lighten: srcover alpha, lighten_color on R/G/B planes.
581 uint8x8x4_t lighten_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) {
584 ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]);
585 ret.val[NEON_R] = lighten_color(src.val[NEON_R], dst.val[NEON_R],
586 src.val[NEON_A], dst.val[NEON_A]);
587 ret.val[NEON_G] = lighten_color(src.val[NEON_G], dst.val[NEON_G],
588 src.val[NEON_A], dst.val[NEON_A]);
589 ret.val[NEON_B] = lighten_color(src.val[NEON_B], dst.val[NEON_B],
590 src.val[NEON_A], dst.val[NEON_A]);
// Hardlight = overlay_hardlight_color with the hardlight branch selection.
595 static inline uint8x8_t hardlight_color(uint8x8_t sc, uint8x8_t dc,
596 uint8x8_t sa, uint8x8_t da) {
597 return overlay_hardlight_color<false>(sc, dc, sa, da);
// 8-pixel Hardlight: srcover alpha, hardlight_color on R/G/B planes.
600 uint8x8x4_t hardlight_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) {
603 ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]);
604 ret.val[NEON_R] = hardlight_color(src.val[NEON_R], dst.val[NEON_R],
605 src.val[NEON_A], dst.val[NEON_A]);
606 ret.val[NEON_G] = hardlight_color(src.val[NEON_G], dst.val[NEON_G],
607 src.val[NEON_A], dst.val[NEON_A]);
608 ret.val[NEON_B] = hardlight_color(src.val[NEON_B], dst.val[NEON_B],
609 src.val[NEON_A], dst.val[NEON_A]);
// Per-channel Difference: (sc + dc) - 2*round(min(sc*da, dc*sa)/255),
// done in signed s16 so the subtraction can go negative, then clamped to
// [0, 255] before narrowing.
// NOTE(review): the declaration of `val` falls in a numbering gap.
614 static inline uint8x8_t difference_color(uint8x8_t sc, uint8x8_t dc,
615 uint8x8_t sa, uint8x8_t da) {
616 uint16x8_t sd, ds, tmp;
619 sd = vmull_u8(sc, da);
620 ds = vmull_u8(dc, sa);
// 2 * min(sc*da, dc*sa) / 255, rounded.
622 tmp = vminq_u16(sd, ds);
623 tmp = SkDiv255Round_neon8_16_16(tmp);
624 tmp = vshlq_n_u16(tmp, 1);
626 val = vreinterpretq_s16_u16(vaddl_u8(sc, dc));
628 val -= vreinterpretq_s16_u16(tmp);
// Clamp to the valid channel range.
630 val = vmaxq_s16(val, vdupq_n_s16(0));
631 val = vminq_s16(val, vdupq_n_s16(255));
633 return vmovn_u16(vreinterpretq_u16_s16(val));
// 8-pixel Difference: srcover alpha, difference_color on R/G/B planes.
636 uint8x8x4_t difference_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) {
639 ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]);
640 ret.val[NEON_R] = difference_color(src.val[NEON_R], dst.val[NEON_R],
641 src.val[NEON_A], dst.val[NEON_A]);
642 ret.val[NEON_G] = difference_color(src.val[NEON_G], dst.val[NEON_G],
643 src.val[NEON_A], dst.val[NEON_A]);
644 ret.val[NEON_B] = difference_color(src.val[NEON_B], dst.val[NEON_B],
645 src.val[NEON_A], dst.val[NEON_A]);
// Per-channel Exclusion: 255*(sc + dc) - 2*sc*dc, in 32-bit halves, then
// clamp_div255round to finish. (sa/da are unused by the simplified formula.)
// NOTE(review): the #if lines selecting between the A64 *_high_* variants
// (lines 669, 677) and the A32 vget_high variants (lines 671, 679) fall in
// numbering gaps of this chunk.
650 static inline uint8x8_t exclusion_color(uint8x8_t sc, uint8x8_t dc,
651 uint8x8_t sa, uint8x8_t da) {
652 /* The equation can be simplified to 255(sc + dc) - 2 * sc * dc */
654 uint16x8_t sc_plus_dc, scdc, const255;
655 int32x4_t term1_1, term1_2, term2_1, term2_2;
657 /* Calc (sc + dc) and (sc * dc) */
658 sc_plus_dc = vaddl_u8(sc, dc);
659 scdc = vmull_u8(sc, dc);
661 /* Prepare constants */
662 const255 = vdupq_n_u16(255);
664 /* Calc the first term */
665 term1_1 = vreinterpretq_s32_u32(
666 vmull_u16(vget_low_u16(const255), vget_low_u16(sc_plus_dc)));
667 term1_2 = vreinterpretq_s32_u32(
669 vmull_high_u16(const255, sc_plus_dc));
671 vmull_u16(vget_high_u16(const255), vget_high_u16(sc_plus_dc)));
674 /* Calc the second term */
675 term2_1 = vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(scdc), 1));
677 term2_2 = vreinterpretq_s32_u32(vshll_high_n_u16(scdc, 1));
679 term2_2 = vreinterpretq_s32_u32(vshll_n_u16(vget_high_u16(scdc), 1));
682 return clamp_div255round_simd8_32(term1_1 - term2_1, term1_2 - term2_2);
// 8-pixel Exclusion: srcover alpha, exclusion_color on R/G/B planes.
685 uint8x8x4_t exclusion_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) {
688 ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]);
689 ret.val[NEON_R] = exclusion_color(src.val[NEON_R], dst.val[NEON_R],
690 src.val[NEON_A], dst.val[NEON_A]);
691 ret.val[NEON_G] = exclusion_color(src.val[NEON_G], dst.val[NEON_G],
692 src.val[NEON_A], dst.val[NEON_A]);
693 ret.val[NEON_B] = exclusion_color(src.val[NEON_B], dst.val[NEON_B],
694 src.val[NEON_A], dst.val[NEON_A]);
// Per-channel Multiply (with alpha terms): sc*(255-da) + dc*(255-sa) + sc*dc,
// accumulated in 32-bit halves and finished with clamp_div255round.
// NOTE(review): the #if lines selecting between the A64 *_high_* variants
// (lines 710, 717) and the A32 vget_high variants (lines 712, 719) fall in
// numbering gaps of this chunk.
699 static inline uint8x8_t blendfunc_multiply_color(uint8x8_t sc, uint8x8_t dc,
700 uint8x8_t sa, uint8x8_t da) {
701 uint32x4_t val1, val2;
702 uint16x8_t scdc, t1, t2;
// The three u16 partial products.
704 t1 = vmull_u8(sc, vdup_n_u8(255) - da);
705 t2 = vmull_u8(dc, vdup_n_u8(255) - sa);
706 scdc = vmull_u8(sc, dc);
// Sum t1 + t2 into u32 halves, then add scdc.
708 val1 = vaddl_u16(vget_low_u16(t1), vget_low_u16(t2));
710 val2 = vaddl_high_u16(t1, t2);
712 val2 = vaddl_u16(vget_high_u16(t1), vget_high_u16(t2));
715 val1 = vaddw_u16(val1, vget_low_u16(scdc));
717 val2 = vaddw_high_u16(val2, scdc);
719 val2 = vaddw_u16(val2, vget_high_u16(scdc));
722 return clamp_div255round_simd8_32(
723 vreinterpretq_s32_u32(val1), vreinterpretq_s32_u32(val2));
// 8-pixel Multiply: srcover alpha, blendfunc_multiply_color on R/G/B planes.
726 uint8x8x4_t multiply_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) {
729 ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]);
730 ret.val[NEON_R] = blendfunc_multiply_color(src.val[NEON_R], dst.val[NEON_R],
731 src.val[NEON_A], dst.val[NEON_A]);
732 ret.val[NEON_G] = blendfunc_multiply_color(src.val[NEON_G], dst.val[NEON_G],
733 src.val[NEON_A], dst.val[NEON_A]);
734 ret.val[NEON_B] = blendfunc_multiply_color(src.val[NEON_B], dst.val[NEON_B],
735 src.val[NEON_A], dst.val[NEON_A]);
740 ////////////////////////////////////////////////////////////////////////////////
742 typedef uint8x8x4_t (*SkXfermodeProcSIMD)(uint8x8x4_t src, uint8x8x4_t dst);
744 extern SkXfermodeProcSIMD gNEONXfermodeProcs[];
// Deserializing constructor: after the base class restores the mode, look up
// the matching 8-pixel SIMD proc from the table (may store NULL for modes
// without a NEON implementation — see the table's NULL entries below).
746 SkNEONProcCoeffXfermode::SkNEONProcCoeffXfermode(SkReadBuffer& buffer)
747 : INHERITED(buffer) {
748 fProcSIMD = reinterpret_cast<void*>(gNEONXfermodeProcs[this->getMode()]);
// 32-bit blit entry point: processes pixels 8 at a time through the SIMD
// proc when possible, falling back to the scalar per-pixel proc; the
// anti-aliased (aa != NULL) path interpolates the blended result with dst.
// NOTE(review): this chunk is missing many interleaved lines — the
// #ifdef SK_CPU_ARM32 / compiler-version / #else / #endif directives around
// the inline-asm variants, the `while (count >= 8)` loop framing, the
// aa==NULL test, the asm operand/clobber tails, and several closing braces
// all fall in numbering gaps. The visible code shows three load strategies
// (GCC >4.6 vld4 with %h operands; register-pinned d0-d7 vld4 for older
// GCC; ld4/st4 + indirect call on A64) feeding procSIMD, then vst4_u8.
751 void SkNEONProcCoeffXfermode::xfer32(SkPMColor* SK_RESTRICT dst,
752 const SkPMColor* SK_RESTRICT src, int count,
753 const SkAlpha* SK_RESTRICT aa) const {
754 SkASSERT(dst && src && count >= 0);
756 SkXfermodeProc proc = this->getProc();
757 SkXfermodeProcSIMD procSIMD = reinterpret_cast<SkXfermodeProcSIMD>(fProcSIMD);
758 SkASSERT(procSIMD != NULL);
761 // Unrolled NEON code
762 // We'd like to just do this (modulo a few casts):
763 // vst4_u8(dst, procSIMD(vld4_u8(src), vld4_u8(dst)));
766 // but that tends to generate miserable code. Here are a bunch of faster
767 // workarounds for different architectures and compilers.
771 uint8x8x4_t vsrc, vdst, vres;
// Newer-GCC ARM32 path: vld4 directly into quad-register operands.
772 #if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6))
774 "vld4.u8 %h[vsrc], [%[src]]! \t\n"
775 "vld4.u8 %h[vdst], [%[dst]] \t\n"
776 : [vsrc] "=w" (vsrc), [vdst] "=w" (vdst), [src] "+&r" (src)
// Older-GCC ARM32 path: pin d0-d7 explicitly and copy into the structs.
781 register uint8x8_t d0 asm("d0");
782 register uint8x8_t d1 asm("d1");
783 register uint8x8_t d2 asm("d2");
784 register uint8x8_t d3 asm("d3");
785 register uint8x8_t d4 asm("d4");
786 register uint8x8_t d5 asm("d5");
787 register uint8x8_t d6 asm("d6");
788 register uint8x8_t d7 asm("d7");
791 "vld4.u8 {d0-d3},[%[src]]!;"
792 "vld4.u8 {d4-d7},[%[dst]];"
793 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3),
794 "=w" (d4), "=w" (d5), "=w" (d6), "=w" (d7),
799 vsrc.val[0] = d0; vdst.val[0] = d4;
800 vsrc.val[1] = d1; vdst.val[1] = d5;
801 vsrc.val[2] = d2; vdst.val[2] = d6;
802 vsrc.val[3] = d3; vdst.val[3] = d7;
805 vres = procSIMD(vsrc, vdst);
807 vst4_u8((uint8_t*)dst, vres);
811 #else // #ifdef SK_CPU_ARM32
// A64 path: ld4/st4 around an indirect call to procSIMD (blr via [proc]).
814 "ld4 {v0.8b - v3.8b}, [%[src]], #32 \t\n"
815 "ld4 {v4.8b - v7.8b}, [%[dst]] \t\n"
817 "st4 {v0.8b - v3.8b}, [%[dst]], #32 \t\n"
818 : [src] "+&r" (src), [dst] "+&r" (dst)
819 : [proc] "r" (procSIMD)
821 /* We don't know what proc is going to clobber so we must
822 * add everything that is not callee-saved.
824 "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9",
825 "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x18",
826 "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17",
827 "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
828 "v27", "v28", "v29", "v30", "v31"
831 #endif // #ifdef SK_CPU_ARM32
// Scalar leftovers, no AA: straight per-pixel proc.
836 for (int i = 0; i < count; i++) {
837 dst[i] = proc(src[i], dst[i]);
// AA path: blend proc result with dst by the per-pixel coverage `a`.
840 for (int i = count - 1; i >= 0; --i) {
843 SkPMColor dstC = dst[i];
844 SkPMColor C = proc(src[i], dstC);
846 C = SkFourByteInterp_neon(C, dstC, a);
// 16-bit (RGB565) blit entry point: loads 8 dst pixels as u16, expands them
// to 32-bit planar form, runs the SIMD proc against 8 loaded src pixels,
// converts back to 565 and stores; scalar/AA fallbacks mirror xfer32.
// NOTE(review): as in xfer32, the #ifdef SK_CPU_ARM64/ARM32 and
// compiler-version directives, the `while (count >= 8)` framing, asm tails,
// the aa==NULL test and several closing braces fall in numbering gaps of
// this chunk; both src-load variants appear back to back below.
854 void SkNEONProcCoeffXfermode::xfer16(uint16_t* SK_RESTRICT dst,
855 const SkPMColor* SK_RESTRICT src, int count,
856 const SkAlpha* SK_RESTRICT aa) const {
857 SkASSERT(dst && src && count >= 0);
859 SkXfermodeProc proc = this->getProc();
860 SkXfermodeProcSIMD procSIMD = reinterpret_cast<SkXfermodeProcSIMD>(fProcSIMD);
861 SkASSERT(procSIMD != NULL);
865 uint16x8_t vdst, vres16;
866 uint8x8x4_t vdst32, vsrc, vres;
868 vdst = vld1q_u16(dst);
// Plain intrinsic src load (one of the variants; see note above).
871 vsrc = vld4_u8((uint8_t*)src);
873 #if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6))
875 "vld4.u8 %h[vsrc], [%[src]]! \t\n"
876 : [vsrc] "=w" (vsrc), [src] "+&r" (src)
// Older-GCC ARM32 variant with pinned d0-d3.
880 register uint8x8_t d0 asm("d0");
881 register uint8x8_t d1 asm("d1");
882 register uint8x8_t d2 asm("d2");
883 register uint8x8_t d3 asm("d3");
886 "vld4.u8 {d0-d3},[%[src]]!;"
887 : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3),
896 #endif // #ifdef SK_CPU_ARM64
// Expand 565 -> planar 8888, blend, repack, store.
898 vdst32 = SkPixel16ToPixel32_neon8(vdst);
899 vres = procSIMD(vsrc, vdst32);
900 vres16 = SkPixel32ToPixel16_neon8(vres);
902 vst1q_u16(dst, vres16);
// Scalar leftovers, no AA.
910 for (int i = 0; i < count; i++) {
911 SkPMColor dstC = SkPixel16ToPixel32(dst[i]);
912 dst[i] = SkPixel32ToPixel16_ToU16(proc(src[i], dstC));
// AA path: interpolate with expanded dst by coverage before repacking.
915 for (int i = count - 1; i >= 0; --i) {
918 SkPMColor dstC = SkPixel16ToPixel32(dst[i]);
919 SkPMColor C = proc(src[i], dstC);
921 C = SkFourByteInterp_neon(C, dstC, a);
923 dst[i] = SkPixel32ToPixel16_ToU16(C);
// Debug-only string dump; delegates entirely to the base class.
929 #ifndef SK_IGNORE_TO_STRING
930 void SkNEONProcCoeffXfermode::toString(SkString* str) const {
931 this->INHERITED::toString(str);
935 ////////////////////////////////////////////////////////////////////////////////
// 8-pixel SIMD proc per SkXfermode::Mode, indexed by mode. NULL entries
// mean "no NEON implementation; caller must fall back". The static assert
// below pins the table length to kLastMode + 1 so new modes can't silently
// desynchronize the indexing.
// NOTE(review): the first few entries (kClear/kSrc/kDst region, lines
// 938-940) and some entries around kScreen/kPlus and kHue/kColor fall in
// numbering gaps of this chunk.
937 SkXfermodeProcSIMD gNEONXfermodeProcs[] = {
941 NULL, // kSrcOver_Mode
942 dstover_modeproc_neon8,
943 srcin_modeproc_neon8,
944 dstin_modeproc_neon8,
945 srcout_modeproc_neon8,
946 dstout_modeproc_neon8,
947 srcatop_modeproc_neon8,
948 dstatop_modeproc_neon8,
951 modulate_modeproc_neon8,
952 screen_modeproc_neon8,
954 overlay_modeproc_neon8,
955 darken_modeproc_neon8,
956 lighten_modeproc_neon8,
957 NULL, // kColorDodge_Mode
958 NULL, // kColorBurn_Mode
959 hardlight_modeproc_neon8,
960 NULL, // kSoftLight_Mode
961 difference_modeproc_neon8,
962 exclusion_modeproc_neon8,
963 multiply_modeproc_neon8,
966 NULL, // kSaturation_Mode
968 NULL, // kLuminosity_Mode
972 SK_ARRAY_COUNT(gNEONXfermodeProcs) == SkXfermode::kLastMode + 1,
// Single-pixel NEON proc per mode (same indexing contract as the SIMD table
// above, also guarded by a kLastMode + 1 static assert). Only the handful
// of modes with scalar NEON implementations are non-NULL.
// NOTE(review): the first entries and the kSrcIn/kDstIn, kXor/kPlus regions
// fall in numbering gaps of this chunk.
976 SkXfermodeProc gNEONXfermodeProcs1[] = {
980 NULL, // kSrcOver_Mode
981 NULL, // kDstOver_Mode
984 NULL, // kSrcOut_Mode
985 NULL, // kDstOut_Mode
986 srcatop_modeproc_neon,
987 dstatop_modeproc_neon,
990 modulate_modeproc_neon,
991 NULL, // kScreen_Mode
993 NULL, // kOverlay_Mode
994 NULL, // kDarken_Mode
995 NULL, // kLighten_Mode
996 NULL, // kColorDodge_Mode
997 NULL, // kColorBurn_Mode
998 NULL, // kHardLight_Mode
999 NULL, // kSoftLight_Mode
1000 NULL, // kDifference_Mode
1001 NULL, // kExclusion_Mode
1002 NULL, // kMultiply_Mode
1005 NULL, // kSaturation_Mode
1006 NULL, // kColor_Mode
1007 NULL, // kLuminosity_Mode
1011 SK_ARRAY_COUNT(gNEONXfermodeProcs1) == SkXfermode::kLastMode + 1,
// Platform factory: builds a NEON-accelerated xfermode when the SIMD table
// has an entry for `mode`.
// NOTE(review): the fallback branch for procSIMD == NULL and the closing
// brace fall in numbering gaps of this chunk.
1015 SkProcCoeffXfermode* SkPlatformXfermodeFactory_impl_neon(const ProcCoeff& rec,
1016 SkXfermode::Mode mode) {
1018 void* procSIMD = reinterpret_cast<void*>(gNEONXfermodeProcs[mode]);
1020 if (procSIMD != NULL) {
1021 return SkNEW_ARGS(SkNEONProcCoeffXfermode, (rec, mode, procSIMD));
1026 SkXfermodeProc SkPlatformXfermodeProcFactory_impl_neon(SkXfermode::Mode mode) {
1027 return gNEONXfermodeProcs1[mode];