1 /* NEON optimized code (C) COPYRIGHT 2009 Motorola
3 * Use of this source code is governed by a BSD-style license that can be
4 * found in the LICENSE file.
8 * Modifications done in-house at Motorola
10 * this is a clone of SkBitmapProcState_matrix.h
11 * and has been tuned to work with the NEON unit.
13 * Still undecided between two approaches: cloning the entire
14 * SkBitmapProcState_matrix.h file (as done here), or
15 * putting just the modified routines in here and
16 * then using a construct like #define DONT_DO_THIS_FUNCTION or
17 * something like that...
19 * This is for the RepeatX_RepeatY part of the world
26 * This has been modified on the knowledge that (at the time)
27 * we had the following macro definitions in the parent file
29 * #define MAKENAME(suffix) RepeatX_RepeatY ## suffix
30 * #define TILEX_PROCF(fx, max) (((fx) & 0xFFFF) * ((max) + 1) >> 16)
31 * #define TILEY_PROCF(fy, max) (((fy) & 0xFFFF) * ((max) + 1) >> 16)
32 * #define TILEX_LOW_BITS(fx, max) ((((fx) & 0xFFFF) * ((max) + 1) >> 12) & 0xF)
33 * #define TILEY_LOW_BITS(fy, max) ((((fy) & 0xFFFF) * ((max) + 1) >> 12) & 0xF)
36 /* SkClampMax(val,max) -- bound to 0..max */
// Names of the six matrix procs defined in this file. MAKENAME is not
// defined here; the header comment quotes it as RepeatX_RepeatY ## suffix.
38 #define SCALE_NOFILTER_NAME MAKENAME(_nofilter_scale)
39 #define SCALE_FILTER_NAME MAKENAME(_filter_scale)
40 #define AFFINE_NOFILTER_NAME MAKENAME(_nofilter_affine)
41 #define AFFINE_FILTER_NAME MAKENAME(_filter_affine)
42 #define PERSP_NOFILTER_NAME MAKENAME(_nofilter_persp)
43 #define PERSP_FILTER_NAME MAKENAME(_filter_persp)
// Names of the helpers that pack one filtered X or Y coordinate.
45 #define PACK_FILTER_X_NAME MAKENAME(_pack_filter_x)
46 #define PACK_FILTER_Y_NAME MAKENAME(_pack_filter_y)
// Preamble hooks. All of them expand to nothing here, so the pack helpers
// take no extra parameter and their callers pass no extra argument.
49 #define PREAMBLE(state)
50 #define PREAMBLE_PARAM_X
51 #define PREAMBLE_PARAM_Y
52 #define PREAMBLE_ARG_X
53 #define PREAMBLE_ARG_Y
// Matrix proc for the no-filter case when the inverse matrix is only
// translate and/or scale (see the assert). The output starts with one 32-bit
// slot holding the tiled Y, followed by the tiled X values stored as 16-bit
// entries. TILEX_PROCF / TILEY_PROCF are not defined in this file; the header
// comment quotes the definitions this code was written against.
56 static void SCALE_NOFILTER_NAME(const SkBitmapProcState& s,
57 uint32_t xy[], int count, int x, int y) {
58 SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
59 SkMatrix::kScale_Mask)) == 0);
62 // we store y, x, x, x, x, x
64 const unsigned maxX = s.fBitmap->width() - 1;
// Map (x + half, y + half) through s.fInvProc to get the source point.
68 s.fInvProc(s.fInvMatrix, SkIntToScalar(x) + SK_ScalarHalf,
69 SkIntToScalar(y) + SK_ScalarHalf, &pt);
// fx is reused: first for the mapped Y, then for the mapped X.
70 fx = SkScalarToFixed(pt.fY);
71 const unsigned maxY = s.fBitmap->height() - 1;
72 *xy++ = TILEY_PROCF(fx, maxY);
73 fx = SkScalarToFixed(pt.fX);
// NOTE(review): the condition guarding this early-out is not visible here;
// presumably it is maxX == 0 -- confirm.
77 // all of the following X values must be 0
78 memset(xy, 0, count * sizeof(uint16_t));
82 const SkFixed dx = s.fInvSx;
84 #ifdef CHECK_FOR_DECAL
85 // test if we don't need to apply the tile proc
// i.e. both the first and the last X sample already lie within 0..maxX
86 if ((unsigned)(fx >> 16) <= maxX &&
87 (unsigned)((fx + dx * (count - 1)) >> 16) <= maxX) {
88 decal_nofilter_scale_neon(xy, fx, dx, count);
94 /* RBE: very much like done in decal_nofilter ,
95 * but some processing of the 'fx' information
96 * TILEX_PROCF(fx, max) (((fx) & 0xFFFF) * ((max) + 1) >> 16)
99 /* SkFixed is 16.16 fixed point */
// Multiples of the per-pixel step; the vector loop advances by dx8.
101 SkFixed dx4 = dx2+dx2;
102 SkFixed dx8 = dx4+dx4;
104 /* now build fx/fx+dx/fx+2dx/fx+3dx */
105 SkFixed fx1, fx2, fx3;
106 int32x4_t lbase, hbase;
107 int16_t *dst16 = (int16_t *)xy;
// Lanes 0..3 of lbase are fx, fx1, fx2, fx3; hbase is the same four values
// advanced by dx4.
113 lbase = vdupq_n_s32(fx);
114 lbase = vsetq_lane_s32(fx1, lbase, 1);
115 lbase = vsetq_lane_s32(fx2, lbase, 2);
116 lbase = vsetq_lane_s32(fx3, lbase, 3);
117 hbase = vaddq_s32(lbase, vdupq_n_s32(dx4));
126 /* TILEX_PROCF(fx, max) (((fx)&0xFFFF)*((max)+1)>> 16) */
127 /* mask to low 16 (would like to use uzp tricks) */
128 lout = vandq_s32(lbase, vdupq_n_s32(0xffff));
129 hout = vandq_s32(hbase, vdupq_n_s32(0xffff));
130 /* bare multiplication, not SkFixedMul */
131 lout = vmulq_s32(lout, vdupq_n_s32(maxX+1));
132 hout = vmulq_s32(hout, vdupq_n_s32(maxX+1));
134 /* extraction, using uzp */
135 /* this is ok -- we want all hi(lout)s then all hi(hout)s */
// After the unzip, hout holds the upper 16 bits of all eight products,
// which are the eight tiled X values stored below.
136 asm ("vuzpq.16 %q0, %q1" : "+w" (lout), "+w" (hout));
137 hi16 = vreinterpretq_s16_s32(hout);
138 vst1q_s16(dst16, hi16);
140 /* bump our base on to the next */
141 lbase = vaddq_s32 (lbase, vdupq_n_s32(dx8));
142 hbase = vaddq_s32 (hbase, vdupq_n_s32(dx8));
146 } while (count >= 8);
147 xy = (uint32_t *) dst16;
// Scalar loop: one 16-bit tiled X per remaining count.
149 uint16_t* xx = (uint16_t*)xy;
150 for (i = count; i > 0; --i) {
151 *xx++ = TILEX_PROCF(fx, maxX); fx += dx;
156 // note: we could special-case on a matrix which is skewed in X but not Y.
157 // this would require a more general setup than SCALE does, but could use
158 // SCALE's inner loop that only looks at dx
// Matrix proc for the no-filter case when the inverse matrix has an affine
// component (see the asserts). Each 32-bit output entry packs the tiled Y in
// its upper 16 bits and the tiled X in its lower 16 bits. The NEON loop
// produces four entries per pass; a scalar loop at the end uses the same
// packing.
161 static void AFFINE_NOFILTER_NAME(const SkBitmapProcState& s,
162 uint32_t xy[], int count, int x, int y) {
163 SkASSERT(s.fInvType & SkMatrix::kAffine_Mask);
164 SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
165 SkMatrix::kScale_Mask |
166 SkMatrix::kAffine_Mask)) == 0);
// Map (x + half, y + half) through s.fInvProc to get the source point.
170 s.fInvProc(s.fInvMatrix,
171 SkIntToScalar(x) + SK_ScalarHalf,
172 SkIntToScalar(y) + SK_ScalarHalf, &srcPt);
174 SkFixed fx = SkScalarToFixed(srcPt.fX);
175 SkFixed fy = SkScalarToFixed(srcPt.fY);
176 SkFixed dx = s.fInvSx;
177 SkFixed dy = s.fInvKy;
178 int maxX = s.fBitmap->width() - 1;
179 int maxY = s.fBitmap->height() - 1;
// Copies of the starting state, read by the diagnostic comparison further
// down.
184 SkFixed bfx = fx, bfy=fy, bdx=dx, bdy=dy;
// Dead code: the if (0) guard means rbe() is never called.
188 if (0) { extern void rbe(void); rbe(); }
190 /* RBE: benchmarks show this eats up time; can we neonize it? */
191 /* RBE: very much like done in decal_nofilter ,
192 * but some processing of the 'fx' information
193 * TILEX_PROCF(fx, max) (((fx) & 0xFFFF) * ((max) + 1) >> 16)
196 /* SkFixed is 16.16 fixed point */
200 /* now build fx/fx+dx/fx+2dx/fx+3dx */
201 int32x4_t xbase, ybase;
202 int16_t *dst16 = (int16_t *)xy;
204 /* synthesize 4x for both X and Y */
205 xbase = vdupq_n_s32(fx);
206 xbase = vsetq_lane_s32(fx+dx, xbase, 1);
207 xbase = vsetq_lane_s32(fx+dx+dx, xbase, 2);
208 xbase = vsetq_lane_s32(fx+dx+dx+dx, xbase, 3);
210 ybase = vdupq_n_s32(fy);
211 ybase = vsetq_lane_s32(fy+dy, ybase, 1);
212 ybase = vsetq_lane_s32(fy+dy+dy, ybase, 2);
213 ybase = vsetq_lane_s32(fy+dy+dy+dy, ybase, 3);
221 /* TILEX_PROCF(fx, max) (((fx)&0xFFFF)*((max)+1)>> 16) */
222 /* mask to low 16 (would like to use uzp tricks) */
223 xout = vandq_s32(xbase, vdupq_n_s32(0xffff));
224 yout = vandq_s32(ybase, vdupq_n_s32(0xffff));
225 /* bare multiplication, not SkFixedMul */
226 xout = vmulq_s32(xout, vdupq_n_s32(maxX+1));
227 yout = vmulq_s32(yout, vdupq_n_s32(maxY+1));
229 /* put hi16 from xout over low16 from yout */
// Each lane becomes (upper 16 bits of yout) : (upper 16 bits of xout),
// which is the same packing the scalar loop below produces.
230 yout = vsriq_n_s32(yout, xout, 16);
232 /* and then yout has the interleaved upper 16's */
233 hi16 = vreinterpretq_s16_s32(yout);
234 vst1q_s16(dst16, hi16);
236 /* bump preserved base & on to the next */
237 xbase = vaddq_s32 (xbase, vdupq_n_s32(dx4));
238 ybase = vaddq_s32 (ybase, vdupq_n_s32(dy4));
239 dst16 += 8; /* 8 x16 aka 4x32 */
243 } while (count >= 4);
244 xy = (uint32_t *) dst16;
// Recomputes every entry with the scalar formula from the saved starting
// state and reports through SkDebugf.
// NOTE(review): the guard around this diagnostic block, if any, is not
// visible here -- confirm it is compiled out of release builds.
248 /* diagnostics... see whether we agree with the NEON code */
250 uint32_t *myxy = oxy;
252 SkFixed ofx = bfx, ofy= bfy, odx= bdx, ody= bdy;
253 for (myi = ocount; myi > 0; --myi) {
254 uint32_t val = (TILEY_PROCF(ofy, maxY) << 16) | TILEX_PROCF(ofx, maxX);
255 if (val != *myxy++) {
259 ofx += odx; ofy += ody;
262 SkDebugf("repeat-nofilter-affine fails\n");
263 SkDebugf("count %d myi %d\n", ocount, myi);
264 SkDebugf(" bfx %08x, bdx %08x, bfy %08x bdy %08x\n",
266 SkDebugf("maxX %08x maxY %08x\n", maxX, maxY);
// Scalar loop: tiled Y in the upper 16 bits, tiled X in the lower 16 bits.
270 for (int i = count; i > 0; --i) {
271 /* fx, fy, dx, dy are all 32 bit 16.16 fixed point */
272 /* (((fx) & 0xFFFF) * ((max) + 1) >> 16) */
273 *xy++ = (TILEY_PROCF(fy, maxY) << 16) | TILEX_PROCF(fx, maxX);
// Matrix proc for the no-filter case when the inverse matrix has a
// perspective component (see the assert). SkPerspIter hands back batches of
// interleaved X,Y fixed-point values; each pair becomes one 32-bit output
// entry with the tiled Y in the upper 16 bits and the tiled X in the lower
// 16 bits. The NEON loop consumes eight pairs per pass and a scalar loop
// handles the rest of the batch.
278 static void PERSP_NOFILTER_NAME(const SkBitmapProcState& s,
279 uint32_t* SK_RESTRICT xy,
280 int count, int x, int y) {
281 SkASSERT(s.fInvType & SkMatrix::kPerspective_Mask);
284 int maxX = s.fBitmap->width() - 1;
285 int maxY = s.fBitmap->height() - 1;
287 SkPerspIter iter(s.fInvMatrix,
288 SkIntToScalar(x) + SK_ScalarHalf,
289 SkIntToScalar(y) + SK_ScalarHalf, count);
// count is overwritten with the value returned by iter.next(); a return of
// zero ends the loop.
291 while ((count = iter.next()) != 0) {
292 const SkFixed* SK_RESTRICT srcXY = iter.getXY();
295 /* TILEX_PROCF(fx, max) (((fx) & 0xFFFF) * ((max) + 1) >> 16) */
296 /* it's a little more complicated than what I did for the
297 * clamp case -- where I could immediately snip to the top
298 * 16 bits and do my min/max games there.
299 * ... might only be able to get 4x unrolling here
302 /* vld2 to get a set of 32x4's ... */
303 /* do the tile[xy]_procf operations */
304 /* which includes doing vuzp to get hi16's */
306 /* -- inner loop (other than vld2) can be had from above */
308 /* srcXY is a batch of 32 bit numbers X0,Y0,X1,Y1...
309 * but we immediately discard the low 16 bits...
310 * so what we're going to do is vld4, which will give us
311 * xlo,xhi,ylo,yhi distribution and we can ignore the 'lo'
314 if (0) { extern void rbe(void); rbe(); }
316 int32_t *mysrc = (int32_t *) srcXY;
317 int16_t *mydst = (int16_t *) xy;
319 int32x4_t x, y, x2, y2;
322 /* read array of x,y,x,y,x,y */
323 /* vld2 does the de-interleaving for us */
324 /* isolate reg-bound scopes; gcc will minimize register
325 * motion if possible; this ensures that we don't lose
326 * a register across a debugging call because it happens
327 * to be bound into a call-clobbered register
330 register int32x4_t q0 asm("q0");
331 register int32x4_t q1 asm("q1");
332 asm ("vld2.32 {q0-q1},[%2] /* x=%q0 y=%q1 */"
333 : "=w" (q0), "=w" (q1)
339 /* offset == 256 bits == 32 bytes == 8 longs */
341 register int32x4_t q2 asm("q2");
342 register int32x4_t q3 asm("q3");
343 asm ("vld2.32 {q2-q3},[%2] /* x=%q0 y=%q1 */"
344 : "=w" (q2), "=w" (q3)
350 /* TILEX_PROCF(fx, max) (((fx)&0xFFFF)*((max)+1)>> 16) */
351 /* mask to low 16 (would like to use uzp tricks) */
352 /* bare multiplication, not SkFixedMul */
// The same mask-and-multiply is applied to both groups of four:
// (x, y) and (x2, y2).
353 x = vandq_s32(x, vdupq_n_s32(0xffff));
354 x = vmulq_s32(x, vdupq_n_s32(maxX+1));
355 y = vandq_s32(y, vdupq_n_s32(0xffff));
356 y = vmulq_s32(y, vdupq_n_s32(maxY+1));
358 x2 = vandq_s32(x2, vdupq_n_s32(0xffff));
359 x2 = vmulq_s32(x2, vdupq_n_s32(maxX+1));
360 y2 = vandq_s32(y2, vdupq_n_s32(0xffff));
361 y2 = vmulq_s32(y2, vdupq_n_s32(maxY+1));
363 /* now collect interleaved high 16's */
364 /* (hi-x, hi-y)4 (hi-x2; hi-y2)4 */
366 /* extraction, using vsri, leaves the interleaved hi16's in y */
367 y = vsriq_n_s32(y, x, 16);
368 hi = vreinterpretq_s16_s32(y);
369 vst1q_s16(mydst, hi);
371 /* and likewise for the second 8 entries */
372 y2 = vsriq_n_s32(y2, x2, 16);
373 hi2 = vreinterpretq_s16_s32(y2);
374 vst1q_s16(mydst+8, hi2);
376 /* XXX: gcc isn't interleaving these with the NEON ops
377 * but i think that all the scoreboarding works out */
378 count -= 8; /* 8 iterations */
379 mysrc += 16; /* 16 longs */
380 mydst += 16; /* 16 shorts, aka 8 longs */
381 } while (count >= 8);
382 /* get xy and srcXY fixed up */
383 srcXY = (const SkFixed *) mysrc;
384 xy = (uint32_t *) mydst;
// Scalar loop for the rest of the batch: srcXY[1] is the Y value and
// srcXY[0] the X value of the current pair.
386 while (--count >= 0) {
387 *xy++ = (TILEY_PROCF(srcXY[1], maxY) << 16) |
388 TILEX_PROCF(srcXY[0], maxX);
394 //////////////////////////////////////////////////////////////////////////////
// Packs one filtered Y coordinate into 32 bits. From the shifts below the
// layout is: tiled index of f in bits 18 and up, the TILEY_LOW_BITS value
// (given 4 bits by the shift) in bits 14..17, and the tiled index of
// (f + one) in bits 0..13.
// NOTE(review): this assumes both tiled indices fit in 14 bits -- confirm.
396 static inline uint32_t PACK_FILTER_Y_NAME(SkFixed f, unsigned max,
397 SkFixed one PREAMBLE_PARAM_Y) {
398 unsigned i = TILEY_PROCF(f, max);
399 i = (i << 4) | TILEY_LOW_BITS(f, max);
400 return (i << 14) | (TILEY_PROCF((f + one), max));
// Packs one filtered X coordinate into 32 bits. From the shifts below the
// layout is: tiled index of f in bits 18 and up, the TILEX_LOW_BITS value
// (given 4 bits by the shift) in bits 14..17, and the tiled index of
// (f + one) in bits 0..13.
// NOTE(review): this assumes both tiled indices fit in 14 bits -- confirm.
403 static inline uint32_t PACK_FILTER_X_NAME(SkFixed f, unsigned max,
404 SkFixed one PREAMBLE_PARAM_X) {
405 unsigned i = TILEX_PROCF(f, max);
406 i = (i << 4) | TILEX_LOW_BITS(f, max);
407 return (i << 14) | (TILEX_PROCF((f + one), max));
// Matrix proc for the filtered case when the inverse matrix is only
// translate and/or scale and fInvKy is zero (see the asserts). Writes one
// packed Y entry first, then one packed X entry per pass of the loop.
410 static void SCALE_FILTER_NAME(const SkBitmapProcState& s,
411 uint32_t xy[], int count, int x, int y) {
412 SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
413 SkMatrix::kScale_Mask)) == 0);
414 SkASSERT(s.fInvKy == 0);
418 const unsigned maxX = s.fBitmap->width() - 1;
419 const SkFixed one = s.fFilterOneX;
420 const SkFractionalInt dx = s.fInvSxFractionalInt;
// Map (x + half, y + half) through s.fInvProc to get the source point.
425 s.fInvProc(s.fInvMatrix, SkIntToScalar(x) + SK_ScalarHalf,
426 SkIntToScalar(y) + SK_ScalarHalf, &pt);
// Y is shifted back by half of fFilterOneY before it is packed.
427 const SkFixed fy = SkScalarToFixed(pt.fY) - (s.fFilterOneY >> 1);
428 const unsigned maxY = s.fBitmap->height() - 1;
429 // compute our two Y values up front
430 *xy++ = PACK_FILTER_Y_NAME(fy, maxY, s.fFilterOneY PREAMBLE_ARG_Y);
// X is kept as SkFractionalInt (shifted back by half of one) and only
// converted to SkFixed at the point where it is packed.
432 fx = SkScalarToFractionalInt(pt.fX) - (SkFixedToFractionalInt(one) >> 1);
435 #ifdef CHECK_FOR_DECAL
436 // test if we don't need to apply the tile proc
437 if (can_truncate_to_fixed_for_decal(fx, dx, count, maxX)) {
438 decal_filter_scale_neon(xy, SkFractionalIntToFixed(fx),
439 SkFractionalIntToFixed(dx), count);
444 SkFixed fixedFx = SkFractionalIntToFixed(fx);
445 *xy++ = PACK_FILTER_X_NAME(fixedFx, maxX, one PREAMBLE_ARG_X);
// NOTE(review): the --count != 0 test only ends the loop promptly when
// count is greater than zero on entry -- confirm callers guarantee that.
447 } while (--count != 0);
// Matrix proc for the filtered case when the inverse matrix has an affine
// component (see the asserts). Each pass of the loop writes two entries:
// the packed Y followed by the packed X.
451 static void AFFINE_FILTER_NAME(const SkBitmapProcState& s,
452 uint32_t xy[], int count, int x, int y) {
453 SkASSERT(s.fInvType & SkMatrix::kAffine_Mask);
454 SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
455 SkMatrix::kScale_Mask |
456 SkMatrix::kAffine_Mask)) == 0);
// Map (x + half, y + half) through s.fInvProc to get the source point.
460 s.fInvProc(s.fInvMatrix,
461 SkIntToScalar(x) + SK_ScalarHalf,
462 SkIntToScalar(y) + SK_ScalarHalf, &srcPt);
464 SkFixed oneX = s.fFilterOneX;
465 SkFixed oneY = s.fFilterOneY;
// Both coordinates are shifted back by half of their filter-one value.
466 SkFixed fx = SkScalarToFixed(srcPt.fX) - (oneX >> 1);
467 SkFixed fy = SkScalarToFixed(srcPt.fY) - (oneY >> 1);
468 SkFixed dx = s.fInvSx;
469 SkFixed dy = s.fInvKy;
470 unsigned maxX = s.fBitmap->width() - 1;
471 unsigned maxY = s.fBitmap->height() - 1;
474 *xy++ = PACK_FILTER_Y_NAME(fy, maxY, oneY PREAMBLE_ARG_Y);
476 *xy++ = PACK_FILTER_X_NAME(fx, maxX, oneX PREAMBLE_ARG_X);
// NOTE(review): the --count != 0 test only ends the loop promptly when
// count is greater than zero on entry -- confirm callers guarantee that.
478 } while (--count != 0);
// Matrix proc for the filtered case when the inverse matrix has a
// perspective component (see the assert). For every X,Y pair delivered by
// SkPerspIter it writes two entries: the packed Y followed by the packed X.
481 static void PERSP_FILTER_NAME(const SkBitmapProcState& s,
482 uint32_t* SK_RESTRICT xy, int count,
484 SkASSERT(s.fInvType & SkMatrix::kPerspective_Mask);
// Declaration only; no call to rbe() is visible in this routine.
486 extern void rbe(void);
489 unsigned maxX = s.fBitmap->width() - 1;
490 unsigned maxY = s.fBitmap->height() - 1;
491 SkFixed oneX = s.fFilterOneX;
492 SkFixed oneY = s.fFilterOneY;
496 SkPerspIter iter(s.fInvMatrix,
497 SkIntToScalar(x) + SK_ScalarHalf,
498 SkIntToScalar(y) + SK_ScalarHalf, count);
// count is overwritten with the value returned by iter.next(); a return of
// zero ends the outer loop.
500 while ((count = iter.next()) != 0) {
501 const SkFixed* SK_RESTRICT srcXY = iter.getXY();
// srcXY[1] is used as Y and srcXY[0] as X; each is shifted back by half of
// its filter-one value before packing.
503 *xy++ = PACK_FILTER_Y_NAME(srcXY[1] - (oneY >> 1), maxY,
504 oneY PREAMBLE_ARG_Y);
505 *xy++ = PACK_FILTER_X_NAME(srcXY[0] - (oneX >> 1), maxX,
506 oneX PREAMBLE_ARG_X);
508 } while (--count != 0);
// Table of matrix procs, named through MAKENAME(_Procs).
// NOTE(review): only one entry is visible here; the full ordering that
// callers index by must be confirmed against the complete table.
512 const SkBitmapProcState::MatrixProc MAKENAME(_Procs)[] = {
515 AFFINE_NOFILTER_NAME,
// Undefine the macros this file defined or consumed.
// NOTE(review): presumably so the file can be included again with a
// different MAKENAME / tile-proc set -- confirm against the including file.
524 #ifdef CHECK_FOR_DECAL
525 #undef CHECK_FOR_DECAL
// Proc-name macros defined near the top of this file.
528 #undef SCALE_NOFILTER_NAME
529 #undef SCALE_FILTER_NAME
530 #undef AFFINE_NOFILTER_NAME
531 #undef AFFINE_FILTER_NAME
532 #undef PERSP_NOFILTER_NAME
533 #undef PERSP_FILTER_NAME
// Preamble hooks defined near the top of this file.
536 #undef PREAMBLE_PARAM_X
537 #undef PREAMBLE_PARAM_Y
538 #undef PREAMBLE_ARG_X
539 #undef PREAMBLE_ARG_Y
// Tile-proc helpers that are not defined in this file.
541 #undef TILEX_LOW_BITS
542 #undef TILEY_LOW_BITS