From: Siarhei Siamashka Date: Tue, 26 Jun 2012 02:36:52 +0000 (-0400) Subject: Add scaled nearest repeat fast paths X-Git-Tag: pixman-0.27.4~28 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=aff796d6cee4cb81f0352c2f7d0c994229bd5ca1;p=platform%2Fupstream%2Fpixman.git Add scaled nearest repeat fast paths Before this patch it was often faster to scale and repeat in two passes because each pass used a fast path vs. the slow path that the single pass approach takes. This makes it so that the single pass approach has competitive performance. --- diff --git a/pixman/pixman-arm-common.h b/pixman/pixman-arm-common.h index f56264e..3a7cb2b 100644 --- a/pixman/pixman-arm-common.h +++ b/pixman/pixman-arm-common.h @@ -236,7 +236,8 @@ pixman_scaled_nearest_scanline_##name##_##op##_asm_##cputype ( \ dst_type * dst, \ const src_type * src, \ pixman_fixed_t vx, \ - pixman_fixed_t unit_x); \ + pixman_fixed_t unit_x, \ + pixman_fixed_t max_vx); \ \ static force_inline void \ scaled_nearest_scanline_##cputype##_##name##_##op (dst_type * pd, \ @@ -248,7 +249,8 @@ scaled_nearest_scanline_##cputype##_##name##_##op (dst_type * pd, \ pixman_bool_t zero_src) \ { \ pixman_scaled_nearest_scanline_##name##_##op##_asm_##cputype (w, pd, ps, \ - vx, unit_x);\ + vx, unit_x, \ + max_vx); \ } \ \ FAST_NEAREST_MAINLOOP (cputype##_##name##_cover_##op, \ @@ -259,13 +261,17 @@ FAST_NEAREST_MAINLOOP (cputype##_##name##_none_##op, \ src_type, dst_type, NONE) \ FAST_NEAREST_MAINLOOP (cputype##_##name##_pad_##op, \ scaled_nearest_scanline_##cputype##_##name##_##op, \ - src_type, dst_type, PAD) + src_type, dst_type, PAD) \ +FAST_NEAREST_MAINLOOP (cputype##_##name##_normal_##op, \ + scaled_nearest_scanline_##cputype##_##name##_##op, \ + src_type, dst_type, NORMAL) /* Provide entries for the fast path table */ #define PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH(op,s,d,func) \ SIMPLE_NEAREST_FAST_PATH_COVER (op,s,d,func), \ SIMPLE_NEAREST_FAST_PATH_NONE (op,s,d,func), \ - SIMPLE_NEAREST_FAST_PATH_PAD 
(op,s,d,func) + SIMPLE_NEAREST_FAST_PATH_PAD (op,s,d,func), \ + SIMPLE_NEAREST_FAST_PATH_NORMAL (op,s,d,func) #define PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_A8_DST(flags, cputype, name, op, \ src_type, dst_type) \ @@ -276,6 +282,7 @@ pixman_scaled_nearest_scanline_##name##_##op##_asm_##cputype ( \ const src_type * src, \ pixman_fixed_t vx, \ pixman_fixed_t unit_x, \ + pixman_fixed_t max_vx, \ const uint8_t * mask); \ \ static force_inline void \ @@ -292,6 +299,7 @@ scaled_nearest_scanline_##cputype##_##name##_##op (const uint8_t * mask, \ return; \ pixman_scaled_nearest_scanline_##name##_##op##_asm_##cputype (w, pd, ps, \ vx, unit_x, \ + max_vx, \ mask); \ } \ \ @@ -303,13 +311,17 @@ FAST_NEAREST_MAINLOOP_COMMON (cputype##_##name##_none_##op, \ src_type, uint8_t, dst_type, NONE, TRUE, FALSE) \ FAST_NEAREST_MAINLOOP_COMMON (cputype##_##name##_pad_##op, \ scaled_nearest_scanline_##cputype##_##name##_##op,\ - src_type, uint8_t, dst_type, PAD, TRUE, FALSE) + src_type, uint8_t, dst_type, PAD, TRUE, FALSE) \ +FAST_NEAREST_MAINLOOP_COMMON (cputype##_##name##_normal_##op, \ + scaled_nearest_scanline_##cputype##_##name##_##op,\ + src_type, uint8_t, dst_type, NORMAL, TRUE, FALSE) /* Provide entries for the fast path table */ #define PIXMAN_ARM_SIMPLE_NEAREST_A8_MASK_FAST_PATH(op,s,d,func) \ SIMPLE_NEAREST_A8_MASK_FAST_PATH_COVER (op,s,d,func), \ SIMPLE_NEAREST_A8_MASK_FAST_PATH_NONE (op,s,d,func), \ - SIMPLE_NEAREST_A8_MASK_FAST_PATH_PAD (op,s,d,func) + SIMPLE_NEAREST_A8_MASK_FAST_PATH_PAD (op,s,d,func), \ + SIMPLE_NEAREST_A8_MASK_FAST_PATH_NORMAL (op,s,d,func) /*****************************************************************************/ diff --git a/pixman/pixman-arm-neon-asm.h b/pixman/pixman-arm-neon-asm.h index 97adc6a..1673b08 100644 --- a/pixman/pixman-arm-neon-asm.h +++ b/pixman/pixman-arm-neon-asm.h @@ -212,27 +212,39 @@ .macro pixld1_s elem_size, reg1, mem_operand .if elem_size == 16 mov TMP1, VX, asr #16 - add VX, VX, UNIT_X + adds VX, VX, UNIT_X +5: subpls VX, VX, 
SRC_WIDTH_FIXED + bpl 5b add TMP1, mem_operand, TMP1, asl #1 mov TMP2, VX, asr #16 - add VX, VX, UNIT_X + adds VX, VX, UNIT_X +5: subpls VX, VX, SRC_WIDTH_FIXED + bpl 5b add TMP2, mem_operand, TMP2, asl #1 vld1.16 {d&reg1&[0]}, [TMP1, :16] mov TMP1, VX, asr #16 - add VX, VX, UNIT_X + adds VX, VX, UNIT_X +5: subpls VX, VX, SRC_WIDTH_FIXED + bpl 5b add TMP1, mem_operand, TMP1, asl #1 vld1.16 {d&reg1&[1]}, [TMP2, :16] mov TMP2, VX, asr #16 - add VX, VX, UNIT_X + adds VX, VX, UNIT_X +5: subpls VX, VX, SRC_WIDTH_FIXED + bpl 5b add TMP2, mem_operand, TMP2, asl #1 vld1.16 {d&reg1&[2]}, [TMP1, :16] vld1.16 {d&reg1&[3]}, [TMP2, :16] .elseif elem_size == 32 mov TMP1, VX, asr #16 - add VX, VX, UNIT_X + adds VX, VX, UNIT_X +5: subpls VX, VX, SRC_WIDTH_FIXED + bpl 5b add TMP1, mem_operand, TMP1, asl #2 mov TMP2, VX, asr #16 - add VX, VX, UNIT_X + adds VX, VX, UNIT_X +5: subpls VX, VX, SRC_WIDTH_FIXED + bpl 5b add TMP2, mem_operand, TMP2, asl #2 vld1.32 {d&reg1&[0]}, [TMP1, :32] vld1.32 {d&reg1&[1]}, [TMP2, :32] @@ -242,7 +254,7 @@ .endm .macro pixld2_s elem_size, reg1, reg2, mem_operand -.if elem_size == 32 +.if 0 /* elem_size == 32 */ mov TMP1, VX, asr #16 add VX, VX, UNIT_X, asl #1 add TMP1, mem_operand, TMP1, asl #2 @@ -268,12 +280,16 @@ .macro pixld0_s elem_size, reg1, idx, mem_operand .if elem_size == 16 mov TMP1, VX, asr #16 - add VX, VX, UNIT_X + adds VX, VX, UNIT_X +5: subpls VX, VX, SRC_WIDTH_FIXED + bpl 5b add TMP1, mem_operand, TMP1, asl #1 vld1.16 {d&reg1&[idx]}, [TMP1, :16] .elseif elem_size == 32 mov TMP1, VX, asr #16 - add VX, VX, UNIT_X + adds VX, VX, UNIT_X +5: subpls VX, VX, SRC_WIDTH_FIXED + bpl 5b add TMP1, mem_operand, TMP1, asl #2 vld1.32 {d&reg1&[idx]}, [TMP1, :32] .endif @@ -964,15 +980,17 @@ fname: TMP1 .req r4 TMP2 .req r5 DST_R .req r6 + SRC_WIDTH_FIXED .req r7 .macro pixld_src x:vararg pixld_s x .endm ldr UNIT_X, [sp] - push {r4-r6, lr} + push {r4-r8, lr} + ldr SRC_WIDTH_FIXED, [sp, #(24 + 4)] .if mask_bpp != 0 - ldr MASK, [sp, #(16 + 4)] + ldr MASK, [sp, #(24 + 8)] .endif
.else /* @@ -1044,7 +1062,7 @@ fname: cleanup .if use_nearest_scaling != 0 - pop {r4-r6, pc} /* exit */ + pop {r4-r8, pc} /* exit */ .else bx lr /* exit */ .endif @@ -1058,7 +1076,7 @@ fname: cleanup .if use_nearest_scaling != 0 - pop {r4-r6, pc} /* exit */ + pop {r4-r8, pc} /* exit */ .unreq DST_R .unreq SRC @@ -1069,6 +1087,7 @@ fname: .unreq TMP2 .unreq DST_W .unreq MASK + .unreq SRC_WIDTH_FIXED .else bx lr /* exit */ diff --git a/pixman/pixman-arm-simd-asm.S b/pixman/pixman-arm-simd-asm.S index 8fe1b50..b438001 100644 --- a/pixman/pixman-arm-simd-asm.S +++ b/pixman/pixman-arm-simd-asm.S @@ -355,49 +355,57 @@ pixman_asm_function pixman_composite_over_n_8_8888_asm_armv6 prefetch_braking_distance pixman_asm_function fname - W .req r0 - DST .req r1 - SRC .req r2 - VX .req r3 - UNIT_X .req ip - TMP1 .req r4 - TMP2 .req r5 - VXMASK .req r6 - PF_OFFS .req r7 + W .req r0 + DST .req r1 + SRC .req r2 + VX .req r3 + UNIT_X .req ip + TMP1 .req r4 + TMP2 .req r5 + VXMASK .req r6 + PF_OFFS .req r7 + SRC_WIDTH_FIXED .req r8 ldr UNIT_X, [sp] - push {r4, r5, r6, r7} + push {r4, r5, r6, r7, r8, r10} mvn VXMASK, #((1 << bpp_shift) - 1) + ldr SRC_WIDTH_FIXED, [sp, #28] /* define helper macro */ .macro scale_2_pixels ldr&t TMP1, [SRC, TMP1] - and TMP2, VXMASK, VX, lsr #(16 - bpp_shift) - add VX, VX, UNIT_X + and TMP2, VXMASK, VX, asr #(16 - bpp_shift) + adds VX, VX, UNIT_X str&t TMP1, [DST], #(1 << bpp_shift) +9: subpls VX, VX, SRC_WIDTH_FIXED + bpl 9b ldr&t TMP2, [SRC, TMP2] - and TMP1, VXMASK, VX, lsr #(16 - bpp_shift) - add VX, VX, UNIT_X + and TMP1, VXMASK, VX, asr #(16 - bpp_shift) + adds VX, VX, UNIT_X str&t TMP2, [DST], #(1 << bpp_shift) +9: subpls VX, VX, SRC_WIDTH_FIXED + bpl 9b .endm /* now do the scaling */ - and TMP1, VXMASK, VX, lsr #(16 - bpp_shift) - add VX, VX, UNIT_X + and TMP1, VXMASK, VX, asr #(16 - bpp_shift) + adds VX, VX, UNIT_X +9: subpls VX, VX, SRC_WIDTH_FIXED + bpl 9b subs W, W, #(8 + prefetch_braking_distance) blt 2f /* calculate prefetch offset */ mov 
PF_OFFS, #prefetch_distance mla PF_OFFS, UNIT_X, PF_OFFS, VX 1: /* main loop, process 8 pixels per iteration with prefetch */ - subs W, W, #8 + pld [SRC, PF_OFFS, asr #(16 - bpp_shift)] add PF_OFFS, UNIT_X, lsl #3 scale_2_pixels scale_2_pixels scale_2_pixels scale_2_pixels - pld [SRC, PF_OFFS, lsr #(16 - bpp_shift)] + subs W, W, #8 bge 1b 2: subs W, W, #(4 - 8 - prefetch_braking_distance) @@ -426,8 +434,9 @@ pixman_asm_function fname .unreq TMP2 .unreq VXMASK .unreq PF_OFFS + .unreq SRC_WIDTH_FIXED /* return */ - pop {r4, r5, r6, r7} + pop {r4, r5, r6, r7, r8, r10} bx lr .endfunc .endm diff --git a/pixman/pixman-fast-path.c b/pixman/pixman-fast-path.c index 86ed821..22bfd30 100644 --- a/pixman/pixman-fast-path.c +++ b/pixman/pixman-fast-path.c @@ -1415,13 +1415,13 @@ scaled_nearest_scanline_565_565_SRC (uint16_t * dst, uint16_t tmp1, tmp2, tmp3, tmp4; while ((w -= 4) >= 0) { - tmp1 = src[pixman_fixed_to_int (vx)]; + tmp1 = *(src + pixman_fixed_to_int (vx)); vx += unit_x; - tmp2 = src[pixman_fixed_to_int (vx)]; + tmp2 = *(src + pixman_fixed_to_int (vx)); vx += unit_x; - tmp3 = src[pixman_fixed_to_int (vx)]; + tmp3 = *(src + pixman_fixed_to_int (vx)); vx += unit_x; - tmp4 = src[pixman_fixed_to_int (vx)]; + tmp4 = *(src + pixman_fixed_to_int (vx)); vx += unit_x; *dst++ = tmp1; *dst++ = tmp2; @@ -1430,15 +1430,15 @@ scaled_nearest_scanline_565_565_SRC (uint16_t * dst, } if (w & 2) { - tmp1 = src[pixman_fixed_to_int (vx)]; + tmp1 = *(src + pixman_fixed_to_int (vx)); vx += unit_x; - tmp2 = src[pixman_fixed_to_int (vx)]; + tmp2 = *(src + pixman_fixed_to_int (vx)); vx += unit_x; *dst++ = tmp1; *dst++ = tmp2; } if (w & 1) - *dst++ = src[pixman_fixed_to_int (vx)]; + *dst = *(src + pixman_fixed_to_int (vx)); } FAST_NEAREST_MAINLOOP (565_565_cover_SRC, diff --git a/pixman/pixman-inlines.h b/pixman/pixman-inlines.h index 3a3c658..7f2e404 100644 --- a/pixman/pixman-inlines.h +++ b/pixman/pixman-inlines.h @@ -271,7 +271,7 @@ scanline_func_name (dst_type_t *dst, \ int32_t w, \ 
pixman_fixed_t vx, \ pixman_fixed_t unit_x, \ - pixman_fixed_t max_vx, \ + pixman_fixed_t src_width_fixed, \ pixman_bool_t fully_transparent_src) \ { \ uint32_t d; \ @@ -287,25 +287,25 @@ scanline_func_name (dst_type_t *dst, \ \ while ((w -= 2) >= 0) \ { \ - x1 = vx >> 16; \ + x1 = pixman_fixed_to_int (vx); \ vx += unit_x; \ if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL) \ { \ /* This works because we know that unit_x is positive */ \ - while (vx >= max_vx) \ - vx -= max_vx; \ + while (vx >= 0) \ + vx -= src_width_fixed; \ } \ - s1 = src[x1]; \ + s1 = *(src + x1); \ \ - x2 = vx >> 16; \ + x2 = pixman_fixed_to_int (vx); \ vx += unit_x; \ if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL) \ { \ /* This works because we know that unit_x is positive */ \ - while (vx >= max_vx) \ - vx -= max_vx; \ + while (vx >= 0) \ + vx -= src_width_fixed; \ } \ - s2 = src[x2]; \ + s2 = *(src + x2); \ \ if (PIXMAN_OP_ ## OP == PIXMAN_OP_OVER) \ { \ @@ -349,8 +349,8 @@ scanline_func_name (dst_type_t *dst, \ \ if (w & 1) \ { \ - x1 = vx >> 16; \ - s1 = src[x1]; \ + x1 = pixman_fixed_to_int (vx); \ + s1 = *(src + x1); \ \ if (PIXMAN_OP_ ## OP == PIXMAN_OP_OVER) \ { \ @@ -388,7 +388,7 @@ fast_composite_scaled_nearest ## scale_func_name (pixman_implementation_t *imp, mask_type_t *mask_line; \ src_type_t *src_first_line; \ int y; \ - pixman_fixed_t max_vx = INT32_MAX; /* suppress uninitialized variable warning */ \ + pixman_fixed_t src_width_fixed = pixman_int_to_fixed (src_image->bits.width); \ pixman_fixed_t max_vy; \ pixman_vector_t v; \ pixman_fixed_t vx, vy; \ @@ -434,11 +434,10 @@ fast_composite_scaled_nearest ## scale_func_name (pixman_implementation_t *imp, \ if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL) \ { \ - /* Clamp repeating positions inside the actual samples */ \ - max_vx = src_image->bits.width << 16; \ - max_vy = src_image->bits.height << 16; \ + max_vy = pixman_int_to_fixed (src_image->bits.height); \ \ - repeat (PIXMAN_REPEAT_NORMAL, &vx, 
max_vx); \ + /* Clamp repeating positions inside the actual samples */ \ + repeat (PIXMAN_REPEAT_NORMAL, &vx, src_width_fixed); \ repeat (PIXMAN_REPEAT_NORMAL, &vy, max_vy); \ } \ \ @@ -460,7 +459,7 @@ fast_composite_scaled_nearest ## scale_func_name (pixman_implementation_t *imp, mask_line += mask_stride; \ } \ \ - y = vy >> 16; \ + y = pixman_fixed_to_int (vy); \ vy += unit_y; \ if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL) \ repeat (PIXMAN_REPEAT_NORMAL, &vy, max_vy); \ @@ -470,18 +469,21 @@ fast_composite_scaled_nearest ## scale_func_name (pixman_implementation_t *imp, src = src_first_line + src_stride * y; \ if (left_pad > 0) \ { \ - scanline_func (mask, dst, src, left_pad, 0, 0, 0, FALSE); \ + scanline_func (mask, dst, \ + src + src_image->bits.width - src_image->bits.width + 1, \ + left_pad, -pixman_fixed_e, 0, src_width_fixed, FALSE); \ } \ if (width > 0) \ { \ scanline_func (mask + (mask_is_solid ? 0 : left_pad), \ - dst + left_pad, src, width, vx, unit_x, 0, FALSE); \ + dst + left_pad, src + src_image->bits.width, width, \ + vx - src_width_fixed, unit_x, src_width_fixed, FALSE); \ } \ if (right_pad > 0) \ { \ scanline_func (mask + (mask_is_solid ? 
0 : left_pad + width), \ - dst + left_pad + width, src + src_image->bits.width - 1, \ - right_pad, 0, 0, 0, FALSE); \ + dst + left_pad + width, src + src_image->bits.width, \ + right_pad, -pixman_fixed_e, 0, src_width_fixed, FALSE); \ } \ } \ else if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NONE) \ @@ -489,29 +491,34 @@ fast_composite_scaled_nearest ## scale_func_name (pixman_implementation_t *imp, static const src_type_t zero[1] = { 0 }; \ if (y < 0 || y >= src_image->bits.height) \ { \ - scanline_func (mask, dst, zero, left_pad + width + right_pad, 0, 0, 0, TRUE); \ + scanline_func (mask, dst, zero + 1, left_pad + width + right_pad, \ + -pixman_fixed_e, 0, src_width_fixed, TRUE); \ continue; \ } \ src = src_first_line + src_stride * y; \ if (left_pad > 0) \ { \ - scanline_func (mask, dst, zero, left_pad, 0, 0, 0, TRUE); \ + scanline_func (mask, dst, zero + 1, left_pad, \ + -pixman_fixed_e, 0, src_width_fixed, TRUE); \ } \ if (width > 0) \ { \ scanline_func (mask + (mask_is_solid ? 0 : left_pad), \ - dst + left_pad, src, width, vx, unit_x, 0, FALSE); \ + dst + left_pad, src + src_image->bits.width, width, \ + vx - src_width_fixed, unit_x, src_width_fixed, FALSE); \ } \ if (right_pad > 0) \ { \ scanline_func (mask + (mask_is_solid ? 
0 : left_pad + width), \ - dst + left_pad + width, zero, right_pad, 0, 0, 0, TRUE); \ + dst + left_pad + width, zero + 1, right_pad, \ + -pixman_fixed_e, 0, src_width_fixed, TRUE); \ } \ } \ else \ { \ src = src_first_line + src_stride * y; \ - scanline_func (mask, dst, src, width, vx, unit_x, max_vx, FALSE); \ + scanline_func (mask, dst, src + src_image->bits.width, width, vx - src_width_fixed, \ + unit_x, src_width_fixed, FALSE); \ } \ } \ } diff --git a/pixman/pixman-sse2.c b/pixman/pixman-sse2.c index cf21ef8..efed310 100644 --- a/pixman/pixman-sse2.c +++ b/pixman/pixman-sse2.c @@ -5159,7 +5159,7 @@ scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t* pd, int32_t w, pixman_fixed_t vx, pixman_fixed_t unit_x, - pixman_fixed_t max_vx, + pixman_fixed_t src_width_fixed, pixman_bool_t fully_transparent_src) { uint32_t s, d; @@ -5176,8 +5176,10 @@ scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t* pd, while (w && ((unsigned long)pd & 15)) { d = *pd; - s = combine1 (ps + (vx >> 16), pm); + s = combine1 (ps + pixman_fixed_to_int (vx), pm); vx += unit_x; + while (vx >= 0) + vx -= src_width_fixed; *pd++ = core_combine_over_u_pixel_sse2 (s, d); if (pm) @@ -5190,14 +5192,22 @@ scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t* pd, __m128i tmp; uint32_t tmp1, tmp2, tmp3, tmp4; - tmp1 = ps[vx >> 16]; + tmp1 = *(ps + pixman_fixed_to_int (vx)); vx += unit_x; - tmp2 = ps[vx >> 16]; + while (vx >= 0) + vx -= src_width_fixed; + tmp2 = *(ps + pixman_fixed_to_int (vx)); vx += unit_x; - tmp3 = ps[vx >> 16]; + while (vx >= 0) + vx -= src_width_fixed; + tmp3 = *(ps + pixman_fixed_to_int (vx)); vx += unit_x; - tmp4 = ps[vx >> 16]; + while (vx >= 0) + vx -= src_width_fixed; + tmp4 = *(ps + pixman_fixed_to_int (vx)); vx += unit_x; + while (vx >= 0) + vx -= src_width_fixed; tmp = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1); @@ -5235,8 +5245,10 @@ scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t* pd, while (w) { d = *pd; - s = combine1 (ps + (vx >> 16), pm); + s = combine1 (ps + 
pixman_fixed_to_int (vx), pm); vx += unit_x; + while (vx >= 0) + vx -= src_width_fixed; *pd++ = core_combine_over_u_pixel_sse2 (s, d); if (pm) @@ -5255,6 +5267,9 @@ FAST_NEAREST_MAINLOOP (sse2_8888_8888_none_OVER, FAST_NEAREST_MAINLOOP (sse2_8888_8888_pad_OVER, scaled_nearest_scanline_sse2_8888_8888_OVER, uint32_t, uint32_t, PAD) +FAST_NEAREST_MAINLOOP (sse2_8888_8888_normal_OVER, + scaled_nearest_scanline_sse2_8888_8888_OVER, + uint32_t, uint32_t, NORMAL) static force_inline void scaled_nearest_scanline_sse2_8888_n_8888_OVER (const uint32_t * mask, @@ -5263,7 +5278,7 @@ scaled_nearest_scanline_sse2_8888_n_8888_OVER (const uint32_t * mask, int32_t w, pixman_fixed_t vx, pixman_fixed_t unit_x, - pixman_fixed_t max_vx, + pixman_fixed_t src_width_fixed, pixman_bool_t zero_src) { __m128i xmm_mask; @@ -5278,8 +5293,10 @@ scaled_nearest_scanline_sse2_8888_n_8888_OVER (const uint32_t * mask, while (w && (unsigned long)dst & 15) { - uint32_t s = src[pixman_fixed_to_int (vx)]; + uint32_t s = *(src + pixman_fixed_to_int (vx)); vx += unit_x; + while (vx >= 0) + vx -= src_width_fixed; if (s) { @@ -5301,14 +5318,22 @@ scaled_nearest_scanline_sse2_8888_n_8888_OVER (const uint32_t * mask, { uint32_t tmp1, tmp2, tmp3, tmp4; - tmp1 = src[pixman_fixed_to_int (vx)]; + tmp1 = *(src + pixman_fixed_to_int (vx)); vx += unit_x; - tmp2 = src[pixman_fixed_to_int (vx)]; + while (vx >= 0) + vx -= src_width_fixed; + tmp2 = *(src + pixman_fixed_to_int (vx)); vx += unit_x; - tmp3 = src[pixman_fixed_to_int (vx)]; + while (vx >= 0) + vx -= src_width_fixed; + tmp3 = *(src + pixman_fixed_to_int (vx)); vx += unit_x; - tmp4 = src[pixman_fixed_to_int (vx)]; + while (vx >= 0) + vx -= src_width_fixed; + tmp4 = *(src + pixman_fixed_to_int (vx)); vx += unit_x; + while (vx >= 0) + vx -= src_width_fixed; xmm_src = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1); @@ -5336,8 +5361,10 @@ scaled_nearest_scanline_sse2_8888_n_8888_OVER (const uint32_t * mask, while (w) { - uint32_t s = src[pixman_fixed_to_int (vx)]; + 
uint32_t s = *(src + pixman_fixed_to_int (vx)); vx += unit_x; + while (vx >= 0) + vx -= src_width_fixed; if (s) { @@ -5367,6 +5394,9 @@ FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_pad_OVER, FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER, scaled_nearest_scanline_sse2_8888_n_8888_OVER, uint32_t, uint32_t, uint32_t, NONE, TRUE, TRUE) +FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_normal_OVER, + scaled_nearest_scanline_sse2_8888_n_8888_OVER, + uint32_t, uint32_t, uint32_t, NORMAL, TRUE, TRUE) #define BMSK ((1 << BILINEAR_INTERPOLATION_BITS) - 1) @@ -5856,11 +5886,19 @@ static const pixman_fast_path_t sse2_fast_paths[] = SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888), SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888), SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888), + SIMPLE_NEAREST_FAST_PATH_NORMAL (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888), + SIMPLE_NEAREST_FAST_PATH_NORMAL (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888), + SIMPLE_NEAREST_FAST_PATH_NORMAL (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888), + SIMPLE_NEAREST_FAST_PATH_NORMAL (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888), SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888), SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888), SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888), SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888), + SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888), + SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888), + SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888), + SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888), SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, sse2_8888_8888), SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, sse2_8888_8888),