From: Siarhei Siamashka <siarhei.siamashka@gmail.com>
Date: Tue, 26 Jun 2012 02:36:52 +0000 (-0400)
Subject: Add scaled nearest repeat fast paths
X-Git-Tag: pixman-0.27.4~28
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=aff796d6cee4cb81f0352c2f7d0c994229bd5ca1;p=platform%2Fupstream%2Fpixman.git

Add scaled nearest repeat fast paths

Before this patch, it was often faster to scale and repeat
in two separate passes, because each pass could use a fast
path, whereas the single-pass approach took the slow path.
This patch adds fast paths for nearest scaling with NORMAL
repeat, so that the single-pass approach has competitive
performance.
---

diff --git a/pixman/pixman-arm-common.h b/pixman/pixman-arm-common.h
index f56264e..3a7cb2b 100644
--- a/pixman/pixman-arm-common.h
+++ b/pixman/pixman-arm-common.h
@@ -236,7 +236,8 @@ pixman_scaled_nearest_scanline_##name##_##op##_asm_##cputype (                \
                                                    dst_type *       dst,      \
                                                    const src_type * src,      \
                                                    pixman_fixed_t   vx,       \
-                                                   pixman_fixed_t   unit_x);  \
+                                                   pixman_fixed_t   unit_x,   \
+                                                   pixman_fixed_t   max_vx);  \
                                                                               \
 static force_inline void                                                      \
 scaled_nearest_scanline_##cputype##_##name##_##op (dst_type *       pd,       \
@@ -248,7 +249,8 @@ scaled_nearest_scanline_##cputype##_##name##_##op (dst_type *       pd,       \
                                                    pixman_bool_t    zero_src) \
 {                                                                             \
     pixman_scaled_nearest_scanline_##name##_##op##_asm_##cputype (w, pd, ps,  \
-                                                                  vx, unit_x);\
+                                                                  vx, unit_x, \
+                                                                  max_vx);    \
 }                                                                             \
                                                                               \
 FAST_NEAREST_MAINLOOP (cputype##_##name##_cover_##op,                         \
@@ -259,13 +261,17 @@ FAST_NEAREST_MAINLOOP (cputype##_##name##_none_##op,                          \
                        src_type, dst_type, NONE)                              \
 FAST_NEAREST_MAINLOOP (cputype##_##name##_pad_##op,                           \
                        scaled_nearest_scanline_##cputype##_##name##_##op,     \
-                       src_type, dst_type, PAD)
+                       src_type, dst_type, PAD)                               \
+FAST_NEAREST_MAINLOOP (cputype##_##name##_normal_##op,                        \
+                       scaled_nearest_scanline_##cputype##_##name##_##op,     \
+                       src_type, dst_type, NORMAL)
 
 /* Provide entries for the fast path table */
 #define PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH(op,s,d,func)                      \
     SIMPLE_NEAREST_FAST_PATH_COVER (op,s,d,func),                             \
     SIMPLE_NEAREST_FAST_PATH_NONE (op,s,d,func),                              \
-    SIMPLE_NEAREST_FAST_PATH_PAD (op,s,d,func)
+    SIMPLE_NEAREST_FAST_PATH_PAD (op,s,d,func),                               \
+    SIMPLE_NEAREST_FAST_PATH_NORMAL (op,s,d,func)
 
 #define PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_A8_DST(flags, cputype, name, op,   \
                                                   src_type, dst_type)         \
@@ -276,6 +282,7 @@ pixman_scaled_nearest_scanline_##name##_##op##_asm_##cputype (                \
                                                    const src_type * src,      \
                                                    pixman_fixed_t   vx,       \
                                                    pixman_fixed_t   unit_x,   \
+                                                   pixman_fixed_t   max_vx,   \
                                                    const uint8_t *  mask);    \
                                                                               \
 static force_inline void                                                      \
@@ -292,6 +299,7 @@ scaled_nearest_scanline_##cputype##_##name##_##op (const uint8_t *  mask,     \
 	return;                                                               \
     pixman_scaled_nearest_scanline_##name##_##op##_asm_##cputype (w, pd, ps,  \
                                                                   vx, unit_x, \
+                                                                  max_vx,     \
                                                                   mask);      \
 }                                                                             \
                                                                               \
@@ -303,13 +311,17 @@ FAST_NEAREST_MAINLOOP_COMMON (cputype##_##name##_none_##op,                   \
                               src_type, uint8_t, dst_type, NONE, TRUE, FALSE) \
 FAST_NEAREST_MAINLOOP_COMMON (cputype##_##name##_pad_##op,                    \
                               scaled_nearest_scanline_##cputype##_##name##_##op,\
-                              src_type, uint8_t, dst_type, PAD, TRUE, FALSE)
+                              src_type, uint8_t, dst_type, PAD, TRUE, FALSE)  \
+FAST_NEAREST_MAINLOOP_COMMON (cputype##_##name##_normal_##op,                 \
+                              scaled_nearest_scanline_##cputype##_##name##_##op,\
+                              src_type, uint8_t, dst_type, NORMAL, TRUE, FALSE)
 
 /* Provide entries for the fast path table */
 #define PIXMAN_ARM_SIMPLE_NEAREST_A8_MASK_FAST_PATH(op,s,d,func)              \
     SIMPLE_NEAREST_A8_MASK_FAST_PATH_COVER (op,s,d,func),                     \
     SIMPLE_NEAREST_A8_MASK_FAST_PATH_NONE (op,s,d,func),                      \
-    SIMPLE_NEAREST_A8_MASK_FAST_PATH_PAD (op,s,d,func)
+    SIMPLE_NEAREST_A8_MASK_FAST_PATH_PAD (op,s,d,func),                       \
+    SIMPLE_NEAREST_A8_MASK_FAST_PATH_NORMAL (op,s,d,func)
 
 /*****************************************************************************/
 
diff --git a/pixman/pixman-arm-neon-asm.h b/pixman/pixman-arm-neon-asm.h
index 97adc6a..1673b08 100644
--- a/pixman/pixman-arm-neon-asm.h
+++ b/pixman/pixman-arm-neon-asm.h
@@ -212,27 +212,39 @@
 .macro pixld1_s elem_size, reg1, mem_operand
 .if elem_size == 16
     mov     TMP1, VX, asr #16
-    add     VX, VX, UNIT_X
+    adds    VX, VX, UNIT_X
+5:  subpls  VX, VX, SRC_WIDTH_FIXED
+    bpl     5b
     add     TMP1, mem_operand, TMP1, asl #1
     mov     TMP2, VX, asr #16
-    add     VX, VX, UNIT_X
+    adds    VX, VX, UNIT_X
+5:  subpls  VX, VX, SRC_WIDTH_FIXED
+    bpl     5b
     add     TMP2, mem_operand, TMP2, asl #1
     vld1.16 {d&reg1&[0]}, [TMP1, :16]
     mov     TMP1, VX, asr #16
-    add     VX, VX, UNIT_X
+    adds    VX, VX, UNIT_X
+5:  subpls  VX, VX, SRC_WIDTH_FIXED
+    bpl     5b
     add     TMP1, mem_operand, TMP1, asl #1
     vld1.16 {d&reg1&[1]}, [TMP2, :16]
     mov     TMP2, VX, asr #16
-    add     VX, VX, UNIT_X
+    adds    VX, VX, UNIT_X
+5:  subpls  VX, VX, SRC_WIDTH_FIXED
+    bpl     5b
     add     TMP2, mem_operand, TMP2, asl #1
     vld1.16 {d&reg1&[2]}, [TMP1, :16]
     vld1.16 {d&reg1&[3]}, [TMP2, :16]
 .elseif elem_size == 32
     mov     TMP1, VX, asr #16
-    add     VX, VX, UNIT_X
+    adds    VX, VX, UNIT_X
+5:  subpls  VX, VX, SRC_WIDTH_FIXED
+    bpl     5b
     add     TMP1, mem_operand, TMP1, asl #2
     mov     TMP2, VX, asr #16
-    add     VX, VX, UNIT_X
+    adds    VX, VX, UNIT_X
+5:  subpls  VX, VX, SRC_WIDTH_FIXED
+    bpl     5b
     add     TMP2, mem_operand, TMP2, asl #2
     vld1.32 {d&reg1&[0]}, [TMP1, :32]
     vld1.32 {d&reg1&[1]}, [TMP2, :32]
@@ -242,7 +254,7 @@
 .endm
 
 .macro pixld2_s elem_size, reg1, reg2, mem_operand
-.if elem_size == 32
+.if 0 /* elem_size == 32 */
     mov     TMP1, VX, asr #16
     add     VX, VX, UNIT_X, asl #1
     add     TMP1, mem_operand, TMP1, asl #2
@@ -268,12 +280,16 @@
 .macro pixld0_s elem_size, reg1, idx, mem_operand
 .if elem_size == 16
     mov     TMP1, VX, asr #16
-    add     VX, VX, UNIT_X
+    adds    VX, VX, UNIT_X
+5:  subpls  VX, VX, SRC_WIDTH_FIXED
+    bpl     5b
     add     TMP1, mem_operand, TMP1, asl #1
     vld1.16 {d&reg1&[idx]}, [TMP1, :16]
 .elseif elem_size == 32
     mov     TMP1, VX, asr #16
-    add     VX, VX, UNIT_X
+    adds    VX, VX, UNIT_X
+5:  subpls  VX, VX, SRC_WIDTH_FIXED
+    bpl     5b
     add     TMP1, mem_operand, TMP1, asl #2
     vld1.32 {d&reg1&[idx]}, [TMP1, :32]
 .endif
@@ -964,15 +980,17 @@ fname:
     TMP1        .req        r4
     TMP2        .req        r5
     DST_R       .req        r6
+    SRC_WIDTH_FIXED .req        r7
 
     .macro pixld_src x:vararg
         pixld_s x
     .endm
 
     ldr         UNIT_X, [sp]
-    push        {r4-r6, lr}
+    push        {r4-r8, lr}
+    ldr         SRC_WIDTH_FIXED, [sp, #(24 + 4)]
     .if mask_bpp != 0
-    ldr         MASK, [sp, #(16 + 4)]
+    ldr         MASK, [sp, #(24 + 8)]
     .endif
 .else
     /*
@@ -1044,7 +1062,7 @@ fname:
 
     cleanup
 .if use_nearest_scaling != 0
-    pop         {r4-r6, pc}  /* exit */
+    pop         {r4-r8, pc}  /* exit */
 .else
     bx          lr  /* exit */
 .endif
@@ -1058,7 +1076,7 @@ fname:
     cleanup
 
 .if use_nearest_scaling != 0
-    pop         {r4-r6, pc}  /* exit */
+    pop         {r4-r8, pc}  /* exit */
 
     .unreq      DST_R
     .unreq      SRC
@@ -1069,6 +1087,7 @@ fname:
     .unreq      TMP2
     .unreq      DST_W
     .unreq      MASK
+    .unreq      SRC_WIDTH_FIXED
 
 .else
     bx          lr  /* exit */
diff --git a/pixman/pixman-arm-simd-asm.S b/pixman/pixman-arm-simd-asm.S
index 8fe1b50..b438001 100644
--- a/pixman/pixman-arm-simd-asm.S
+++ b/pixman/pixman-arm-simd-asm.S
@@ -355,49 +355,57 @@ pixman_asm_function pixman_composite_over_n_8_8888_asm_armv6
                                       prefetch_braking_distance
 
 pixman_asm_function fname
-	W	.req	r0
-	DST	.req	r1
-	SRC	.req	r2
-	VX	.req	r3
-	UNIT_X	.req	ip
-	TMP1	.req	r4
-	TMP2	.req	r5
-	VXMASK	.req	r6
-	PF_OFFS	.req	r7
+	W		.req	r0
+	DST		.req	r1
+	SRC		.req	r2
+	VX		.req	r3
+	UNIT_X		.req	ip
+	TMP1		.req	r4
+	TMP2		.req	r5
+	VXMASK		.req	r6
+	PF_OFFS		.req	r7
+	SRC_WIDTH_FIXED	.req	r8
 
 	ldr	UNIT_X, [sp]
-	push	{r4, r5, r6, r7}
+	push	{r4, r5, r6, r7, r8, r10}
 	mvn	VXMASK, #((1 << bpp_shift) - 1)
+	ldr	SRC_WIDTH_FIXED, [sp, #28]
 
 	/* define helper macro */
 	.macro	scale_2_pixels
 		ldr&t	TMP1, [SRC, TMP1]
-		and	TMP2, VXMASK, VX, lsr #(16 - bpp_shift)
-		add	VX, VX, UNIT_X
+		and	TMP2, VXMASK, VX, asr #(16 - bpp_shift)
+		adds	VX, VX, UNIT_X
 		str&t	TMP1, [DST], #(1 << bpp_shift)
+9:		subpls	VX, VX, SRC_WIDTH_FIXED
+		bpl	9b
 
 		ldr&t	TMP2, [SRC, TMP2]
-		and	TMP1, VXMASK, VX, lsr #(16 - bpp_shift)
-		add	VX, VX, UNIT_X
+		and	TMP1, VXMASK, VX, asr #(16 - bpp_shift)
+		adds	VX, VX, UNIT_X
 		str&t	TMP2, [DST], #(1 << bpp_shift)
+9:		subpls	VX, VX, SRC_WIDTH_FIXED
+		bpl	9b
 	.endm
 
 	/* now do the scaling */
-	and	TMP1, VXMASK, VX, lsr #(16 - bpp_shift)
-	add	VX, VX, UNIT_X
+	and	TMP1, VXMASK, VX, asr #(16 - bpp_shift)
+	adds	VX, VX, UNIT_X
+9:	subpls	VX, VX, SRC_WIDTH_FIXED
+	bpl	9b
 	subs	W, W, #(8 + prefetch_braking_distance)
 	blt	2f
 	/* calculate prefetch offset */
 	mov	PF_OFFS, #prefetch_distance
 	mla	PF_OFFS, UNIT_X, PF_OFFS, VX
 1:	/* main loop, process 8 pixels per iteration with prefetch */
-	subs	W, W, #8
+	pld	[SRC, PF_OFFS, asr #(16 - bpp_shift)]
 	add	PF_OFFS, UNIT_X, lsl #3
 	scale_2_pixels
 	scale_2_pixels
 	scale_2_pixels
 	scale_2_pixels
-	pld	[SRC, PF_OFFS, lsr #(16 - bpp_shift)]
+	subs	W, W, #8
 	bge	1b
 2:
 	subs	W, W, #(4 - 8 - prefetch_braking_distance)
@@ -426,8 +434,9 @@ pixman_asm_function fname
 	.unreq	TMP2
 	.unreq	VXMASK
 	.unreq	PF_OFFS
+	.unreq  SRC_WIDTH_FIXED
 	/* return */
-	pop	{r4, r5, r6, r7}
+	pop	{r4, r5, r6, r7, r8, r10}
 	bx	lr
 .endfunc
 .endm
diff --git a/pixman/pixman-fast-path.c b/pixman/pixman-fast-path.c
index 86ed821..22bfd30 100644
--- a/pixman/pixman-fast-path.c
+++ b/pixman/pixman-fast-path.c
@@ -1415,13 +1415,13 @@ scaled_nearest_scanline_565_565_SRC (uint16_t *       dst,
     uint16_t tmp1, tmp2, tmp3, tmp4;
     while ((w -= 4) >= 0)
     {
-	tmp1 = src[pixman_fixed_to_int (vx)];
+	tmp1 = *(src + pixman_fixed_to_int (vx));
 	vx += unit_x;
-	tmp2 = src[pixman_fixed_to_int (vx)];
+	tmp2 = *(src + pixman_fixed_to_int (vx));
 	vx += unit_x;
-	tmp3 = src[pixman_fixed_to_int (vx)];
+	tmp3 = *(src + pixman_fixed_to_int (vx));
 	vx += unit_x;
-	tmp4 = src[pixman_fixed_to_int (vx)];
+	tmp4 = *(src + pixman_fixed_to_int (vx));
 	vx += unit_x;
 	*dst++ = tmp1;
 	*dst++ = tmp2;
@@ -1430,15 +1430,15 @@ scaled_nearest_scanline_565_565_SRC (uint16_t *       dst,
     }
     if (w & 2)
     {
-	tmp1 = src[pixman_fixed_to_int (vx)];
+	tmp1 = *(src + pixman_fixed_to_int (vx));
 	vx += unit_x;
-	tmp2 = src[pixman_fixed_to_int (vx)];
+	tmp2 = *(src + pixman_fixed_to_int (vx));
 	vx += unit_x;
 	*dst++ = tmp1;
 	*dst++ = tmp2;
     }
     if (w & 1)
-	*dst++ = src[pixman_fixed_to_int (vx)];
+	*dst = *(src + pixman_fixed_to_int (vx));
 }
 
 FAST_NEAREST_MAINLOOP (565_565_cover_SRC,
diff --git a/pixman/pixman-inlines.h b/pixman/pixman-inlines.h
index 3a3c658..7f2e404 100644
--- a/pixman/pixman-inlines.h
+++ b/pixman/pixman-inlines.h
@@ -271,7 +271,7 @@ scanline_func_name (dst_type_t       *dst,							\
 		    int32_t           w,							\
 		    pixman_fixed_t    vx,							\
 		    pixman_fixed_t    unit_x,							\
-		    pixman_fixed_t    max_vx,							\
+		    pixman_fixed_t    src_width_fixed,						\
 		    pixman_bool_t     fully_transparent_src)					\
 {												\
 	uint32_t   d;										\
@@ -287,25 +287,25 @@ scanline_func_name (dst_type_t       *dst,							\
 												\
 	while ((w -= 2) >= 0)									\
 	{											\
-	    x1 = vx >> 16;									\
+	    x1 = pixman_fixed_to_int (vx);							\
 	    vx += unit_x;									\
 	    if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL)				\
 	    {											\
 		/* This works because we know that unit_x is positive */			\
-		while (vx >= max_vx)								\
-		    vx -= max_vx;								\
+		while (vx >= 0)									\
+		    vx -= src_width_fixed;							\
 	    }											\
-	    s1 = src[x1];									\
+	    s1 = *(src + x1);									\
 												\
-	    x2 = vx >> 16;									\
+	    x2 = pixman_fixed_to_int (vx);							\
 	    vx += unit_x;									\
 	    if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL)				\
 	    {											\
 		/* This works because we know that unit_x is positive */			\
-		while (vx >= max_vx)								\
-		    vx -= max_vx;								\
+		while (vx >= 0)									\
+		    vx -= src_width_fixed;							\
 	    }											\
-	    s2 = src[x2];									\
+	    s2 = *(src + x2);									\
 												\
 	    if (PIXMAN_OP_ ## OP == PIXMAN_OP_OVER)						\
 	    {											\
@@ -349,8 +349,8 @@ scanline_func_name (dst_type_t       *dst,							\
 												\
 	if (w & 1)										\
 	{											\
-	    x1 = vx >> 16;									\
-	    s1 = src[x1];									\
+	    x1 = pixman_fixed_to_int (vx);							\
+	    s1 = *(src + x1);									\
 												\
 	    if (PIXMAN_OP_ ## OP == PIXMAN_OP_OVER)						\
 	    {											\
@@ -388,7 +388,7 @@ fast_composite_scaled_nearest  ## scale_func_name (pixman_implementation_t *imp,
     mask_type_t *mask_line;									\
     src_type_t *src_first_line;									\
     int       y;										\
-    pixman_fixed_t max_vx = INT32_MAX; /* suppress uninitialized variable warning */		\
+    pixman_fixed_t src_width_fixed = pixman_int_to_fixed (src_image->bits.width);		\
     pixman_fixed_t max_vy;									\
     pixman_vector_t v;										\
     pixman_fixed_t vx, vy;									\
@@ -434,11 +434,10 @@ fast_composite_scaled_nearest  ## scale_func_name (pixman_implementation_t *imp,
 												\
     if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL)					\
     {												\
-	/* Clamp repeating positions inside the actual samples */				\
-	max_vx = src_image->bits.width << 16;							\
-	max_vy = src_image->bits.height << 16;							\
+	max_vy = pixman_int_to_fixed (src_image->bits.height);					\
 												\
-	repeat (PIXMAN_REPEAT_NORMAL, &vx, max_vx);						\
+	/* Clamp repeating positions inside the actual samples */				\
+	repeat (PIXMAN_REPEAT_NORMAL, &vx, src_width_fixed);					\
 	repeat (PIXMAN_REPEAT_NORMAL, &vy, max_vy);						\
     }												\
 												\
@@ -460,7 +459,7 @@ fast_composite_scaled_nearest  ## scale_func_name (pixman_implementation_t *imp,
 	    mask_line += mask_stride;								\
 	}											\
 												\
-	y = vy >> 16;										\
+	y = pixman_fixed_to_int (vy);								\
 	vy += unit_y;										\
 	if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL)				\
 	    repeat (PIXMAN_REPEAT_NORMAL, &vy, max_vy);						\
@@ -470,18 +469,21 @@ fast_composite_scaled_nearest  ## scale_func_name (pixman_implementation_t *imp,
 	    src = src_first_line + src_stride * y;						\
 	    if (left_pad > 0)									\
 	    {											\
-		scanline_func (mask, dst, src, left_pad, 0, 0, 0, FALSE);			\
+		scanline_func (mask, dst,							\
+			       src + src_image->bits.width - src_image->bits.width + 1,		\
+			       left_pad, -pixman_fixed_e, 0, src_width_fixed, FALSE);		\
 	    }											\
 	    if (width > 0)									\
 	    {											\
 		scanline_func (mask + (mask_is_solid ? 0 : left_pad),				\
-			       dst + left_pad, src, width, vx, unit_x, 0, FALSE);		\
+			       dst + left_pad, src + src_image->bits.width, width,		\
+			       vx - src_width_fixed, unit_x, src_width_fixed, FALSE);		\
 	    }											\
 	    if (right_pad > 0)									\
 	    {											\
 		scanline_func (mask + (mask_is_solid ? 0 : left_pad + width),			\
-			       dst + left_pad + width, src + src_image->bits.width - 1,		\
-			       right_pad, 0, 0, 0, FALSE);					\
+			       dst + left_pad + width, src + src_image->bits.width,		\
+			       right_pad, -pixman_fixed_e, 0, src_width_fixed, FALSE);		\
 	    }											\
 	}											\
 	else if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NONE)				\
@@ -489,29 +491,34 @@ fast_composite_scaled_nearest  ## scale_func_name (pixman_implementation_t *imp,
 	    static const src_type_t zero[1] = { 0 };						\
 	    if (y < 0 || y >= src_image->bits.height)						\
 	    {											\
-		scanline_func (mask, dst, zero, left_pad + width + right_pad, 0, 0, 0, TRUE);	\
+		scanline_func (mask, dst, zero + 1, left_pad + width + right_pad,		\
+			       -pixman_fixed_e, 0, src_width_fixed, TRUE);			\
 		continue;									\
 	    }											\
 	    src = src_first_line + src_stride * y;						\
 	    if (left_pad > 0)									\
 	    {											\
-		scanline_func (mask, dst, zero, left_pad, 0, 0, 0, TRUE);			\
+		scanline_func (mask, dst, zero + 1, left_pad,					\
+			       -pixman_fixed_e, 0, src_width_fixed, TRUE);			\
 	    }											\
 	    if (width > 0)									\
 	    {											\
 		scanline_func (mask + (mask_is_solid ? 0 : left_pad),				\
-			       dst + left_pad, src, width, vx, unit_x, 0, FALSE);		\
+			       dst + left_pad, src + src_image->bits.width, width,		\
+			       vx - src_width_fixed, unit_x, src_width_fixed, FALSE);		\
 	    }											\
 	    if (right_pad > 0)									\
 	    {											\
 		scanline_func (mask + (mask_is_solid ? 0 : left_pad + width),			\
-			       dst + left_pad + width, zero, right_pad, 0, 0, 0, TRUE);		\
+			       dst + left_pad + width, zero + 1, right_pad,			\
+			       -pixman_fixed_e, 0, src_width_fixed, TRUE);			\
 	    }											\
 	}											\
 	else											\
 	{											\
 	    src = src_first_line + src_stride * y;						\
-	    scanline_func (mask, dst, src, width, vx, unit_x, max_vx, FALSE);			\
+	    scanline_func (mask, dst, src + src_image->bits.width, width, vx - src_width_fixed,	\
+			   unit_x, src_width_fixed, FALSE);					\
 	}											\
     }												\
 }
diff --git a/pixman/pixman-sse2.c b/pixman/pixman-sse2.c
index cf21ef8..efed310 100644
--- a/pixman/pixman-sse2.c
+++ b/pixman/pixman-sse2.c
@@ -5159,7 +5159,7 @@ scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t*       pd,
                                              int32_t         w,
                                              pixman_fixed_t  vx,
                                              pixman_fixed_t  unit_x,
-                                             pixman_fixed_t  max_vx,
+                                             pixman_fixed_t  src_width_fixed,
                                              pixman_bool_t   fully_transparent_src)
 {
     uint32_t s, d;
@@ -5176,8 +5176,10 @@ scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t*       pd,
     while (w && ((unsigned long)pd & 15))
     {
 	d = *pd;
-	s = combine1 (ps + (vx >> 16), pm);
+	s = combine1 (ps + pixman_fixed_to_int (vx), pm);
 	vx += unit_x;
+	while (vx >= 0)
+	    vx -= src_width_fixed;
 
 	*pd++ = core_combine_over_u_pixel_sse2 (s, d);
 	if (pm)
@@ -5190,14 +5192,22 @@ scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t*       pd,
 	__m128i tmp;
 	uint32_t tmp1, tmp2, tmp3, tmp4;
 
-	tmp1 = ps[vx >> 16];
+	tmp1 = *(ps + pixman_fixed_to_int (vx));
 	vx += unit_x;
-	tmp2 = ps[vx >> 16];
+	while (vx >= 0)
+	    vx -= src_width_fixed;
+	tmp2 = *(ps + pixman_fixed_to_int (vx));
 	vx += unit_x;
-	tmp3 = ps[vx >> 16];
+	while (vx >= 0)
+	    vx -= src_width_fixed;
+	tmp3 = *(ps + pixman_fixed_to_int (vx));
 	vx += unit_x;
-	tmp4 = ps[vx >> 16];
+	while (vx >= 0)
+	    vx -= src_width_fixed;
+	tmp4 = *(ps + pixman_fixed_to_int (vx));
 	vx += unit_x;
+	while (vx >= 0)
+	    vx -= src_width_fixed;
 
 	tmp = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);
 
@@ -5235,8 +5245,10 @@ scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t*       pd,
     while (w)
     {
 	d = *pd;
-	s = combine1 (ps + (vx >> 16), pm);
+	s = combine1 (ps + pixman_fixed_to_int (vx), pm);
 	vx += unit_x;
+	while (vx >= 0)
+	    vx -= src_width_fixed;
 
 	*pd++ = core_combine_over_u_pixel_sse2 (s, d);
 	if (pm)
@@ -5255,6 +5267,9 @@ FAST_NEAREST_MAINLOOP (sse2_8888_8888_none_OVER,
 FAST_NEAREST_MAINLOOP (sse2_8888_8888_pad_OVER,
 		       scaled_nearest_scanline_sse2_8888_8888_OVER,
 		       uint32_t, uint32_t, PAD)
+FAST_NEAREST_MAINLOOP (sse2_8888_8888_normal_OVER,
+		       scaled_nearest_scanline_sse2_8888_8888_OVER,
+		       uint32_t, uint32_t, NORMAL)
 
 static force_inline void
 scaled_nearest_scanline_sse2_8888_n_8888_OVER (const uint32_t * mask,
@@ -5263,7 +5278,7 @@ scaled_nearest_scanline_sse2_8888_n_8888_OVER (const uint32_t * mask,
 					       int32_t          w,
 					       pixman_fixed_t   vx,
 					       pixman_fixed_t   unit_x,
-					       pixman_fixed_t   max_vx,
+					       pixman_fixed_t   src_width_fixed,
 					       pixman_bool_t    zero_src)
 {
     __m128i xmm_mask;
@@ -5278,8 +5293,10 @@ scaled_nearest_scanline_sse2_8888_n_8888_OVER (const uint32_t * mask,
 
     while (w && (unsigned long)dst & 15)
     {
-	uint32_t s = src[pixman_fixed_to_int (vx)];
+	uint32_t s = *(src + pixman_fixed_to_int (vx));
 	vx += unit_x;
+	while (vx >= 0)
+	    vx -= src_width_fixed;
 
 	if (s)
 	{
@@ -5301,14 +5318,22 @@ scaled_nearest_scanline_sse2_8888_n_8888_OVER (const uint32_t * mask,
     {
 	uint32_t tmp1, tmp2, tmp3, tmp4;
 
-	tmp1 = src[pixman_fixed_to_int (vx)];
+	tmp1 = *(src + pixman_fixed_to_int (vx));
 	vx += unit_x;
-	tmp2 = src[pixman_fixed_to_int (vx)];
+	while (vx >= 0)
+	    vx -= src_width_fixed;
+	tmp2 = *(src + pixman_fixed_to_int (vx));
 	vx += unit_x;
-	tmp3 = src[pixman_fixed_to_int (vx)];
+	while (vx >= 0)
+	    vx -= src_width_fixed;
+	tmp3 = *(src + pixman_fixed_to_int (vx));
 	vx += unit_x;
-	tmp4 = src[pixman_fixed_to_int (vx)];
+	while (vx >= 0)
+	    vx -= src_width_fixed;
+	tmp4 = *(src + pixman_fixed_to_int (vx));
 	vx += unit_x;
+	while (vx >= 0)
+	    vx -= src_width_fixed;
 
 	xmm_src = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);
 
@@ -5336,8 +5361,10 @@ scaled_nearest_scanline_sse2_8888_n_8888_OVER (const uint32_t * mask,
 
     while (w)
     {
-	uint32_t s = src[pixman_fixed_to_int (vx)];
+	uint32_t s = *(src + pixman_fixed_to_int (vx));
 	vx += unit_x;
+	while (vx >= 0)
+	    vx -= src_width_fixed;
 
 	if (s)
 	{
@@ -5367,6 +5394,9 @@ FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_pad_OVER,
 FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER,
 			      scaled_nearest_scanline_sse2_8888_n_8888_OVER,
 			      uint32_t, uint32_t, uint32_t, NONE, TRUE, TRUE)
+FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_normal_OVER,
+			      scaled_nearest_scanline_sse2_8888_n_8888_OVER,
+			      uint32_t, uint32_t, uint32_t, NORMAL, TRUE, TRUE)
 
 #define BMSK ((1 << BILINEAR_INTERPOLATION_BITS) - 1)
 
@@ -5856,11 +5886,19 @@ static const pixman_fast_path_t sse2_fast_paths[] =
     SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
     SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
     SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
+    SIMPLE_NEAREST_FAST_PATH_NORMAL (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
+    SIMPLE_NEAREST_FAST_PATH_NORMAL (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
+    SIMPLE_NEAREST_FAST_PATH_NORMAL (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
+    SIMPLE_NEAREST_FAST_PATH_NORMAL (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
 
     SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888),
     SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888),
     SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888),
     SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888),
+    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888),
+    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888),
+    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888),
+    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888),
 
     SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
     SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, sse2_8888_8888),