Add scaled nearest repeat fast paths
author: Siarhei Siamashka <siarhei.siamashka@gmail.com>
Tue, 26 Jun 2012 02:36:52 +0000 (22:36 -0400)
committer: Søren Sandmann Pedersen <ssp@redhat.com>
Wed, 26 Sep 2012 04:03:10 +0000 (00:03 -0400)
Before this patch, it was often faster to scale and repeat
in two passes, because each pass could use a fast path, whereas
the single-pass approach fell back to the slow general path.
This patch adds scaled nearest fast paths for NORMAL repeat,
so that the single-pass approach has competitive performance.

pixman/pixman-arm-common.h
pixman/pixman-arm-neon-asm.h
pixman/pixman-arm-simd-asm.S
pixman/pixman-fast-path.c
pixman/pixman-inlines.h
pixman/pixman-sse2.c

index f56264e..3a7cb2b 100644 (file)
@@ -236,7 +236,8 @@ pixman_scaled_nearest_scanline_##name##_##op##_asm_##cputype (                \
                                                    dst_type *       dst,      \
                                                    const src_type * src,      \
                                                    pixman_fixed_t   vx,       \
-                                                   pixman_fixed_t   unit_x);  \
+                                                   pixman_fixed_t   unit_x,   \
+                                                   pixman_fixed_t   max_vx);  \
                                                                               \
 static force_inline void                                                      \
 scaled_nearest_scanline_##cputype##_##name##_##op (dst_type *       pd,       \
@@ -248,7 +249,8 @@ scaled_nearest_scanline_##cputype##_##name##_##op (dst_type *       pd,       \
                                                    pixman_bool_t    zero_src) \
 {                                                                             \
     pixman_scaled_nearest_scanline_##name##_##op##_asm_##cputype (w, pd, ps,  \
-                                                                  vx, unit_x);\
+                                                                  vx, unit_x, \
+                                                                  max_vx);    \
 }                                                                             \
                                                                               \
 FAST_NEAREST_MAINLOOP (cputype##_##name##_cover_##op,                         \
@@ -259,13 +261,17 @@ FAST_NEAREST_MAINLOOP (cputype##_##name##_none_##op,                          \
                        src_type, dst_type, NONE)                              \
 FAST_NEAREST_MAINLOOP (cputype##_##name##_pad_##op,                           \
                        scaled_nearest_scanline_##cputype##_##name##_##op,     \
-                       src_type, dst_type, PAD)
+                       src_type, dst_type, PAD)                               \
+FAST_NEAREST_MAINLOOP (cputype##_##name##_normal_##op,                        \
+                       scaled_nearest_scanline_##cputype##_##name##_##op,     \
+                       src_type, dst_type, NORMAL)
 
 /* Provide entries for the fast path table */
 #define PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH(op,s,d,func)                      \
     SIMPLE_NEAREST_FAST_PATH_COVER (op,s,d,func),                             \
     SIMPLE_NEAREST_FAST_PATH_NONE (op,s,d,func),                              \
-    SIMPLE_NEAREST_FAST_PATH_PAD (op,s,d,func)
+    SIMPLE_NEAREST_FAST_PATH_PAD (op,s,d,func),                               \
+    SIMPLE_NEAREST_FAST_PATH_NORMAL (op,s,d,func)
 
 #define PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_A8_DST(flags, cputype, name, op,   \
                                                   src_type, dst_type)         \
@@ -276,6 +282,7 @@ pixman_scaled_nearest_scanline_##name##_##op##_asm_##cputype (                \
                                                    const src_type * src,      \
                                                    pixman_fixed_t   vx,       \
                                                    pixman_fixed_t   unit_x,   \
+                                                   pixman_fixed_t   max_vx,   \
                                                    const uint8_t *  mask);    \
                                                                               \
 static force_inline void                                                      \
@@ -292,6 +299,7 @@ scaled_nearest_scanline_##cputype##_##name##_##op (const uint8_t *  mask,     \
        return;                                                               \
     pixman_scaled_nearest_scanline_##name##_##op##_asm_##cputype (w, pd, ps,  \
                                                                   vx, unit_x, \
+                                                                  max_vx,     \
                                                                   mask);      \
 }                                                                             \
                                                                               \
@@ -303,13 +311,17 @@ FAST_NEAREST_MAINLOOP_COMMON (cputype##_##name##_none_##op,                   \
                               src_type, uint8_t, dst_type, NONE, TRUE, FALSE) \
 FAST_NEAREST_MAINLOOP_COMMON (cputype##_##name##_pad_##op,                    \
                               scaled_nearest_scanline_##cputype##_##name##_##op,\
-                              src_type, uint8_t, dst_type, PAD, TRUE, FALSE)
+                              src_type, uint8_t, dst_type, PAD, TRUE, FALSE)  \
+FAST_NEAREST_MAINLOOP_COMMON (cputype##_##name##_normal_##op,                 \
+                              scaled_nearest_scanline_##cputype##_##name##_##op,\
+                              src_type, uint8_t, dst_type, NORMAL, TRUE, FALSE)
 
 /* Provide entries for the fast path table */
 #define PIXMAN_ARM_SIMPLE_NEAREST_A8_MASK_FAST_PATH(op,s,d,func)              \
     SIMPLE_NEAREST_A8_MASK_FAST_PATH_COVER (op,s,d,func),                     \
     SIMPLE_NEAREST_A8_MASK_FAST_PATH_NONE (op,s,d,func),                      \
-    SIMPLE_NEAREST_A8_MASK_FAST_PATH_PAD (op,s,d,func)
+    SIMPLE_NEAREST_A8_MASK_FAST_PATH_PAD (op,s,d,func),                       \
+    SIMPLE_NEAREST_A8_MASK_FAST_PATH_NORMAL (op,s,d,func)
 
 /*****************************************************************************/
 
index 97adc6a..1673b08 100644 (file)
 .macro pixld1_s elem_size, reg1, mem_operand
 .if elem_size == 16
     mov     TMP1, VX, asr #16
-    add     VX, VX, UNIT_X
+    adds    VX, VX, UNIT_X
+5:  subpls  VX, VX, SRC_WIDTH_FIXED
+    bpl     5b
     add     TMP1, mem_operand, TMP1, asl #1
     mov     TMP2, VX, asr #16
-    add     VX, VX, UNIT_X
+    adds    VX, VX, UNIT_X
+5:  subpls  VX, VX, SRC_WIDTH_FIXED
+    bpl     5b
     add     TMP2, mem_operand, TMP2, asl #1
     vld1.16 {d&reg1&[0]}, [TMP1, :16]
     mov     TMP1, VX, asr #16
-    add     VX, VX, UNIT_X
+    adds    VX, VX, UNIT_X
+5:  subpls  VX, VX, SRC_WIDTH_FIXED
+    bpl     5b
     add     TMP1, mem_operand, TMP1, asl #1
     vld1.16 {d&reg1&[1]}, [TMP2, :16]
     mov     TMP2, VX, asr #16
-    add     VX, VX, UNIT_X
+    adds    VX, VX, UNIT_X
+5:  subpls  VX, VX, SRC_WIDTH_FIXED
+    bpl     5b
     add     TMP2, mem_operand, TMP2, asl #1
     vld1.16 {d&reg1&[2]}, [TMP1, :16]
     vld1.16 {d&reg1&[3]}, [TMP2, :16]
 .elseif elem_size == 32
     mov     TMP1, VX, asr #16
-    add     VX, VX, UNIT_X
+    adds    VX, VX, UNIT_X
+5:  subpls  VX, VX, SRC_WIDTH_FIXED
+    bpl     5b
     add     TMP1, mem_operand, TMP1, asl #2
     mov     TMP2, VX, asr #16
-    add     VX, VX, UNIT_X
+    adds    VX, VX, UNIT_X
+5:  subpls  VX, VX, SRC_WIDTH_FIXED
+    bpl     5b
     add     TMP2, mem_operand, TMP2, asl #2
     vld1.32 {d&reg1&[0]}, [TMP1, :32]
     vld1.32 {d&reg1&[1]}, [TMP2, :32]
 .endm
 
 .macro pixld2_s elem_size, reg1, reg2, mem_operand
-.if elem_size == 32
+.if 0 /* elem_size == 32 */
     mov     TMP1, VX, asr #16
     add     VX, VX, UNIT_X, asl #1
     add     TMP1, mem_operand, TMP1, asl #2
 .macro pixld0_s elem_size, reg1, idx, mem_operand
 .if elem_size == 16
     mov     TMP1, VX, asr #16
-    add     VX, VX, UNIT_X
+    adds    VX, VX, UNIT_X
+5:  subpls  VX, VX, SRC_WIDTH_FIXED
+    bpl     5b
     add     TMP1, mem_operand, TMP1, asl #1
     vld1.16 {d&reg1&[idx]}, [TMP1, :16]
 .elseif elem_size == 32
     mov     TMP1, VX, asr #16
-    add     VX, VX, UNIT_X
+    adds    VX, VX, UNIT_X
+5:  subpls  VX, VX, SRC_WIDTH_FIXED
+    bpl     5b
     add     TMP1, mem_operand, TMP1, asl #2
     vld1.32 {d&reg1&[idx]}, [TMP1, :32]
 .endif
@@ -964,15 +980,17 @@ fname:
     TMP1        .req        r4
     TMP2        .req        r5
     DST_R       .req        r6
+    SRC_WIDTH_FIXED .req        r7
 
     .macro pixld_src x:vararg
         pixld_s x
     .endm
 
     ldr         UNIT_X, [sp]
-    push        {r4-r6, lr}
+    push        {r4-r8, lr}
+    ldr         SRC_WIDTH_FIXED, [sp, #(24 + 4)]
     .if mask_bpp != 0
-    ldr         MASK, [sp, #(16 + 4)]
+    ldr         MASK, [sp, #(24 + 8)]
     .endif
 .else
     /*
@@ -1044,7 +1062,7 @@ fname:
 
     cleanup
 .if use_nearest_scaling != 0
-    pop         {r4-r6, pc}  /* exit */
+    pop         {r4-r8, pc}  /* exit */
 .else
     bx          lr  /* exit */
 .endif
@@ -1058,7 +1076,7 @@ fname:
     cleanup
 
 .if use_nearest_scaling != 0
-    pop         {r4-r6, pc}  /* exit */
+    pop         {r4-r8, pc}  /* exit */
 
     .unreq      DST_R
     .unreq      SRC
@@ -1069,6 +1087,7 @@ fname:
     .unreq      TMP2
     .unreq      DST_W
     .unreq      MASK
+    .unreq      SRC_WIDTH_FIXED
 
 .else
     bx          lr  /* exit */
index 8fe1b50..b438001 100644 (file)
@@ -355,49 +355,57 @@ pixman_asm_function pixman_composite_over_n_8_8888_asm_armv6
                                       prefetch_braking_distance
 
 pixman_asm_function fname
-       W       .req    r0
-       DST     .req    r1
-       SRC     .req    r2
-       VX      .req    r3
-       UNIT_X  .req    ip
-       TMP1    .req    r4
-       TMP2    .req    r5
-       VXMASK  .req    r6
-       PF_OFFS .req    r7
+       W               .req    r0
+       DST             .req    r1
+       SRC             .req    r2
+       VX              .req    r3
+       UNIT_X          .req    ip
+       TMP1            .req    r4
+       TMP2            .req    r5
+       VXMASK          .req    r6
+       PF_OFFS         .req    r7
+       SRC_WIDTH_FIXED .req    r8
 
        ldr     UNIT_X, [sp]
-       push    {r4, r5, r6, r7}
+       push    {r4, r5, r6, r7, r8, r10}
        mvn     VXMASK, #((1 << bpp_shift) - 1)
+       ldr     SRC_WIDTH_FIXED, [sp, #28]
 
        /* define helper macro */
        .macro  scale_2_pixels
                ldr&t   TMP1, [SRC, TMP1]
-               and     TMP2, VXMASK, VX, lsr #(16 - bpp_shift)
-               add     VX, VX, UNIT_X
+               and     TMP2, VXMASK, VX, asr #(16 - bpp_shift)
+               adds    VX, VX, UNIT_X
                str&t   TMP1, [DST], #(1 << bpp_shift)
+9:             subpls  VX, VX, SRC_WIDTH_FIXED
+               bpl     9b
 
                ldr&t   TMP2, [SRC, TMP2]
-               and     TMP1, VXMASK, VX, lsr #(16 - bpp_shift)
-               add     VX, VX, UNIT_X
+               and     TMP1, VXMASK, VX, asr #(16 - bpp_shift)
+               adds    VX, VX, UNIT_X
                str&t   TMP2, [DST], #(1 << bpp_shift)
+9:             subpls  VX, VX, SRC_WIDTH_FIXED
+               bpl     9b
        .endm
 
        /* now do the scaling */
-       and     TMP1, VXMASK, VX, lsr #(16 - bpp_shift)
-       add     VX, VX, UNIT_X
+       and     TMP1, VXMASK, VX, asr #(16 - bpp_shift)
+       adds    VX, VX, UNIT_X
+9:     subpls  VX, VX, SRC_WIDTH_FIXED
+       bpl     9b
        subs    W, W, #(8 + prefetch_braking_distance)
        blt     2f
        /* calculate prefetch offset */
        mov     PF_OFFS, #prefetch_distance
        mla     PF_OFFS, UNIT_X, PF_OFFS, VX
 1:     /* main loop, process 8 pixels per iteration with prefetch */
-       subs    W, W, #8
+       pld     [SRC, PF_OFFS, asr #(16 - bpp_shift)]
        add     PF_OFFS, UNIT_X, lsl #3
        scale_2_pixels
        scale_2_pixels
        scale_2_pixels
        scale_2_pixels
-       pld     [SRC, PF_OFFS, lsr #(16 - bpp_shift)]
+       subs    W, W, #8
        bge     1b
 2:
        subs    W, W, #(4 - 8 - prefetch_braking_distance)
@@ -426,8 +434,9 @@ pixman_asm_function fname
        .unreq  TMP2
        .unreq  VXMASK
        .unreq  PF_OFFS
+       .unreq  SRC_WIDTH_FIXED
        /* return */
-       pop     {r4, r5, r6, r7}
+       pop     {r4, r5, r6, r7, r8, r10}
        bx      lr
 .endfunc
 .endm
index 86ed821..22bfd30 100644 (file)
@@ -1415,13 +1415,13 @@ scaled_nearest_scanline_565_565_SRC (uint16_t *       dst,
     uint16_t tmp1, tmp2, tmp3, tmp4;
     while ((w -= 4) >= 0)
     {
-       tmp1 = src[pixman_fixed_to_int (vx)];
+       tmp1 = *(src + pixman_fixed_to_int (vx));
        vx += unit_x;
-       tmp2 = src[pixman_fixed_to_int (vx)];
+       tmp2 = *(src + pixman_fixed_to_int (vx));
        vx += unit_x;
-       tmp3 = src[pixman_fixed_to_int (vx)];
+       tmp3 = *(src + pixman_fixed_to_int (vx));
        vx += unit_x;
-       tmp4 = src[pixman_fixed_to_int (vx)];
+       tmp4 = *(src + pixman_fixed_to_int (vx));
        vx += unit_x;
        *dst++ = tmp1;
        *dst++ = tmp2;
@@ -1430,15 +1430,15 @@ scaled_nearest_scanline_565_565_SRC (uint16_t *       dst,
     }
     if (w & 2)
     {
-       tmp1 = src[pixman_fixed_to_int (vx)];
+       tmp1 = *(src + pixman_fixed_to_int (vx));
        vx += unit_x;
-       tmp2 = src[pixman_fixed_to_int (vx)];
+       tmp2 = *(src + pixman_fixed_to_int (vx));
        vx += unit_x;
        *dst++ = tmp1;
        *dst++ = tmp2;
     }
     if (w & 1)
-       *dst++ = src[pixman_fixed_to_int (vx)];
+       *dst = *(src + pixman_fixed_to_int (vx));
 }
 
 FAST_NEAREST_MAINLOOP (565_565_cover_SRC,
index 3a3c658..7f2e404 100644 (file)
@@ -271,7 +271,7 @@ scanline_func_name (dst_type_t       *dst,                                                  \
                    int32_t           w,                                                        \
                    pixman_fixed_t    vx,                                                       \
                    pixman_fixed_t    unit_x,                                                   \
-                   pixman_fixed_t    max_vx,                                                   \
+                   pixman_fixed_t    src_width_fixed,                                          \
                    pixman_bool_t     fully_transparent_src)                                    \
 {                                                                                              \
        uint32_t   d;                                                                           \
@@ -287,25 +287,25 @@ scanline_func_name (dst_type_t       *dst,                                                        \
                                                                                                \
        while ((w -= 2) >= 0)                                                                   \
        {                                                                                       \
-           x1 = vx >> 16;                                                                      \
+           x1 = pixman_fixed_to_int (vx);                                                      \
            vx += unit_x;                                                                       \
            if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL)                          \
            {                                                                                   \
                /* This works because we know that unit_x is positive */                        \
-               while (vx >= max_vx)                                                            \
-                   vx -= max_vx;                                                               \
+               while (vx >= 0)                                                                 \
+                   vx -= src_width_fixed;                                                      \
            }                                                                                   \
-           s1 = src[x1];                                                                       \
+           s1 = *(src + x1);                                                                   \
                                                                                                \
-           x2 = vx >> 16;                                                                      \
+           x2 = pixman_fixed_to_int (vx);                                                      \
            vx += unit_x;                                                                       \
            if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL)                          \
            {                                                                                   \
                /* This works because we know that unit_x is positive */                        \
-               while (vx >= max_vx)                                                            \
-                   vx -= max_vx;                                                               \
+               while (vx >= 0)                                                                 \
+                   vx -= src_width_fixed;                                                      \
            }                                                                                   \
-           s2 = src[x2];                                                                       \
+           s2 = *(src + x2);                                                                   \
                                                                                                \
            if (PIXMAN_OP_ ## OP == PIXMAN_OP_OVER)                                             \
            {                                                                                   \
@@ -349,8 +349,8 @@ scanline_func_name (dst_type_t       *dst,                                                  \
                                                                                                \
        if (w & 1)                                                                              \
        {                                                                                       \
-           x1 = vx >> 16;                                                                      \
-           s1 = src[x1];                                                                       \
+           x1 = pixman_fixed_to_int (vx);                                                      \
+           s1 = *(src + x1);                                                                   \
                                                                                                \
            if (PIXMAN_OP_ ## OP == PIXMAN_OP_OVER)                                             \
            {                                                                                   \
@@ -388,7 +388,7 @@ fast_composite_scaled_nearest  ## scale_func_name (pixman_implementation_t *imp,
     mask_type_t *mask_line;                                                                    \
     src_type_t *src_first_line;                                                                        \
     int       y;                                                                               \
-    pixman_fixed_t max_vx = INT32_MAX; /* suppress uninitialized variable warning */           \
+    pixman_fixed_t src_width_fixed = pixman_int_to_fixed (src_image->bits.width);              \
     pixman_fixed_t max_vy;                                                                     \
     pixman_vector_t v;                                                                         \
     pixman_fixed_t vx, vy;                                                                     \
@@ -434,11 +434,10 @@ fast_composite_scaled_nearest  ## scale_func_name (pixman_implementation_t *imp,
                                                                                                \
     if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL)                                 \
     {                                                                                          \
-       /* Clamp repeating positions inside the actual samples */                               \
-       max_vx = src_image->bits.width << 16;                                                   \
-       max_vy = src_image->bits.height << 16;                                                  \
+       max_vy = pixman_int_to_fixed (src_image->bits.height);                                  \
                                                                                                \
-       repeat (PIXMAN_REPEAT_NORMAL, &vx, max_vx);                                             \
+       /* Clamp repeating positions inside the actual samples */                               \
+       repeat (PIXMAN_REPEAT_NORMAL, &vx, src_width_fixed);                                    \
        repeat (PIXMAN_REPEAT_NORMAL, &vy, max_vy);                                             \
     }                                                                                          \
                                                                                                \
@@ -460,7 +459,7 @@ fast_composite_scaled_nearest  ## scale_func_name (pixman_implementation_t *imp,
            mask_line += mask_stride;                                                           \
        }                                                                                       \
                                                                                                \
-       y = vy >> 16;                                                                           \
+       y = pixman_fixed_to_int (vy);                                                           \
        vy += unit_y;                                                                           \
        if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL)                              \
            repeat (PIXMAN_REPEAT_NORMAL, &vy, max_vy);                                         \
@@ -470,18 +469,21 @@ fast_composite_scaled_nearest  ## scale_func_name (pixman_implementation_t *imp,
            src = src_first_line + src_stride * y;                                              \
            if (left_pad > 0)                                                                   \
            {                                                                                   \
-               scanline_func (mask, dst, src, left_pad, 0, 0, 0, FALSE);                       \
+               scanline_func (mask, dst,                                                       \
+                              src + src_image->bits.width - src_image->bits.width + 1,         \
+                              left_pad, -pixman_fixed_e, 0, src_width_fixed, FALSE);           \
            }                                                                                   \
            if (width > 0)                                                                      \
            {                                                                                   \
                scanline_func (mask + (mask_is_solid ? 0 : left_pad),                           \
-                              dst + left_pad, src, width, vx, unit_x, 0, FALSE);               \
+                              dst + left_pad, src + src_image->bits.width, width,              \
+                              vx - src_width_fixed, unit_x, src_width_fixed, FALSE);           \
            }                                                                                   \
            if (right_pad > 0)                                                                  \
            {                                                                                   \
                scanline_func (mask + (mask_is_solid ? 0 : left_pad + width),                   \
-                              dst + left_pad + width, src + src_image->bits.width - 1,         \
-                              right_pad, 0, 0, 0, FALSE);                                      \
+                              dst + left_pad + width, src + src_image->bits.width,             \
+                              right_pad, -pixman_fixed_e, 0, src_width_fixed, FALSE);          \
            }                                                                                   \
        }                                                                                       \
        else if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NONE)                           \
@@ -489,29 +491,34 @@ fast_composite_scaled_nearest  ## scale_func_name (pixman_implementation_t *imp,
            static const src_type_t zero[1] = { 0 };                                            \
            if (y < 0 || y >= src_image->bits.height)                                           \
            {                                                                                   \
-               scanline_func (mask, dst, zero, left_pad + width + right_pad, 0, 0, 0, TRUE);   \
+               scanline_func (mask, dst, zero + 1, left_pad + width + right_pad,               \
+                              -pixman_fixed_e, 0, src_width_fixed, TRUE);                      \
                continue;                                                                       \
            }                                                                                   \
            src = src_first_line + src_stride * y;                                              \
            if (left_pad > 0)                                                                   \
            {                                                                                   \
-               scanline_func (mask, dst, zero, left_pad, 0, 0, 0, TRUE);                       \
+               scanline_func (mask, dst, zero + 1, left_pad,                                   \
+                              -pixman_fixed_e, 0, src_width_fixed, TRUE);                      \
            }                                                                                   \
            if (width > 0)                                                                      \
            {                                                                                   \
                scanline_func (mask + (mask_is_solid ? 0 : left_pad),                           \
-                              dst + left_pad, src, width, vx, unit_x, 0, FALSE);               \
+                              dst + left_pad, src + src_image->bits.width, width,              \
+                              vx - src_width_fixed, unit_x, src_width_fixed, FALSE);           \
            }                                                                                   \
            if (right_pad > 0)                                                                  \
            {                                                                                   \
                scanline_func (mask + (mask_is_solid ? 0 : left_pad + width),                   \
-                              dst + left_pad + width, zero, right_pad, 0, 0, 0, TRUE);         \
+                              dst + left_pad + width, zero + 1, right_pad,                     \
+                              -pixman_fixed_e, 0, src_width_fixed, TRUE);                      \
            }                                                                                   \
        }                                                                                       \
        else                                                                                    \
        {                                                                                       \
            src = src_first_line + src_stride * y;                                              \
-           scanline_func (mask, dst, src, width, vx, unit_x, max_vx, FALSE);                   \
+           scanline_func (mask, dst, src + src_image->bits.width, width, vx - src_width_fixed, \
+                          unit_x, src_width_fixed, FALSE);                                     \
        }                                                                                       \
     }                                                                                          \
 }
index cf21ef8..efed310 100644 (file)
@@ -5159,7 +5159,7 @@ scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t*       pd,
                                              int32_t         w,
                                              pixman_fixed_t  vx,
                                              pixman_fixed_t  unit_x,
-                                             pixman_fixed_t  max_vx,
+                                             pixman_fixed_t  src_width_fixed,
                                              pixman_bool_t   fully_transparent_src)
 {
     uint32_t s, d;
@@ -5176,8 +5176,10 @@ scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t*       pd,
     while (w && ((unsigned long)pd & 15))
     {
        d = *pd;
-       s = combine1 (ps + (vx >> 16), pm);
+       s = combine1 (ps + pixman_fixed_to_int (vx), pm);
        vx += unit_x;
+       while (vx >= 0)
+           vx -= src_width_fixed;
 
        *pd++ = core_combine_over_u_pixel_sse2 (s, d);
        if (pm)
@@ -5190,14 +5192,22 @@ scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t*       pd,
        __m128i tmp;
        uint32_t tmp1, tmp2, tmp3, tmp4;
 
-       tmp1 = ps[vx >> 16];
+       tmp1 = *(ps + pixman_fixed_to_int (vx));
        vx += unit_x;
-       tmp2 = ps[vx >> 16];
+       while (vx >= 0)
+           vx -= src_width_fixed;
+       tmp2 = *(ps + pixman_fixed_to_int (vx));
        vx += unit_x;
-       tmp3 = ps[vx >> 16];
+       while (vx >= 0)
+           vx -= src_width_fixed;
+       tmp3 = *(ps + pixman_fixed_to_int (vx));
        vx += unit_x;
-       tmp4 = ps[vx >> 16];
+       while (vx >= 0)
+           vx -= src_width_fixed;
+       tmp4 = *(ps + pixman_fixed_to_int (vx));
        vx += unit_x;
+       while (vx >= 0)
+           vx -= src_width_fixed;
 
        tmp = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);
 
@@ -5235,8 +5245,10 @@ scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t*       pd,
     while (w)
     {
        d = *pd;
-       s = combine1 (ps + (vx >> 16), pm);
+       s = combine1 (ps + pixman_fixed_to_int (vx), pm);
        vx += unit_x;
+       while (vx >= 0)
+           vx -= src_width_fixed;
 
        *pd++ = core_combine_over_u_pixel_sse2 (s, d);
        if (pm)
@@ -5255,6 +5267,9 @@ FAST_NEAREST_MAINLOOP (sse2_8888_8888_none_OVER,
 FAST_NEAREST_MAINLOOP (sse2_8888_8888_pad_OVER,
                       scaled_nearest_scanline_sse2_8888_8888_OVER,
                       uint32_t, uint32_t, PAD)
+FAST_NEAREST_MAINLOOP (sse2_8888_8888_normal_OVER,
+                      scaled_nearest_scanline_sse2_8888_8888_OVER,
+                      uint32_t, uint32_t, NORMAL)
 
 static force_inline void
 scaled_nearest_scanline_sse2_8888_n_8888_OVER (const uint32_t * mask,
@@ -5263,7 +5278,7 @@ scaled_nearest_scanline_sse2_8888_n_8888_OVER (const uint32_t * mask,
                                               int32_t          w,
                                               pixman_fixed_t   vx,
                                               pixman_fixed_t   unit_x,
-                                              pixman_fixed_t   max_vx,
+                                              pixman_fixed_t   src_width_fixed,
                                               pixman_bool_t    zero_src)
 {
     __m128i xmm_mask;
@@ -5278,8 +5293,10 @@ scaled_nearest_scanline_sse2_8888_n_8888_OVER (const uint32_t * mask,
 
     while (w && (unsigned long)dst & 15)
     {
-       uint32_t s = src[pixman_fixed_to_int (vx)];
+       uint32_t s = *(src + pixman_fixed_to_int (vx));
        vx += unit_x;
+       while (vx >= 0)
+           vx -= src_width_fixed;
 
        if (s)
        {
@@ -5301,14 +5318,22 @@ scaled_nearest_scanline_sse2_8888_n_8888_OVER (const uint32_t * mask,
     {
        uint32_t tmp1, tmp2, tmp3, tmp4;
 
-       tmp1 = src[pixman_fixed_to_int (vx)];
+       tmp1 = *(src + pixman_fixed_to_int (vx));
        vx += unit_x;
-       tmp2 = src[pixman_fixed_to_int (vx)];
+       while (vx >= 0)
+           vx -= src_width_fixed;
+       tmp2 = *(src + pixman_fixed_to_int (vx));
        vx += unit_x;
-       tmp3 = src[pixman_fixed_to_int (vx)];
+       while (vx >= 0)
+           vx -= src_width_fixed;
+       tmp3 = *(src + pixman_fixed_to_int (vx));
        vx += unit_x;
-       tmp4 = src[pixman_fixed_to_int (vx)];
+       while (vx >= 0)
+           vx -= src_width_fixed;
+       tmp4 = *(src + pixman_fixed_to_int (vx));
        vx += unit_x;
+       while (vx >= 0)
+           vx -= src_width_fixed;
 
        xmm_src = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);
 
@@ -5336,8 +5361,10 @@ scaled_nearest_scanline_sse2_8888_n_8888_OVER (const uint32_t * mask,
 
     while (w)
     {
-       uint32_t s = src[pixman_fixed_to_int (vx)];
+       uint32_t s = *(src + pixman_fixed_to_int (vx));
        vx += unit_x;
+       while (vx >= 0)
+           vx -= src_width_fixed;
 
        if (s)
        {
@@ -5367,6 +5394,9 @@ FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_pad_OVER,
 FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER,
                              scaled_nearest_scanline_sse2_8888_n_8888_OVER,
                              uint32_t, uint32_t, uint32_t, NONE, TRUE, TRUE)
+FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_normal_OVER,
+                             scaled_nearest_scanline_sse2_8888_n_8888_OVER,
+                             uint32_t, uint32_t, uint32_t, NORMAL, TRUE, TRUE)
 
 #define BMSK ((1 << BILINEAR_INTERPOLATION_BITS) - 1)
 
@@ -5856,11 +5886,19 @@ static const pixman_fast_path_t sse2_fast_paths[] =
     SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
     SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
     SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
+    SIMPLE_NEAREST_FAST_PATH_NORMAL (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
+    SIMPLE_NEAREST_FAST_PATH_NORMAL (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
+    SIMPLE_NEAREST_FAST_PATH_NORMAL (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
+    SIMPLE_NEAREST_FAST_PATH_NORMAL (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
 
     SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888),
     SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888),
     SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888),
     SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888),
+    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888),
+    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888),
+    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888),
+    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888),
 
     SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
     SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, sse2_8888_8888),