Add scaled nearest repeat fast paths

author Siarhei Siamashka <siarhei.siamashka@gmail.com>
Tue, 26 Jun 2012 02:36:52 +0000 (22:36 -0400)
committer Søren Sandmann Pedersen <ssp@redhat.com>
Wed, 26 Sep 2012 04:03:10 +0000 (00:03 -0400)

diff --git a/pixman/pixman-arm-common.h b/pixman/pixman-arm-common.h

index f56264e..3a7cb2b 100644
--- a/pixman/pixman-arm-common.h
+++ b/pixman/pixman-arm-common.h
@@ -236,7 +236,8 @@ pixman_scaled_nearest_scanline_##name##_##op##_asm_##cputype (                \
                                                     dst_type *       dst,      \
                                                     const src_type * src,      \
                                                     pixman_fixed_t   vx,       \
-                                                   pixman_fixed_t   unit_x);  \
+                                                   pixman_fixed_t   unit_x,   \
+                                                   pixman_fixed_t   max_vx);  \
                                                                                \
  static force_inline void                                                      \
  scaled_nearest_scanline_##cputype##_##name##_##op (dst_type *       pd,       \
@@ -248,7 +249,8 @@ scaled_nearest_scanline_##cputype##_##name##_##op (dst_type *       pd,       \
                                                     pixman_bool_t    zero_src) \
  {                                                                             \
      pixman_scaled_nearest_scanline_##name##_##op##_asm_##cputype (w, pd, ps,  \
-                                                                  vx, unit_x);\
+                                                                  vx, unit_x, \
+                                                                  max_vx);    \
  }                                                                             \
                                                                                \
  FAST_NEAREST_MAINLOOP (cputype##_##name##_cover_##op,                         \
@@ -259,13 +261,17 @@ FAST_NEAREST_MAINLOOP (cputype##_##name##_none_##op,                          \
                         src_type, dst_type, NONE)                              \
  FAST_NEAREST_MAINLOOP (cputype##_##name##_pad_##op,                           \
                         scaled_nearest_scanline_##cputype##_##name##_##op,     \
-                       src_type, dst_type, PAD)
+                       src_type, dst_type, PAD)                               \
+FAST_NEAREST_MAINLOOP (cputype##_##name##_normal_##op,                        \
+                       scaled_nearest_scanline_##cputype##_##name##_##op,     \
+                       src_type, dst_type, NORMAL)
  
  /* Provide entries for the fast path table */
  #define PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH(op,s,d,func)                      \
      SIMPLE_NEAREST_FAST_PATH_COVER (op,s,d,func),                             \
      SIMPLE_NEAREST_FAST_PATH_NONE (op,s,d,func),                              \
-    SIMPLE_NEAREST_FAST_PATH_PAD (op,s,d,func)
+    SIMPLE_NEAREST_FAST_PATH_PAD (op,s,d,func),                               \
+    SIMPLE_NEAREST_FAST_PATH_NORMAL (op,s,d,func)
  
  #define PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_A8_DST(flags, cputype, name, op,   \
                                                    src_type, dst_type)         \
@@ -276,6 +282,7 @@ pixman_scaled_nearest_scanline_##name##_##op##_asm_##cputype (                \
                                                     const src_type * src,      \
                                                     pixman_fixed_t   vx,       \
                                                     pixman_fixed_t   unit_x,   \
+                                                   pixman_fixed_t   max_vx,   \
                                                     const uint8_t *  mask);    \
                                                                                \
  static force_inline void                                                      \
@@ -292,6 +299,7 @@ scaled_nearest_scanline_##cputype##_##name##_##op (const uint8_t *  mask,     \
         return;                                                               \
      pixman_scaled_nearest_scanline_##name##_##op##_asm_##cputype (w, pd, ps,  \
                                                                    vx, unit_x, \
+                                                                  max_vx,     \
                                                                    mask);      \
  }                                                                             \
                                                                                \
@@ -303,13 +311,17 @@ FAST_NEAREST_MAINLOOP_COMMON (cputype##_##name##_none_##op,                   \
                                src_type, uint8_t, dst_type, NONE, TRUE, FALSE) \
  FAST_NEAREST_MAINLOOP_COMMON (cputype##_##name##_pad_##op,                    \
                                scaled_nearest_scanline_##cputype##_##name##_##op,\
-                              src_type, uint8_t, dst_type, PAD, TRUE, FALSE)
+                              src_type, uint8_t, dst_type, PAD, TRUE, FALSE)  \
+FAST_NEAREST_MAINLOOP_COMMON (cputype##_##name##_normal_##op,                 \
+                              scaled_nearest_scanline_##cputype##_##name##_##op,\
+                              src_type, uint8_t, dst_type, NORMAL, TRUE, FALSE)
  
  /* Provide entries for the fast path table */
  #define PIXMAN_ARM_SIMPLE_NEAREST_A8_MASK_FAST_PATH(op,s,d,func)              \
      SIMPLE_NEAREST_A8_MASK_FAST_PATH_COVER (op,s,d,func),                     \
      SIMPLE_NEAREST_A8_MASK_FAST_PATH_NONE (op,s,d,func),                      \
-    SIMPLE_NEAREST_A8_MASK_FAST_PATH_PAD (op,s,d,func)
+    SIMPLE_NEAREST_A8_MASK_FAST_PATH_PAD (op,s,d,func),                       \
+    SIMPLE_NEAREST_A8_MASK_FAST_PATH_NORMAL (op,s,d,func)
  
  /*****************************************************************************/
  
diff --git a/pixman/pixman-arm-neon-asm.h b/pixman/pixman-arm-neon-asm.h

index 97adc6a..1673b08 100644
--- a/pixman/pixman-arm-neon-asm.h
+++ b/pixman/pixman-arm-neon-asm.h
@@ -212,27 +212,39 @@
  .macro pixld1_s elem_size, reg1, mem_operand
  .if elem_size == 16
      mov     TMP1, VX, asr #16
-    add     VX, VX, UNIT_X
+    adds    VX, VX, UNIT_X
+5:  subpls  VX, VX, SRC_WIDTH_FIXED
+    bpl     5b
      add     TMP1, mem_operand, TMP1, asl #1
      mov     TMP2, VX, asr #16
-    add     VX, VX, UNIT_X
+    adds    VX, VX, UNIT_X
+5:  subpls  VX, VX, SRC_WIDTH_FIXED
+    bpl     5b
      add     TMP2, mem_operand, TMP2, asl #1
      vld1.16 {d&reg1&[0]}, [TMP1, :16]
      mov     TMP1, VX, asr #16
-    add     VX, VX, UNIT_X
+    adds    VX, VX, UNIT_X
+5:  subpls  VX, VX, SRC_WIDTH_FIXED
+    bpl     5b
      add     TMP1, mem_operand, TMP1, asl #1
      vld1.16 {d&reg1&[1]}, [TMP2, :16]
      mov     TMP2, VX, asr #16
-    add     VX, VX, UNIT_X
+    adds    VX, VX, UNIT_X
+5:  subpls  VX, VX, SRC_WIDTH_FIXED
+    bpl     5b
      add     TMP2, mem_operand, TMP2, asl #1
      vld1.16 {d&reg1&[2]}, [TMP1, :16]
      vld1.16 {d&reg1&[3]}, [TMP2, :16]
  .elseif elem_size == 32
      mov     TMP1, VX, asr #16
-    add     VX, VX, UNIT_X
+    adds    VX, VX, UNIT_X
+5:  subpls  VX, VX, SRC_WIDTH_FIXED
+    bpl     5b
      add     TMP1, mem_operand, TMP1, asl #2
      mov     TMP2, VX, asr #16
-    add     VX, VX, UNIT_X
+    adds    VX, VX, UNIT_X
+5:  subpls  VX, VX, SRC_WIDTH_FIXED
+    bpl     5b
      add     TMP2, mem_operand, TMP2, asl #2
      vld1.32 {d&reg1&[0]}, [TMP1, :32]
      vld1.32 {d&reg1&[1]}, [TMP2, :32]
@@ -242,7 +254,7 @@
  .endm
  
  .macro pixld2_s elem_size, reg1, reg2, mem_operand
-.if elem_size == 32
+.if 0 /* elem_size == 32 */
      mov     TMP1, VX, asr #16
      add     VX, VX, UNIT_X, asl #1
      add     TMP1, mem_operand, TMP1, asl #2
@@ -268,12 +280,16 @@
  .macro pixld0_s elem_size, reg1, idx, mem_operand
  .if elem_size == 16
      mov     TMP1, VX, asr #16
-    add     VX, VX, UNIT_X
+    adds    VX, VX, UNIT_X
+5:  subpls  VX, VX, SRC_WIDTH_FIXED
+    bpl     5b
      add     TMP1, mem_operand, TMP1, asl #1
      vld1.16 {d&reg1&[idx]}, [TMP1, :16]
  .elseif elem_size == 32
      mov     TMP1, VX, asr #16
-    add     VX, VX, UNIT_X
+    adds    VX, VX, UNIT_X
+5:  subpls  VX, VX, SRC_WIDTH_FIXED
+    bpl     5b
      add     TMP1, mem_operand, TMP1, asl #2
      vld1.32 {d&reg1&[idx]}, [TMP1, :32]
  .endif
@@ -964,15 +980,17 @@ fname:
      TMP1        .req        r4
      TMP2        .req        r5
      DST_R       .req        r6
+    SRC_WIDTH_FIXED .req        r7
  
      .macro pixld_src x:vararg
          pixld_s x
      .endm
  
      ldr         UNIT_X, [sp]
-    push        {r4-r6, lr}
+    push        {r4-r8, lr}
+    ldr         SRC_WIDTH_FIXED, [sp, #(24 + 4)]
      .if mask_bpp != 0
-    ldr         MASK, [sp, #(16 + 4)]
+    ldr         MASK, [sp, #(24 + 8)]
      .endif
  .else
      /*
@@ -1044,7 +1062,7 @@ fname:
  
      cleanup
  .if use_nearest_scaling != 0
-    pop         {r4-r6, pc}  /* exit */
+    pop         {r4-r8, pc}  /* exit */
  .else
      bx          lr  /* exit */
  .endif
@@ -1058,7 +1076,7 @@ fname:
      cleanup
  
  .if use_nearest_scaling != 0
-    pop         {r4-r6, pc}  /* exit */
+    pop         {r4-r8, pc}  /* exit */
  
      .unreq      DST_R
      .unreq      SRC
@@ -1069,6 +1087,7 @@ fname:
      .unreq      TMP2
      .unreq      DST_W
      .unreq      MASK
+    .unreq      SRC_WIDTH_FIXED
  
  .else
      bx          lr  /* exit */
diff --git a/pixman/pixman-arm-simd-asm.S b/pixman/pixman-arm-simd-asm.S

index 8fe1b50..b438001 100644
--- a/pixman/pixman-arm-simd-asm.S
+++ b/pixman/pixman-arm-simd-asm.S
@@ -355,49 +355,57 @@ pixman_asm_function pixman_composite_over_n_8_8888_asm_armv6
                                        prefetch_braking_distance
  
  pixman_asm_function fname
-       W       .req    r0
-       DST     .req    r1
-       SRC     .req    r2
-       VX      .req    r3
-       UNIT_X  .req    ip
-       TMP1    .req    r4
-       TMP2    .req    r5
-       VXMASK  .req    r6
-       PF_OFFS .req    r7
+       W               .req    r0
+       DST             .req    r1
+       SRC             .req    r2
+       VX              .req    r3
+       UNIT_X          .req    ip
+       TMP1            .req    r4
+       TMP2            .req    r5
+       VXMASK          .req    r6
+       PF_OFFS         .req    r7
+       SRC_WIDTH_FIXED .req    r8
  
         ldr     UNIT_X, [sp]
-       push    {r4, r5, r6, r7}
+       push    {r4, r5, r6, r7, r8, r10}
         mvn     VXMASK, #((1 << bpp_shift) - 1)
+       ldr     SRC_WIDTH_FIXED, [sp, #28]
  
         /* define helper macro */
         .macro  scale_2_pixels
                 ldr&t   TMP1, [SRC, TMP1]
-               and     TMP2, VXMASK, VX, lsr #(16 - bpp_shift)
-               add     VX, VX, UNIT_X
+               and     TMP2, VXMASK, VX, asr #(16 - bpp_shift)
+               adds    VX, VX, UNIT_X
                 str&t   TMP1, [DST], #(1 << bpp_shift)
+9:             subpls  VX, VX, SRC_WIDTH_FIXED
+               bpl     9b
  
                 ldr&t   TMP2, [SRC, TMP2]
-               and     TMP1, VXMASK, VX, lsr #(16 - bpp_shift)
-               add     VX, VX, UNIT_X
+               and     TMP1, VXMASK, VX, asr #(16 - bpp_shift)
+               adds    VX, VX, UNIT_X
                 str&t   TMP2, [DST], #(1 << bpp_shift)
+9:             subpls  VX, VX, SRC_WIDTH_FIXED
+               bpl     9b
         .endm
  
         /* now do the scaling */
-       and     TMP1, VXMASK, VX, lsr #(16 - bpp_shift)
-       add     VX, VX, UNIT_X
+       and     TMP1, VXMASK, VX, asr #(16 - bpp_shift)
+       adds    VX, VX, UNIT_X
+9:     subpls  VX, VX, SRC_WIDTH_FIXED
+       bpl     9b
         subs    W, W, #(8 + prefetch_braking_distance)
         blt     2f
         /* calculate prefetch offset */
         mov     PF_OFFS, #prefetch_distance
         mla     PF_OFFS, UNIT_X, PF_OFFS, VX
  1:     /* main loop, process 8 pixels per iteration with prefetch */
-       subs    W, W, #8
+       pld     [SRC, PF_OFFS, asr #(16 - bpp_shift)]
         add     PF_OFFS, UNIT_X, lsl #3
         scale_2_pixels
         scale_2_pixels
         scale_2_pixels
         scale_2_pixels
-       pld     [SRC, PF_OFFS, lsr #(16 - bpp_shift)]
+       subs    W, W, #8
         bge     1b
  2:
         subs    W, W, #(4 - 8 - prefetch_braking_distance)
@@ -426,8 +434,9 @@ pixman_asm_function fname
         .unreq  TMP2
         .unreq  VXMASK
         .unreq  PF_OFFS
+       .unreq  SRC_WIDTH_FIXED
         /* return */
-       pop     {r4, r5, r6, r7}
+       pop     {r4, r5, r6, r7, r8, r10}
         bx      lr
  .endfunc
  .endm
diff --git a/pixman/pixman-fast-path.c b/pixman/pixman-fast-path.c

index 86ed821..22bfd30 100644
--- a/pixman/pixman-fast-path.c
+++ b/pixman/pixman-fast-path.c
@@ -1415,13 +1415,13 @@ scaled_nearest_scanline_565_565_SRC (uint16_t *       dst,
      uint16_t tmp1, tmp2, tmp3, tmp4;
      while ((w -= 4) >= 0)
      {
-       tmp1 = src[pixman_fixed_to_int (vx)];
+       tmp1 = *(src + pixman_fixed_to_int (vx));
         vx += unit_x;
-       tmp2 = src[pixman_fixed_to_int (vx)];
+       tmp2 = *(src + pixman_fixed_to_int (vx));
         vx += unit_x;
-       tmp3 = src[pixman_fixed_to_int (vx)];
+       tmp3 = *(src + pixman_fixed_to_int (vx));
         vx += unit_x;
-       tmp4 = src[pixman_fixed_to_int (vx)];
+       tmp4 = *(src + pixman_fixed_to_int (vx));
         vx += unit_x;
         *dst++ = tmp1;
         *dst++ = tmp2;
@@ -1430,15 +1430,15 @@ scaled_nearest_scanline_565_565_SRC (uint16_t *       dst,
      }
      if (w & 2)
      {
-       tmp1 = src[pixman_fixed_to_int (vx)];
+       tmp1 = *(src + pixman_fixed_to_int (vx));
         vx += unit_x;
-       tmp2 = src[pixman_fixed_to_int (vx)];
+       tmp2 = *(src + pixman_fixed_to_int (vx));
         vx += unit_x;
         *dst++ = tmp1;
         *dst++ = tmp2;
      }
      if (w & 1)
-       *dst++ = src[pixman_fixed_to_int (vx)];
+       *dst = *(src + pixman_fixed_to_int (vx));
  }
  
  FAST_NEAREST_MAINLOOP (565_565_cover_SRC,
diff --git a/pixman/pixman-inlines.h b/pixman/pixman-inlines.h

index 3a3c658..7f2e404 100644
--- a/pixman/pixman-inlines.h
+++ b/pixman/pixman-inlines.h
@@ -271,7 +271,7 @@ scanline_func_name (dst_type_t       *dst,                                                  \
                     int32_t           w,                                                        \
                     pixman_fixed_t    vx,                                                       \
                     pixman_fixed_t    unit_x,                                                   \
-                   pixman_fixed_t    max_vx,                                                   \
+                   pixman_fixed_t    src_width_fixed,                                          \
                     pixman_bool_t     fully_transparent_src)                                    \
  {                                                                                              \
         uint32_t   d;                                                                           \
@@ -287,25 +287,25 @@ scanline_func_name (dst_type_t       *dst,                                                        \
                                                                                                 \
         while ((w -= 2) >= 0)                                                                   \
         {                                                                                       \
-           x1 = vx >> 16;                                                                      \
+           x1 = pixman_fixed_to_int (vx);                                                      \
             vx += unit_x;                                                                       \
             if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL)                          \
             {                                                                                   \
                 /* This works because we know that unit_x is positive */                        \
-               while (vx >= max_vx)                                                            \
-                   vx -= max_vx;                                                               \
+               while (vx >= 0)                                                                 \
+                   vx -= src_width_fixed;                                                      \
             }                                                                                   \
-           s1 = src[x1];                                                                       \
+           s1 = *(src + x1);                                                                   \
                                                                                                 \
-           x2 = vx >> 16;                                                                      \
+           x2 = pixman_fixed_to_int (vx);                                                      \
             vx += unit_x;                                                                       \
             if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL)                          \
             {                                                                                   \
                 /* This works because we know that unit_x is positive */                        \
-               while (vx >= max_vx)                                                            \
-                   vx -= max_vx;                                                               \
+               while (vx >= 0)                                                                 \
+                   vx -= src_width_fixed;                                                      \
             }                                                                                   \
-           s2 = src[x2];                                                                       \
+           s2 = *(src + x2);                                                                   \
                                                                                                 \
             if (PIXMAN_OP_ ## OP == PIXMAN_OP_OVER)                                             \
             {                                                                                   \
@@ -349,8 +349,8 @@ scanline_func_name (dst_type_t       *dst,                                                  \
                                                                                                 \
         if (w & 1)                                                                              \
         {                                                                                       \
-           x1 = vx >> 16;                                                                      \
-           s1 = src[x1];                                                                       \
+           x1 = pixman_fixed_to_int (vx);                                                      \
+           s1 = *(src + x1);                                                                   \
                                                                                                 \
             if (PIXMAN_OP_ ## OP == PIXMAN_OP_OVER)                                             \
             {                                                                                   \
@@ -388,7 +388,7 @@ fast_composite_scaled_nearest  ## scale_func_name (pixman_implementation_t *imp,
      mask_type_t *mask_line;                                                                    \
      src_type_t *src_first_line;                                                                        \
      int       y;                                                                               \
-    pixman_fixed_t max_vx = INT32_MAX; /* suppress uninitialized variable warning */           \
+    pixman_fixed_t src_width_fixed = pixman_int_to_fixed (src_image->bits.width);              \
      pixman_fixed_t max_vy;                                                                     \
      pixman_vector_t v;                                                                         \
      pixman_fixed_t vx, vy;                                                                     \
@@ -434,11 +434,10 @@ fast_composite_scaled_nearest  ## scale_func_name (pixman_implementation_t *imp,
                                                                                                 \
      if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL)                                 \
      {                                                                                          \
-       /* Clamp repeating positions inside the actual samples */                               \
-       max_vx = src_image->bits.width << 16;                                                   \
-       max_vy = src_image->bits.height << 16;                                                  \
+       max_vy = pixman_int_to_fixed (src_image->bits.height);                                  \
                                                                                                 \
-       repeat (PIXMAN_REPEAT_NORMAL, &vx, max_vx);                                             \
+       /* Clamp repeating positions inside the actual samples */                               \
+       repeat (PIXMAN_REPEAT_NORMAL, &vx, src_width_fixed);                                    \
         repeat (PIXMAN_REPEAT_NORMAL, &vy, max_vy);                                             \
      }                                                                                          \
                                                                                                 \
@@ -460,7 +459,7 @@ fast_composite_scaled_nearest  ## scale_func_name (pixman_implementation_t *imp,
             mask_line += mask_stride;                                                           \
         }                                                                                       \
                                                                                                 \
-       y = vy >> 16;                                                                           \
+       y = pixman_fixed_to_int (vy);                                                           \
         vy += unit_y;                                                                           \
         if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL)                              \
             repeat (PIXMAN_REPEAT_NORMAL, &vy, max_vy);                                         \
@@ -470,18 +469,21 @@ fast_composite_scaled_nearest  ## scale_func_name (pixman_implementation_t *imp,
             src = src_first_line + src_stride * y;                                              \
             if (left_pad > 0)                                                                   \
             {                                                                                   \
-               scanline_func (mask, dst, src, left_pad, 0, 0, 0, FALSE);                       \
+               scanline_func (mask, dst,                                                       \
+                              src + src_image->bits.width - src_image->bits.width + 1,         \
+                              left_pad, -pixman_fixed_e, 0, src_width_fixed, FALSE);           \
             }                                                                                   \
             if (width > 0)                                                                      \
             {                                                                                   \
                 scanline_func (mask + (mask_is_solid ? 0 : left_pad),                           \
-                              dst + left_pad, src, width, vx, unit_x, 0, FALSE);               \
+                              dst + left_pad, src + src_image->bits.width, width,              \
+                              vx - src_width_fixed, unit_x, src_width_fixed, FALSE);           \
             }                                                                                   \
             if (right_pad > 0)                                                                  \
             {                                                                                   \
                 scanline_func (mask + (mask_is_solid ? 0 : left_pad + width),                   \
-                              dst + left_pad + width, src + src_image->bits.width - 1,         \
-                              right_pad, 0, 0, 0, FALSE);                                      \
+                              dst + left_pad + width, src + src_image->bits.width,             \
+                              right_pad, -pixman_fixed_e, 0, src_width_fixed, FALSE);          \
             }                                                                                   \
         }                                                                                       \
         else if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NONE)                           \
@@ -489,29 +491,34 @@ fast_composite_scaled_nearest  ## scale_func_name (pixman_implementation_t *imp,
             static const src_type_t zero[1] = { 0 };                                            \
             if (y < 0 || y >= src_image->bits.height)                                           \
             {                                                                                   \
-               scanline_func (mask, dst, zero, left_pad + width + right_pad, 0, 0, 0, TRUE);   \
+               scanline_func (mask, dst, zero + 1, left_pad + width + right_pad,               \
+                              -pixman_fixed_e, 0, src_width_fixed, TRUE);                      \
                 continue;                                                                       \
             }                                                                                   \
             src = src_first_line + src_stride * y;                                              \
             if (left_pad > 0)                                                                   \
             {                                                                                   \
-               scanline_func (mask, dst, zero, left_pad, 0, 0, 0, TRUE);                       \
+               scanline_func (mask, dst, zero + 1, left_pad,                                   \
+                              -pixman_fixed_e, 0, src_width_fixed, TRUE);                      \
             }                                                                                   \
             if (width > 0)                                                                      \
             {                                                                                   \
                 scanline_func (mask + (mask_is_solid ? 0 : left_pad),                           \
-                              dst + left_pad, src, width, vx, unit_x, 0, FALSE);               \
+                              dst + left_pad, src + src_image->bits.width, width,              \
+                              vx - src_width_fixed, unit_x, src_width_fixed, FALSE);           \
             }                                                                                   \
             if (right_pad > 0)                                                                  \
             {                                                                                   \
                 scanline_func (mask + (mask_is_solid ? 0 : left_pad + width),                   \
-                              dst + left_pad + width, zero, right_pad, 0, 0, 0, TRUE);         \
+                              dst + left_pad + width, zero + 1, right_pad,                     \
+                              -pixman_fixed_e, 0, src_width_fixed, TRUE);                      \
             }                                                                                   \
         }                                                                                       \
         else                                                                                    \
         {                                                                                       \
             src = src_first_line + src_stride * y;                                              \
-           scanline_func (mask, dst, src, width, vx, unit_x, max_vx, FALSE);                   \
+           scanline_func (mask, dst, src + src_image->bits.width, width, vx - src_width_fixed, \
+                          unit_x, src_width_fixed, FALSE);                                     \
         }                                                                                       \
      }                                                                                          \
  }
diff --git a/pixman/pixman-sse2.c b/pixman/pixman-sse2.c

index cf21ef8..efed310 100644
--- a/pixman/pixman-sse2.c
+++ b/pixman/pixman-sse2.c
@@ -5159,7 +5159,7 @@ scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t*       pd,
                                               int32_t         w,
                                               pixman_fixed_t  vx,
                                               pixman_fixed_t  unit_x,
-                                             pixman_fixed_t  max_vx,
+                                             pixman_fixed_t  src_width_fixed,
                                               pixman_bool_t   fully_transparent_src)
  {
      uint32_t s, d;
@@ -5176,8 +5176,10 @@ scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t*       pd,
      while (w && ((unsigned long)pd & 15))
      {
         d = *pd;
-       s = combine1 (ps + (vx >> 16), pm);
+       s = combine1 (ps + pixman_fixed_to_int (vx), pm);
         vx += unit_x;
+       while (vx >= 0)
+           vx -= src_width_fixed;
  
         *pd++ = core_combine_over_u_pixel_sse2 (s, d);
         if (pm)
@@ -5190,14 +5192,22 @@ scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t*       pd,
         __m128i tmp;
         uint32_t tmp1, tmp2, tmp3, tmp4;
  
-       tmp1 = ps[vx >> 16];
+       tmp1 = *(ps + pixman_fixed_to_int (vx));
         vx += unit_x;
-       tmp2 = ps[vx >> 16];
+       while (vx >= 0)
+           vx -= src_width_fixed;
+       tmp2 = *(ps + pixman_fixed_to_int (vx));
         vx += unit_x;
-       tmp3 = ps[vx >> 16];
+       while (vx >= 0)
+           vx -= src_width_fixed;
+       tmp3 = *(ps + pixman_fixed_to_int (vx));
         vx += unit_x;
-       tmp4 = ps[vx >> 16];
+       while (vx >= 0)
+           vx -= src_width_fixed;
+       tmp4 = *(ps + pixman_fixed_to_int (vx));
         vx += unit_x;
+       while (vx >= 0)
+           vx -= src_width_fixed;
  
         tmp = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);
  
@@ -5235,8 +5245,10 @@ scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t*       pd,
      while (w)
      {
         d = *pd;
-       s = combine1 (ps + (vx >> 16), pm);
+       s = combine1 (ps + pixman_fixed_to_int (vx), pm);
         vx += unit_x;
+       while (vx >= 0)
+           vx -= src_width_fixed;
  
         *pd++ = core_combine_over_u_pixel_sse2 (s, d);
         if (pm)
@@ -5255,6 +5267,9 @@ FAST_NEAREST_MAINLOOP (sse2_8888_8888_none_OVER,
  FAST_NEAREST_MAINLOOP (sse2_8888_8888_pad_OVER,
                        scaled_nearest_scanline_sse2_8888_8888_OVER,
                        uint32_t, uint32_t, PAD)
+FAST_NEAREST_MAINLOOP (sse2_8888_8888_normal_OVER,
+                      scaled_nearest_scanline_sse2_8888_8888_OVER,
+                      uint32_t, uint32_t, NORMAL)
  
  static force_inline void
  scaled_nearest_scanline_sse2_8888_n_8888_OVER (const uint32_t * mask,
@@ -5263,7 +5278,7 @@ scaled_nearest_scanline_sse2_8888_n_8888_OVER (const uint32_t * mask,
                                                int32_t          w,
                                                pixman_fixed_t   vx,
                                                pixman_fixed_t   unit_x,
-                                              pixman_fixed_t   max_vx,
+                                              pixman_fixed_t   src_width_fixed,
                                                pixman_bool_t    zero_src)
  {
      __m128i xmm_mask;
@@ -5278,8 +5293,10 @@ scaled_nearest_scanline_sse2_8888_n_8888_OVER (const uint32_t * mask,
  
      while (w && (unsigned long)dst & 15)
      {
-       uint32_t s = src[pixman_fixed_to_int (vx)];
+       uint32_t s = *(src + pixman_fixed_to_int (vx));
         vx += unit_x;
+       while (vx >= 0)
+           vx -= src_width_fixed;
  
         if (s)
         {
@@ -5301,14 +5318,22 @@ scaled_nearest_scanline_sse2_8888_n_8888_OVER (const uint32_t * mask,
      {
         uint32_t tmp1, tmp2, tmp3, tmp4;
  
-       tmp1 = src[pixman_fixed_to_int (vx)];
+       tmp1 = *(src + pixman_fixed_to_int (vx));
         vx += unit_x;
-       tmp2 = src[pixman_fixed_to_int (vx)];
+       while (vx >= 0)
+           vx -= src_width_fixed;
+       tmp2 = *(src + pixman_fixed_to_int (vx));
         vx += unit_x;
-       tmp3 = src[pixman_fixed_to_int (vx)];
+       while (vx >= 0)
+           vx -= src_width_fixed;
+       tmp3 = *(src + pixman_fixed_to_int (vx));
         vx += unit_x;
-       tmp4 = src[pixman_fixed_to_int (vx)];
+       while (vx >= 0)
+           vx -= src_width_fixed;
+       tmp4 = *(src + pixman_fixed_to_int (vx));
         vx += unit_x;
+       while (vx >= 0)
+           vx -= src_width_fixed;
  
         xmm_src = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);
  
@@ -5336,8 +5361,10 @@ scaled_nearest_scanline_sse2_8888_n_8888_OVER (const uint32_t * mask,
  
      while (w)
      {
-       uint32_t s = src[pixman_fixed_to_int (vx)];
+       uint32_t s = *(src + pixman_fixed_to_int (vx));
         vx += unit_x;
+       while (vx >= 0)
+           vx -= src_width_fixed;
  
         if (s)
         {
@@ -5367,6 +5394,9 @@ FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_pad_OVER,
  FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER,
                               scaled_nearest_scanline_sse2_8888_n_8888_OVER,
                               uint32_t, uint32_t, uint32_t, NONE, TRUE, TRUE)
+FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_normal_OVER,
+                             scaled_nearest_scanline_sse2_8888_n_8888_OVER,
+                             uint32_t, uint32_t, uint32_t, NORMAL, TRUE, TRUE)
  
  #define BMSK ((1 << BILINEAR_INTERPOLATION_BITS) - 1)
  
@@ -5856,11 +5886,19 @@ static const pixman_fast_path_t sse2_fast_paths[] =
      SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
      SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
      SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
+    SIMPLE_NEAREST_FAST_PATH_NORMAL (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
+    SIMPLE_NEAREST_FAST_PATH_NORMAL (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
+    SIMPLE_NEAREST_FAST_PATH_NORMAL (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
+    SIMPLE_NEAREST_FAST_PATH_NORMAL (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
  
      SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888),
      SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888),
      SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888),
      SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888),
+    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888),
+    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888),
+    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888),
+    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888),
  
      SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
      SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
author	Siarhei Siamashka <siarhei.siamashka@gmail.com>
	Tue, 26 Jun 2012 02:36:52 +0000 (22:36 -0400)
committer	Søren Sandmann Pedersen <ssp@redhat.com>
	Wed, 26 Sep 2012 04:03:10 +0000 (00:03 -0400)
pixman/pixman-arm-common.h		patch \| blob \| history
pixman/pixman-arm-neon-asm.h		patch \| blob \| history
pixman/pixman-arm-simd-asm.S		patch \| blob \| history
pixman/pixman-fast-path.c		patch \| blob \| history
pixman/pixman-inlines.h		patch \| blob \| history
pixman/pixman-sse2.c		patch \| blob \| history