Add sse2 version of add_n_8888_8888_ca()
authorSøren Sandmann Pedersen <sandmann@redhat.com>
Fri, 31 Jul 2009 14:39:41 +0000 (10:39 -0400)
committerSøren Sandmann Pedersen <sandmann@redhat.com>
Sat, 8 Aug 2009 20:52:38 +0000 (16:52 -0400)
pixman/pixman-sse2.c

index 2fa956e..e2a39a8 100644 (file)
@@ -3098,6 +3098,152 @@ sse2_composite_over_n_0565 (pixman_implementation_t *imp,
     _mm_empty ();
 }
 
+/* ---------------------------------------------------------------------------
+ * composite_add_n_8888_8888_ca
+ */
+static void
+sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
+                                  pixman_op_t              op,
+                                  pixman_image_t *         src_image,
+                                  pixman_image_t *         mask_image,
+                                  pixman_image_t *         dst_image,
+                                  int32_t                  src_x,
+                                  int32_t                  src_y,
+                                  int32_t                  mask_x,
+                                  int32_t                  mask_y,
+                                  int32_t                  dest_x,
+                                  int32_t                  dest_y,
+                                  int32_t                  width,
+                                  int32_t                  height)
+{
+    uint32_t src, srca;
+    uint32_t    *dst_line, d;
+    uint32_t    *mask_line, m;
+    uint32_t pack_cmp;
+    int dst_stride, mask_stride;
+
+    __m128i xmm_src, xmm_alpha;
+    __m128i xmm_dst;
+    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
+
+    __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
+
+    src = _pixman_image_get_solid (src_image, dst_image->bits.format);
+    srca = src >> 24;
+
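+    /* a zero source contributes nothing under ADD, so return early */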
+    if (src == 0)
+       return;
+
+    PIXMAN_IMAGE_GET_LINE (
+       dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+       mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
+
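+    /* expand the packed solid source to 16 bits per channel for the multiplies below */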
+    xmm_src = _mm_unpacklo_epi8 (
+       create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
+    xmm_alpha = expand_alpha_1x128 (xmm_src);
+    mmx_src   = _mm_movepi64_pi64 (xmm_src);
+    mmx_alpha = _mm_movepi64_pi64 (xmm_alpha);
+
+    while (height--)
+    {
+       int w = width;
+       const uint32_t *pm = (uint32_t *)mask_line;
+       uint32_t *pd = (uint32_t *)dst_line;
+
+       dst_line += dst_stride;
+       mask_line += mask_stride;
+
+       /* issue prefetch hints for the destination and mask lines */
+       cache_prefetch ((__m128i*)pd);
+       cache_prefetch ((__m128i*)pm);
+
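+       /* handle leading pixels one at a time until dst is 16-byte aligned */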
+       while (w && (unsigned long)pd & 15)
+       {
+           m = *pm++;
+
+           if (m)
+           {
+               d = *pd;
+
+               mmx_mask = unpack_32_1x64 (m);
+               mmx_dest = unpack_32_1x64 (d);
+
+               *pd = pack_1x64_32 (
+                   _mm_adds_pu8 (pix_multiply_1x64 (mmx_mask, mmx_src), mmx_dest));
+           }
+
+           pd++;
+           w--;
+       }
+
+       /* issue prefetch hints for the destination and mask lines */
+       cache_prefetch ((__m128i*)pd);
+       cache_prefetch ((__m128i*)pm);
+
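+       /* main loop: process four pixels per iteration with 128-bit operations */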
+       while (w >= 4)
+       {
+           /* prefetch the next cache lines of dst and mask */
+           cache_prefetch_next ((__m128i*)pd);
+           cache_prefetch_next ((__m128i*)pm);
+
+           xmm_mask = load_128_unaligned ((__m128i*)pm);
+
+           pack_cmp =
+               _mm_movemask_epi8 (
+                   _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
+
+           /* if all four mask pixels are zero, pack_cmp equals 0xffff and the store can be skipped */
+           if (pack_cmp != 0xffff)
+           {
+               xmm_dst = load_128_aligned ((__m128i*)pd);
+
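+               /* per-channel multiply of src by the mask, then a saturating add into dst */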
+               unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+
+               pix_multiply_2x128 (&xmm_src, &xmm_src,
+                                   &xmm_mask_lo, &xmm_mask_hi,
+                                   &xmm_mask_lo, &xmm_mask_hi);
+               xmm_mask_hi = pack_2x128_128 (xmm_mask_lo, xmm_mask_hi);
+
+               save_128_aligned (
+                   (__m128i*)pd, _mm_adds_epu8 (xmm_mask_hi, xmm_dst));
+           }
+
+           pd += 4;
+           pm += 4;
+           w -= 4;
+       }
+
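+       /* handle the remaining (up to three) trailing pixels one at a time */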
+       while (w)
+       {
+           m = *pm++;
+
+           if (m)
+           {
+               d = *pd;
+
+               mmx_mask = unpack_32_1x64 (m);
+               mmx_dest = unpack_32_1x64 (d);
+
+               *pd = pack_1x64_32 (
+                   _mm_adds_pu8 (pix_multiply_1x64 (mmx_mask, mmx_src), mmx_dest));
+           }
+
+           pd++;
+           w--;
+       }
+    }
+
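+    /* clear the MMX state (EMMS), since the single-pixel paths used __m64 registers */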
+    _mm_empty ();
+}
+
 /* ---------------------------------------------------------------------------
  * composite_over_n_8888_8888_ca
  */
@@ -5502,6 +5648,7 @@ static const pixman_fast_path_t sse2_fast_paths[] =
     { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_null,     PIXMAN_x8r8g8b8, sse2_composite_copy_area,           0 },
     { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_null,     PIXMAN_x8b8g8r8, sse2_composite_copy_area,           0 },
 
+    { PIXMAN_OP_ADD,  PIXMAN_solid,    PIXMAN_a8r8g8b8, PIXMAN_a8r8g8b8, sse2_composite_add_n_8888_8888_ca,  NEED_COMPONENT_ALPHA },
     { PIXMAN_OP_ADD,  PIXMAN_a8,       PIXMAN_null,     PIXMAN_a8,       sse2_composite_add_8000_8000,       0 },
     { PIXMAN_OP_ADD,  PIXMAN_a8r8g8b8, PIXMAN_null,     PIXMAN_a8r8g8b8, sse2_composite_add_8888_8888,       0 },
     { PIXMAN_OP_ADD,  PIXMAN_a8b8g8r8, PIXMAN_null,     PIXMAN_a8b8g8r8, sse2_composite_add_8888_8888,       0 },