From 1b2ff7eff7e97d8542e35514bfa010be706973ee Mon Sep 17 00:00:00 2001
From: Neil Roberts
Date: Fri, 18 Dec 2009 21:17:21 +0000
Subject: [PATCH] cogl: Use SSE2 when possible for premultiplying

This adds a fast path for premultiplying an RGBA image using SSE2
instructions. SSE registers are 128-bit and we need at least 16-bits
per component for the intermediate result of the multiplication so we
can do two pixels in parallel with one register. The function
interleaves 2 SSE registers to multiply 4 pixels in one function call
with the hope that this will pipeline better.

http://bugzilla.openedhand.com/show_bug.cgi?id=1939

Signed-off-by: Emmanuele Bassi
---
 clutter/cogl/cogl/cogl-bitmap-fallback.c | 104 ++++++++++++++++++++++++++++++-
 1 file changed, 103 insertions(+), 1 deletion(-)

diff --git a/clutter/cogl/cogl/cogl-bitmap-fallback.c b/clutter/cogl/cogl/cogl-bitmap-fallback.c
index a02d856..eee42e5 100644
--- a/clutter/cogl/cogl/cogl-bitmap-fallback.c
+++ b/clutter/cogl/cogl/cogl-bitmap-fallback.c
@@ -215,6 +215,91 @@ _cogl_premult_alpha_first (guchar *dst)
 
 #undef MULT
 
+/* Use the SSE optimized version to premult four pixels at once when
+   it is available. The same assembler code works for x86 and x86-64
+   because it doesn't refer to any non-SSE registers directly */
+#if defined(__SSE2__) && defined(__GNUC__) \
+  && (defined(__x86_64) || defined(__i386))
+#define COGL_USE_PREMULT_SSE2
+#endif
+
+#ifdef COGL_USE_PREMULT_SSE2
+
+inline static void
+_cogl_premult_alpha_last_four_pixels_sse2 (const guint8 *p)
+{
+  /* 8 copies of 128 used below */
+  static const gint16 eight_halves[8] __attribute__ ((aligned (16))) =
+    { 128, 128, 128, 128, 128, 128, 128, 128 };
+  /* Mask of the rgb components of the four pixels */
+  static const gint8 just_rgb[16] __attribute__ ((aligned (16))) =
+    { 0xff, 0xff, 0xff, 0x00, 0xff, 0xff, 0xff, 0x00,
+      0xff, 0xff, 0xff, 0x00, 0xff, 0xff, 0xff, 0x00 };
+  /* Each SSE register only holds two pixels because we need to work
+     with 16-bit intermediate values. We still do four pixels by
+     interleaving two registers in the hope that it will pipeline
+     better */
+  asm (/* Load eight_halves into xmm5 for later */
+       "movdqa (%1), %%xmm5\n"
+       /* Clear xmm3 */
+       "pxor %%xmm3, %%xmm3\n"
+       /* Load two pixels from p into the low half of xmm0 */
+       "movlps (%0), %%xmm0\n"
+       /* Load the next set of two pixels from p into the low half of xmm1 */
+       "movlps 8(%0), %%xmm1\n"
+       /* Unpack 8 bytes from the low quad-words in each register to 8
+          16-bit values */
+       "punpcklbw %%xmm3, %%xmm0\n"
+       "punpcklbw %%xmm3, %%xmm1\n"
+       /* Copy alpha values of the first pixel in xmm0 to all
+          components of the first pixel in xmm2 */
+       "pshuflw $255, %%xmm0, %%xmm2\n"
+       /* same for xmm1 and xmm3 */
+       "pshuflw $255, %%xmm1, %%xmm3\n"
+       /* The above also copies the second pixel directly so we now
+          want to replace the RGB components with copies of the alpha
+          components */
+       "pshufhw $255, %%xmm2, %%xmm2\n"
+       "pshufhw $255, %%xmm3, %%xmm3\n"
+       /* Multiply the rgb components by the alpha */
+       "pmullw %%xmm2, %%xmm0\n"
+       "pmullw %%xmm3, %%xmm1\n"
+       /* Add 128 to each component */
+       "paddw %%xmm5, %%xmm0\n"
+       "paddw %%xmm5, %%xmm1\n"
+       /* Copy the results to temporary registers xmm4 and xmm5 */
+       "movdqa %%xmm0, %%xmm4\n"
+       "movdqa %%xmm1, %%xmm5\n"
+       /* Divide the results by 256 */
+       "psrlw $8, %%xmm0\n"
+       "psrlw $8, %%xmm1\n"
+       /* Add the temporaries back in */
+       "paddw %%xmm4, %%xmm0\n"
+       "paddw %%xmm5, %%xmm1\n"
+       /* Divide again */
+       "psrlw $8, %%xmm0\n"
+       "psrlw $8, %%xmm1\n"
+       /* Pack the results back as bytes */
+       "packuswb %%xmm1, %%xmm0\n"
+       /* Load just_rgb into xmm3 for later */
+       "movdqa (%2), %%xmm3\n"
+       /* Reload all four pixels into xmm2 */
+       "movups (%0), %%xmm2\n"
+       /* Mask out the alpha from the results */
+       "andps %%xmm3, %%xmm0\n"
+       /* Mask out the RGB from the original four pixels */
+       "andnps %%xmm2, %%xmm3\n"
+       /* Combine the two to get the right alpha values */
+       "orps %%xmm3, %%xmm0\n"
+       /* Write to memory */
+       "movdqu %%xmm0, (%0)\n"
+       : /* no outputs */
+       : "r" (p), "r" (eight_halves), "r" (just_rgb)
+       : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+
+#endif /* COGL_USE_PREMULT_SSE2 */
+
 gboolean
 _cogl_bitmap_fallback_can_convert (CoglPixelFormat src, CoglPixelFormat dst)
 {
@@ -408,7 +493,24 @@ _cogl_bitmap_fallback_premult (CoglBitmap *bmp)
     }
   else
     {
-      for (x = 0; x < bmp->width; x++)
+      x = bmp->width;
+
+#ifdef COGL_USE_PREMULT_SSE2
+
+      /* Process 4 pixels at a time */
+      while (x >= 4)
+        {
+          _cogl_premult_alpha_last_four_pixels_sse2 (p);
+          p += 4 * 4;
+          x -= 4;
+        }
+
+      /* If there are any pixels left we will fall through and
+         handle them below */
+
+#endif /* COGL_USE_PREMULT_SSE2 */
+
+      while (x-- > 0)
        {
          _cogl_premult_alpha_last (p);
          p += 4;
-- 
2.7.4
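
For reference, the per-component arithmetic the assembler performs is the
usual biased divide-by-255 approximation: t = c * a + 128, then
result = (t + (t >> 8)) >> 8. The following scalar sketch shows the same
computation for one RGBA pixel with alpha last; it is illustrative only
and not part of the patch, and it uses <stdint.h> types and hypothetical
helper names rather than the glib types used in cogl-bitmap-fallback.c.

#include <stdint.h>

/* Multiply one colour byte by the alpha byte and divide by 255 using
   the "+ 128, add the high byte, shift" rounding trick that the SSE2
   routine applies to eight components at a time. */
static inline uint8_t
premult_component (uint8_t c, uint8_t a)
{
  uint16_t t = (uint16_t) c * a + 128;

  return (uint8_t) ((t + (t >> 8)) >> 8);
}

/* p points at one RGBA pixel with the alpha byte last, matching the
   layout handled by _cogl_premult_alpha_last () */
static void
premult_pixel_rgba (uint8_t *p)
{
  p[0] = premult_component (p[0], p[3]);
  p[1] = premult_component (p[1], p[3]);
  p[2] = premult_component (p[2], p[3]);
}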