From 1b2ff7eff7e97d8542e35514bfa010be706973ee Mon Sep 17 00:00:00 2001
From: Neil Roberts
Date: Fri, 18 Dec 2009 21:17:21 +0000
Subject: [PATCH] cogl: Use SSE2 when possible for premultiplying

This adds a fast path for premultiplying an RGBA image using SSE2
instructions. SSE registers are 128-bit and we need at least 16-bits
per component for the intermediate result of the multiplication so we
can do two pixels in parallel with one register. The function
interleaves 2 SSE registers to multiply 4 pixels in one function call
with the hope that this will pipeline better.

http://bugzilla.openedhand.com/show_bug.cgi?id=1939

Signed-off-by: Emmanuele Bassi
---
 clutter/cogl/cogl/cogl-bitmap-fallback.c | 104 ++++++++++++++++++++++++++++++-
 1 file changed, 103 insertions(+), 1 deletion(-)

diff --git a/clutter/cogl/cogl/cogl-bitmap-fallback.c b/clutter/cogl/cogl/cogl-bitmap-fallback.c
index a02d856..eee42e5 100644
--- a/clutter/cogl/cogl/cogl-bitmap-fallback.c
+++ b/clutter/cogl/cogl/cogl-bitmap-fallback.c
@@ -215,6 +215,91 @@ _cogl_premult_alpha_first (guchar *dst)
 
 #undef MULT
 
+/* Use the SSE optimized version to premult four pixels at once when
+   it is available. The same assembler code works for x86 and x86-64
+   because it doesn't refer to any non-SSE registers directly */
+#if defined(__SSE2__) && defined(__GNUC__) \
+  && (defined(__x86_64) || defined(__i386))
+#define COGL_USE_PREMULT_SSE2
+#endif
+
+#ifdef COGL_USE_PREMULT_SSE2
+
+inline static void
+_cogl_premult_alpha_last_four_pixels_sse2 (const guint8 *p)
+{
+  /* 8 copies of 128 used below */
+  static const gint16 eight_halves[8] __attribute__ ((aligned (16))) =
+    { 128, 128, 128, 128, 128, 128, 128, 128 };
+  /* Mask of the rgb components of the four pixels */
+  static const gint8 just_rgb[16] __attribute__ ((aligned (16))) =
+    { 0xff, 0xff, 0xff, 0x00, 0xff, 0xff, 0xff, 0x00,
+      0xff, 0xff, 0xff, 0x00, 0xff, 0xff, 0xff, 0x00 };
+  /* Each SSE register only holds two pixels because we need to work
+     with 16-bit intermediate values. We still do four pixels by
+     interleaving two registers in the hope that it will pipeline
+     better */
+  asm (/* Load eight_halves into xmm5 for later */
+       "movdqa (%1), %%xmm5\n"
+       /* Clear xmm3 */
+       "pxor %%xmm3, %%xmm3\n"
+       /* Load two pixels from p into the low half of xmm0 */
+       "movlps (%0), %%xmm0\n"
+       /* Load the next set of two pixels from p into the low half of xmm1 */
+       "movlps 8(%0), %%xmm1\n"
+       /* Unpack 8 bytes from the low quad-words in each register to 8
+          16-bit values */
+       "punpcklbw %%xmm3, %%xmm0\n"
+       "punpcklbw %%xmm3, %%xmm1\n"
+       /* Copy alpha values of the first pixel in xmm0 to all
+          components of the first pixel in xmm2 */
+       "pshuflw $255, %%xmm0, %%xmm2\n"
+       /* same for xmm1 and xmm3 */
+       "pshuflw $255, %%xmm1, %%xmm3\n"
+       /* The above also copies the second pixel directly so we now
+          want to replace the RGB components with copies of the alpha
+          components */
+       "pshufhw $255, %%xmm2, %%xmm2\n"
+       "pshufhw $255, %%xmm3, %%xmm3\n"
+       /* Multiply the rgb components by the alpha */
+       "pmullw %%xmm2, %%xmm0\n"
+       "pmullw %%xmm3, %%xmm1\n"
+       /* Add 128 to each component */
+       "paddw %%xmm5, %%xmm0\n"
+       "paddw %%xmm5, %%xmm1\n"
+       /* Copy the results to temporary registers xmm4 and xmm5 */
+       "movdqa %%xmm0, %%xmm4\n"
+       "movdqa %%xmm1, %%xmm5\n"
+       /* Divide the results by 256 */
+       "psrlw $8, %%xmm0\n"
+       "psrlw $8, %%xmm1\n"
+       /* Add the temporaries back in */
+       "paddw %%xmm4, %%xmm0\n"
+       "paddw %%xmm5, %%xmm1\n"
+       /* Divide again */
+       "psrlw $8, %%xmm0\n"
+       "psrlw $8, %%xmm1\n"
+       /* Pack the results back as bytes */
+       "packuswb %%xmm1, %%xmm0\n"
+       /* Load just_rgb into xmm3 for later */
+       "movdqa (%2), %%xmm3\n"
+       /* Reload all four pixels into xmm2 */
+       "movups (%0), %%xmm2\n"
+       /* Mask out the alpha from the results */
+       "andps %%xmm3, %%xmm0\n"
+       /* Mask out the RGB from the original four pixels */
+       "andnps %%xmm2, %%xmm3\n"
+       /* Combine the two to get the right alpha values */
+       "orps %%xmm3, %%xmm0\n"
+       /* Write to memory */
+       "movdqu %%xmm0, (%0)\n"
+       : /* no outputs */
+       : "r" (p), "r" (eight_halves), "r" (just_rgb)
+       : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+
+#endif /* COGL_USE_PREMULT_SSE2 */
+
 gboolean
 _cogl_bitmap_fallback_can_convert (CoglPixelFormat src, CoglPixelFormat dst)
 {
@@ -408,7 +493,24 @@ _cogl_bitmap_fallback_premult (CoglBitmap *bmp)
     }
   else
     {
-      for (x = 0; x < bmp->width; x++)
+      x = bmp->width;
+
+#ifdef COGL_USE_PREMULT_SSE2
+
+      /* Process 4 pixels at a time */
+      while (x >= 4)
+        {
+          _cogl_premult_alpha_last_four_pixels_sse2 (p);
+          p += 4 * 4;
+          x -= 4;
+        }
+
+      /* If there are any pixels left we will fall through and
+         handle them below */
+
+#endif /* COGL_USE_PREMULT_SSE2 */
+
+      while (x-- > 0)
        {
          _cogl_premult_alpha_last (p);
          p += 4;
-- 
2.7.4
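
For reference, the per-component arithmetic the assembler performs is the
usual biased divide-by-255 approximation: t = c * a + 128, then
result = (t + (t >> 8)) >> 8. The following scalar sketch shows the same
computation for one RGBA pixel with alpha last; it is illustrative only
and not part of the patch, and it uses <stdint.h> types and hypothetical
helper names rather than the glib types used in cogl-bitmap-fallback.c.

#include <stdint.h>

/* Multiply one colour byte by the alpha byte and divide by 255 using
   the "+ 128, add the high byte, shift" rounding trick that the SSE2
   routine applies to eight components at a time. */
static inline uint8_t
premult_component (uint8_t c, uint8_t a)
{
  uint16_t t = (uint16_t) c * a + 128;

  return (uint8_t) ((t + (t >> 8)) >> 8);
}

/* p points at one RGBA pixel with the alpha byte last, matching the
   layout handled by _cogl_premult_alpha_last () */
static void
premult_pixel_rgba (uint8_t *p)
{
  p[0] = premult_component (p[0], p[3]);
  p[1] = premult_component (p[1], p[3]);
  p[2] = premult_component (p[2], p[3]);
}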