cogl/cogl-bitmap-conversion.c

   1 /*
   2  * Cogl
   3  *
   4  * An object oriented GL/GLES Abstraction/Utility Layer
   5  *
   6  * Copyright (C) 2007,2008,2009 Intel Corporation.
   7  *
   8  * This library is free software; you can redistribute it and/or
   9  * modify it under the terms of the GNU Lesser General Public
  10  * License as published by the Free Software Foundation; either
  11  * version 2 of the License, or (at your option) any later version.
  12  *
  13  * This library is distributed in the hope that it will be useful,
  14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  16  * Lesser General Public License for more details.
  17  *
  18  * You should have received a copy of the GNU Lesser General Public
  19  * License along with this library. If not, see <http://www.gnu.org/licenses/>.
  20  *
  21  *
  22  */
  23
  24 #ifdef HAVE_CONFIG_H
  25 #include "config.h"
  26 #endif
  27
  28 #include "cogl-private.h"
  29 #include "cogl-bitmap-private.h"
  30 #include "cogl-context-private.h"
  31
  32 #include <string.h>
  33
  34 #define component_type guint8
  35 /* We want to specially optimise the packing when we are converting
  36    to/from an 8-bit type so that it won't do anything. That way for
  37    example if we are just doing a swizzle conversion then the inner
  38    loop for the conversion will be really simple */
  39 #define UNPACK_BYTE(b) (b)
  40 #define PACK_BYTE(b) (b)
  41 #include "cogl-bitmap-packing.h"
  42 #undef PACK_BYTE
  43 #undef UNPACK_BYTE
  44 #undef component_type
  45
  46 #define component_type guint16
  47 #define UNPACK_BYTE(b) (((b) * 65535 + 127) / 255)
  48 #define PACK_BYTE(b) (((b) * 255 + 32767) / 65535)
  49 #include "cogl-bitmap-packing.h"
  50 #undef PACK_BYTE
  51 #undef UNPACK_BYTE
  52 #undef component_type
  53
  54 /* (Un)Premultiplication */
  55
  56 inline static void
  57 _cogl_unpremult_alpha_0 (guint8 *dst)
  58 {
  59   dst[0] = 0;
  60   dst[1] = 0;
  61   dst[2] = 0;
  62   dst[3] = 0;
  63 }
  64
  65 inline static void
  66 _cogl_unpremult_alpha_last (guint8 *dst)
  67 {
  68   guint8 alpha = dst[3];
  69
  70   dst[0] = (dst[0] * 255) / alpha;
  71   dst[1] = (dst[1] * 255) / alpha;
  72   dst[2] = (dst[2] * 255) / alpha;
  73 }
  74
  75 inline static void
  76 _cogl_unpremult_alpha_first (guint8 *dst)
  77 {
  78   guint8 alpha = dst[0];
  79
  80   dst[1] = (dst[1] * 255) / alpha;
  81   dst[2] = (dst[2] * 255) / alpha;
  82   dst[3] = (dst[3] * 255) / alpha;
  83 }
  84
  85 /* No division form of floor((c*a + 128)/255) (I first encountered
  86  * this in the RENDER implementation in the X server.) Being exact
  87  * is important for a == 255 - we want to get exactly c.
  88  */
  89 #define MULT(d,a,t)                             \
  90   G_STMT_START {                                \
  91     t = d * a + 128;                            \
  92     d = ((t >> 8) + t) >> 8;                    \
  93   } G_STMT_END
  94
  95 inline static void
  96 _cogl_premult_alpha_last (guint8 *dst)
  97 {
  98   guint8 alpha = dst[3];
  99   /* Using a separate temporary per component has given slightly better
 100    * code generation with GCC in the past; it shouldn't do any worse in
 101    * any case.
 102    */
 103   unsigned int t1, t2, t3;
 104   MULT(dst[0], alpha, t1);
 105   MULT(dst[1], alpha, t2);
 106   MULT(dst[2], alpha, t3);
 107 }
 108
 109 inline static void
 110 _cogl_premult_alpha_first (guint8 *dst)
 111 {
 112   guint8 alpha = dst[0];
 113   unsigned int t1, t2, t3;
 114
 115   MULT(dst[1], alpha, t1);
 116   MULT(dst[2], alpha, t2);
 117   MULT(dst[3], alpha, t3);
 118 }
 119
 120 #undef MULT
 121
 122 /* Use the SSE optimized version to premult four pixels at once when
 123    it is available. The same assembler code works for x86 and x86-64
 124    because it doesn't refer to any non-SSE registers directly */
 125 #if defined(__SSE2__) && defined(__GNUC__) \
 126   && (defined(__x86_64) || defined(__i386))
 127 #define COGL_USE_PREMULT_SSE2
 128 #endif
 129
 130 #ifdef COGL_USE_PREMULT_SSE2
 131
 132 inline static void
 133 _cogl_premult_alpha_last_four_pixels_sse2 (guint8 *p)
 134 {
 135   /* 8 copies of 128 used below */
 136   static const gint16 eight_halves[8] __attribute__ ((aligned (16))) =
 137     { 128, 128, 128, 128, 128, 128, 128, 128 };
 138   /* Mask of the rgb components of the four pixels */
 139   static const gint8 just_rgb[16] __attribute__ ((aligned (16))) =
 140     { 0xff, 0xff, 0xff, 0x00, 0xff, 0xff, 0xff, 0x00,
 141       0xff, 0xff, 0xff, 0x00, 0xff, 0xff, 0xff, 0x00 };
 142   /* Each SSE register only holds two pixels because we need to work
 143      with 16-bit intermediate values. We still do four pixels by
 144      interleaving two registers in the hope that it will pipeline
 145      better */
 146   asm (/* Load eight_halves into xmm5 for later */
 147        "movdqa (%1), %%xmm5\n"
 148        /* Clear xmm3 */
 149        "pxor %%xmm3, %%xmm3\n"
 150        /* Load two pixels from p into the low half of xmm0 */
 151        "movlps (%0), %%xmm0\n"
 152        /* Load the next set of two pixels from p into the low half of xmm1 */
 153        "movlps 8(%0), %%xmm1\n"
 154        /* Unpack 8 bytes from the low quad-words in each register to 8
 155           16-bit values */
 156        "punpcklbw %%xmm3, %%xmm0\n"
 157        "punpcklbw %%xmm3, %%xmm1\n"
 158        /* Copy alpha values of the first pixel in xmm0 to all
 159           components of the first pixel in xmm2 */
 160        "pshuflw $255, %%xmm0, %%xmm2\n"
 161        /* same for xmm1 and xmm3 */
 162        "pshuflw $255, %%xmm1, %%xmm3\n"
 163        /* The above also copies the second pixel directly so we now
 164           want to replace the RGB components with copies of the alpha
 165           components */
 166        "pshufhw $255, %%xmm2, %%xmm2\n"
 167        "pshufhw $255, %%xmm3, %%xmm3\n"
 168        /* Multiply the rgb components by the alpha */
 169        "pmullw %%xmm2, %%xmm0\n"
 170        "pmullw %%xmm3, %%xmm1\n"
 171        /* Add 128 to each component */
 172        "paddw %%xmm5, %%xmm0\n"
 173        "paddw %%xmm5, %%xmm1\n"
 174        /* Copy the results to temporary registers xmm4 and xmm5 */
 175        "movdqa %%xmm0, %%xmm4\n"
 176        "movdqa %%xmm1, %%xmm5\n"
 177        /* Divide the results by 256 */
 178        "psrlw $8, %%xmm0\n"
 179        "psrlw $8, %%xmm1\n"
 180        /* Add the temporaries back in */
 181        "paddw %%xmm4, %%xmm0\n"
 182        "paddw %%xmm5, %%xmm1\n"
 183        /* Divide again */
 184        "psrlw $8, %%xmm0\n"
 185        "psrlw $8, %%xmm1\n"
 186        /* Pack the results back as bytes */
 187        "packuswb %%xmm1, %%xmm0\n"
 188        /* Load just_rgb into xmm3 for later */
 189        "movdqa (%2), %%xmm3\n"
 190        /* Reload all four pixels into xmm2 */
 191        "movups (%0), %%xmm2\n"
 192        /* Mask out the alpha from the results */
 193        "andps %%xmm3, %%xmm0\n"
 194        /* Mask out the RGB from the original four pixels */
 195        "andnps %%xmm2, %%xmm3\n"
 196        /* Combine the two to get the right alpha values */
 197        "orps %%xmm3, %%xmm0\n"
 198        /* Write to memory */
 199        "movdqu %%xmm0, (%0)\n"
 200        : /* no outputs */
 201        : "r" (p), "r" (eight_halves), "r" (just_rgb)
 202        : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
 203 }
 204
 205 #endif /* COGL_USE_PREMULT_SSE2 */
 206
 207 static void
 208 _cogl_bitmap_premult_unpacked_span_guint8 (guint8 *data,
 209                                            int width)
 210 {
 211 #ifdef COGL_USE_PREMULT_SSE2
 212
 213   /* Process 4 pixels at a time */
 214   while (width >= 4)
 215     {
 216       _cogl_premult_alpha_last_four_pixels_sse2 (data);
 217       data += 4 * 4;
 218       width -= 4;
 219     }
 220
 221   /* If there are any pixels left we will fall through and
 222      handle them below */
 223
 224 #endif /* COGL_USE_PREMULT_SSE2 */
 225
 226   while (width-- > 0)
 227     {
 228       _cogl_premult_alpha_last (data);
 229       data += 4;
 230     }
 231 }
 232
 233 static void
 234 _cogl_bitmap_unpremult_unpacked_span_guint8 (guint8 *data,
 235                                              int width)
 236 {
 237   int x;
 238
 239   for (x = 0; x < width; x++)
 240     {
 241       if (data[3] == 0)
 242         _cogl_unpremult_alpha_0 (data);
 243       else
 244         _cogl_unpremult_alpha_last (data);
 245       data += 4;
 246     }
 247 }
 248
 249 static void
 250 _cogl_bitmap_unpremult_unpacked_span_guint16 (guint16 *data,
 251                                               int width)
 252 {
 253   while (width-- > 0)
 254     {
 255       guint16 alpha = data[3];
 256
 257       if (alpha == 0)
 258         memset (data, 0, sizeof (guint16) * 3);
 259       else
 260         {
 261           data[0] = (data[0] * 65535) / alpha;
 262           data[1] = (data[1] * 65535) / alpha;
 263           data[2] = (data[2] * 65535) / alpha;
 264         }
 265     }
 266 }
 267
 268 static void
 269 _cogl_bitmap_premult_unpacked_span_guint16 (guint16 *data,
 270                                             int width)
 271 {
 272   while (width-- > 0)
 273     {
 274       guint16 alpha = data[3];
 275
 276       data[0] = (data[0] * alpha) / 65535;
 277       data[1] = (data[1] * alpha) / 65535;
 278       data[2] = (data[2] * alpha) / 65535;
 279     }
 280 }
 281
 282 static gboolean
 283 _cogl_bitmap_can_fast_premult (CoglPixelFormat format)
 284 {
 285   switch (format & ~COGL_PREMULT_BIT)
 286     {
 287     case COGL_PIXEL_FORMAT_RGBA_8888:
 288     case COGL_PIXEL_FORMAT_BGRA_8888:
 289     case COGL_PIXEL_FORMAT_ARGB_8888:
 290     case COGL_PIXEL_FORMAT_ABGR_8888:
 291       return TRUE;
 292
 293     default:
 294       return FALSE;
 295     }
 296 }
 297
 298 static gboolean
 299 _cogl_bitmap_needs_short_temp_buffer (CoglPixelFormat format)
 300 {
 301   /* If the format is using more than 8 bits per component then we'll
 302      unpack into a 16-bit per component buffer instead of 8-bit so we
 303      won't lose as much precision. If we ever add support for formats
 304      with more than 16 bits for at least one of the components then we
 305      should probably do something else here, maybe convert to
 306      floats */
 307   switch (format)
 308     {
 309     case COGL_PIXEL_FORMAT_ANY:
 310     case COGL_PIXEL_FORMAT_YUV:
 311       g_assert_not_reached ();
 312
 313     case COGL_PIXEL_FORMAT_A_8:
 314     case COGL_PIXEL_FORMAT_RGB_565:
 315     case COGL_PIXEL_FORMAT_RGBA_4444:
 316     case COGL_PIXEL_FORMAT_RGBA_5551:
 317     case COGL_PIXEL_FORMAT_G_8:
 318     case COGL_PIXEL_FORMAT_RGB_888:
 319     case COGL_PIXEL_FORMAT_BGR_888:
 320     case COGL_PIXEL_FORMAT_RGBA_8888:
 321     case COGL_PIXEL_FORMAT_BGRA_8888:
 322     case COGL_PIXEL_FORMAT_ARGB_8888:
 323     case COGL_PIXEL_FORMAT_ABGR_8888:
 324     case COGL_PIXEL_FORMAT_RGBA_8888_PRE:
 325     case COGL_PIXEL_FORMAT_BGRA_8888_PRE:
 326     case COGL_PIXEL_FORMAT_ARGB_8888_PRE:
 327     case COGL_PIXEL_FORMAT_ABGR_8888_PRE:
 328     case COGL_PIXEL_FORMAT_RGBA_4444_PRE:
 329     case COGL_PIXEL_FORMAT_RGBA_5551_PRE:
 330       return FALSE;
 331
 332     case COGL_PIXEL_FORMAT_RGBA_1010102:
 333     case COGL_PIXEL_FORMAT_BGRA_1010102:
 334     case COGL_PIXEL_FORMAT_ARGB_2101010:
 335     case COGL_PIXEL_FORMAT_ABGR_2101010:
 336     case COGL_PIXEL_FORMAT_RGBA_1010102_PRE:
 337     case COGL_PIXEL_FORMAT_BGRA_1010102_PRE:
 338     case COGL_PIXEL_FORMAT_ARGB_2101010_PRE:
 339     case COGL_PIXEL_FORMAT_ABGR_2101010_PRE:
 340       return TRUE;
 341     }
 342
 343   g_assert_not_reached ();
 344 }
 345
 346 gboolean
 347 _cogl_bitmap_convert_into_bitmap (CoglBitmap *src_bmp,
 348                                   CoglBitmap *dst_bmp)
 349 {
 350   guint8          *src_data;
 351   guint8          *dst_data;
 352   guint8          *src;
 353   guint8          *dst;
 354   void            *tmp_row;
 355   int              src_rowstride;
 356   int              dst_rowstride;
 357   int              y;
 358   int              width, height;
 359   CoglPixelFormat  src_format;
 360   CoglPixelFormat  dst_format;
 361   gboolean         use_16;
 362   gboolean         need_premult;
 363
 364   src_format = cogl_bitmap_get_format (src_bmp);
 365   src_rowstride = cogl_bitmap_get_rowstride (src_bmp);
 366   dst_format = cogl_bitmap_get_format (dst_bmp);
 367   dst_rowstride = cogl_bitmap_get_rowstride (dst_bmp);
 368   width = cogl_bitmap_get_width (src_bmp);
 369   height = cogl_bitmap_get_height (src_bmp);
 370
 371   _COGL_RETURN_VAL_IF_FAIL (width == cogl_bitmap_get_width (dst_bmp), FALSE);
 372   _COGL_RETURN_VAL_IF_FAIL (height == cogl_bitmap_get_height (dst_bmp), FALSE);
 373
 374   need_premult
 375     = ((src_format & COGL_PREMULT_BIT) != (dst_format & COGL_PREMULT_BIT) &&
 376        src_format != COGL_PIXEL_FORMAT_A_8 &&
 377        dst_format != COGL_PIXEL_FORMAT_A_8 &&
 378        (src_format & dst_format & COGL_A_BIT));
 379
 380   /* If the base format is the same then we can just copy the bitmap
 381      instead */
 382   if ((src_format & ~COGL_PREMULT_BIT) == (dst_format & ~COGL_PREMULT_BIT) &&
 383       (!need_premult || _cogl_bitmap_can_fast_premult (dst_format)))
 384     {
 385       if (!_cogl_bitmap_copy_subregion (src_bmp, dst_bmp,
 386                                         0, 0, /* src_x / src_y */
 387                                         0, 0, /* dst_x / dst_y */
 388                                         width, height))
 389         return FALSE;
 390
 391       if (need_premult)
 392         {
 393           if ((dst_format & COGL_PREMULT_BIT))
 394             {
 395               if (!_cogl_bitmap_premult (dst_bmp))
 396                 return FALSE;
 397             }
 398           else
 399             {
 400               if (!_cogl_bitmap_unpremult (dst_bmp))
 401                 return FALSE;
 402             }
 403         }
 404
 405       return TRUE;
 406     }
 407
 408   src_data = _cogl_bitmap_map (src_bmp, COGL_BUFFER_ACCESS_READ, 0);
 409   if (src_data == NULL)
 410     return FALSE;
 411   dst_data = _cogl_bitmap_map (dst_bmp,
 412                                COGL_BUFFER_ACCESS_WRITE,
 413                                COGL_BUFFER_MAP_HINT_DISCARD);
 414   if (dst_data == NULL)
 415     {
 416       _cogl_bitmap_unmap (src_bmp);
 417       return FALSE;
 418     }
 419
 420   use_16 = _cogl_bitmap_needs_short_temp_buffer (dst_format);
 421
 422   /* Allocate a buffer to hold a temporary RGBA row */
 423   tmp_row = g_malloc (width *
 424                       (use_16 ? sizeof (guint16) : sizeof (guint8)) * 4);
 425
 426   /* FIXME: Optimize */
 427   for (y = 0; y < height; y++)
 428     {
 429       src = src_data + y * src_rowstride;
 430       dst = dst_data + y * dst_rowstride;
 431
 432       if (use_16)
 433         _cogl_unpack_guint16 (src_format, src, tmp_row, width);
 434       else
 435         _cogl_unpack_guint8 (src_format, src, tmp_row, width);
 436
 437       /* Handle premultiplication */
 438       if (need_premult)
 439         {
 440           if (dst_format & COGL_PREMULT_BIT)
 441             {
 442               if (use_16)
 443                 _cogl_bitmap_premult_unpacked_span_guint16 (tmp_row, width);
 444               else
 445                 _cogl_bitmap_premult_unpacked_span_guint8 (tmp_row, width);
 446             }
 447           else
 448             {
 449               if (use_16)
 450                 _cogl_bitmap_unpremult_unpacked_span_guint16 (tmp_row, width);
 451               else
 452                 _cogl_bitmap_unpremult_unpacked_span_guint8 (tmp_row, width);
 453             }
 454         }
 455
 456       if (use_16)
 457         _cogl_pack_guint16 (dst_format, tmp_row, dst, width);
 458       else
 459         _cogl_pack_guint8 (dst_format, tmp_row, dst, width);
 460     }
 461
 462   _cogl_bitmap_unmap (src_bmp);
 463   _cogl_bitmap_unmap (dst_bmp);
 464
 465   g_free (tmp_row);
 466
 467   return TRUE;
 468 }
 469
 470 CoglBitmap *
 471 _cogl_bitmap_convert (CoglBitmap *src_bmp,
 472                       CoglPixelFormat dst_format)
 473 {
 474   CoglBitmap *dst_bmp;
 475   int width, height;
 476
 477   _COGL_GET_CONTEXT (ctx, NULL);
 478
 479   width = cogl_bitmap_get_width (src_bmp);
 480   height = cogl_bitmap_get_height (src_bmp);
 481
 482   dst_bmp = _cogl_bitmap_new_with_malloc_buffer (ctx,
 483                                                  width, height,
 484                                                  dst_format);
 485
 486   if (!_cogl_bitmap_convert_into_bitmap (src_bmp, dst_bmp))
 487     {
 488       cogl_object_unref (dst_bmp);
 489       return NULL;
 490     }
 491
 492   return dst_bmp;
 493 }
 494
 495 gboolean
 496 _cogl_bitmap_unpremult (CoglBitmap *bmp)
 497 {
 498   guint8          *p, *data;
 499   guint16         *tmp_row;
 500   int              x,y;
 501   CoglPixelFormat  format;
 502   int              width, height;
 503   int              rowstride;
 504
 505   format = cogl_bitmap_get_format (bmp);
 506   width = cogl_bitmap_get_width (bmp);
 507   height = cogl_bitmap_get_height (bmp);
 508   rowstride = cogl_bitmap_get_rowstride (bmp);
 509
 510   if ((data = _cogl_bitmap_map (bmp,
 511                                 COGL_BUFFER_ACCESS_READ |
 512                                 COGL_BUFFER_ACCESS_WRITE,
 513                                 0)) == NULL)
 514     return FALSE;
 515
 516   /* If we can't directly unpremult the data inline then we'll
 517      allocate a temporary row and unpack the data. This assumes if we
 518      can fast premult then we can also fast unpremult */
 519   if (_cogl_bitmap_can_fast_premult (format))
 520     tmp_row = NULL;
 521   else
 522     tmp_row = g_malloc (sizeof (guint16) * 4 * width);
 523
 524   for (y = 0; y < height; y++)
 525     {
 526       p = (guint8*) data + y * rowstride;
 527
 528       if (tmp_row)
 529         {
 530           _cogl_unpack_guint16 (format, p, tmp_row, width);
 531           _cogl_bitmap_unpremult_unpacked_span_guint16 (tmp_row, width);
 532           _cogl_pack_guint16 (format, tmp_row, p, width);
 533         }
 534       else
 535         {
 536           if (format & COGL_AFIRST_BIT)
 537             {
 538               for (x = 0; x < width; x++)
 539                 {
 540                   if (p[0] == 0)
 541                     _cogl_unpremult_alpha_0 (p);
 542                   else
 543                     _cogl_unpremult_alpha_first (p);
 544                   p += 4;
 545                 }
 546             }
 547           else
 548             _cogl_bitmap_unpremult_unpacked_span_guint8 (p, width);
 549         }
 550     }
 551
 552   g_free (tmp_row);
 553
 554   _cogl_bitmap_unmap (bmp);
 555
 556   _cogl_bitmap_set_format (bmp, format & ~COGL_PREMULT_BIT);
 557
 558   return TRUE;
 559 }
 560
 561 gboolean
 562 _cogl_bitmap_premult (CoglBitmap *bmp)
 563 {
 564   guint8          *p, *data;
 565   guint16         *tmp_row;
 566   int              x,y;
 567   CoglPixelFormat  format;
 568   int              width, height;
 569   int              rowstride;
 570
 571   format = cogl_bitmap_get_format (bmp);
 572   width = cogl_bitmap_get_width (bmp);
 573   height = cogl_bitmap_get_height (bmp);
 574   rowstride = cogl_bitmap_get_rowstride (bmp);
 575
 576   if ((data = _cogl_bitmap_map (bmp,
 577                                 COGL_BUFFER_ACCESS_READ |
 578                                 COGL_BUFFER_ACCESS_WRITE,
 579                                 0)) == NULL)
 580     return FALSE;
 581
 582   /* If we can't directly premult the data inline then we'll allocate
 583      a temporary row and unpack the data. */
 584   if (_cogl_bitmap_can_fast_premult (format))
 585     tmp_row = NULL;
 586   else
 587     tmp_row = g_malloc (sizeof (guint16) * 4 * width);
 588
 589   for (y = 0; y < height; y++)
 590     {
 591       p = (guint8*) data + y * rowstride;
 592
 593       if (tmp_row)
 594         {
 595           _cogl_unpack_guint16 (format, p, tmp_row, width);
 596           _cogl_bitmap_premult_unpacked_span_guint16 (tmp_row, width);
 597           _cogl_pack_guint16 (format, tmp_row, p, width);
 598         }
 599       else
 600         {
 601           if (format & COGL_AFIRST_BIT)
 602             {
 603               for (x = 0; x < width; x++)
 604                 {
 605                   _cogl_premult_alpha_first (p);
 606                   p += 4;
 607                 }
 608             }
 609           else
 610             _cogl_bitmap_premult_unpacked_span_guint8 (p, width);
 611         }
 612     }
 613
 614   g_free (tmp_row);
 615
 616   _cogl_bitmap_unmap (bmp);
 617
 618   _cogl_bitmap_set_format (bmp, format | COGL_PREMULT_BIT);
 619
 620   return TRUE;
 621 }