intel/filter_sse2_intrinsics.c

   1
   2 /* filter_sse2_intrinsics.c - SSE2 optimized filter functions
   3  *
   4  * Copyright (c) 2018 Cosmin Truta
   5  * Copyright (c) 2016-2017 Glenn Randers-Pehrson
   6  * Written by Mike Klein and Matt Sarett
   7  * Derived from arm/filter_neon_intrinsics.c
   8  *
   9  * This code is released under the libpng license.
  10  * For conditions of distribution and use, see the disclaimer
  11  * and license in png.h
  12  */
  13
  14 #include "../pngpriv.h"
  15
  16 #ifdef PNG_READ_SUPPORTED
  17
  18 #if PNG_INTEL_SSE_IMPLEMENTATION > 0
  19
  20 #include <immintrin.h>
  21
  22 /* Functions in this file look at most 3 pixels (a,b,c) to predict the 4th (d).
  23  * They're positioned like this:
  24  *    prev:  c b
  25  *    row:   a d
  26  * The Sub filter predicts d=a, Avg d=(a+b)/2, and Paeth predicts d to be
  27  * whichever of a, b, or c is closest to p=a+b-c.
  28  */
  29
  30 static __m128i load4(const void* p) {
  31    int tmp;
  32    memcpy(&tmp, p, sizeof(tmp));
  33    return _mm_cvtsi32_si128(tmp);
  34 }
  35
  36 static void store4(void* p, __m128i v) {
  37    int tmp = _mm_cvtsi128_si32(v);
  38    memcpy(p, &tmp, sizeof(int));
  39 }
  40
  41 static __m128i load3(const void* p) {
  42    png_uint_32 tmp = 0;
  43    memcpy(&tmp, p, 3);
  44    return _mm_cvtsi32_si128(tmp);
  45 }
  46
  47 static void store3(void* p, __m128i v) {
  48    int tmp = _mm_cvtsi128_si32(v);
  49    memcpy(p, &tmp, 3);
  50 }
  51
  52 void png_read_filter_row_sub3_sse2(png_row_infop row_info, png_bytep row,
  53    png_const_bytep prev)
  54 {
  55    /* The Sub filter predicts each pixel as the previous pixel, a.
  56     * There is no pixel to the left of the first pixel.  It's encoded directly.
  57     * That works with our main loop if we just say that left pixel was zero.
  58     */
  59    size_t rb;
  60
  61    __m128i a, d = _mm_setzero_si128();
  62
  63    png_debug(1, "in png_read_filter_row_sub3_sse2");
  64
  65    rb = row_info->rowbytes;
  66    while (rb >= 4) {
  67       a = d; d = load4(row);
  68       d = _mm_add_epi8(d, a);
  69       store3(row, d);
  70
  71       row += 3;
  72       rb  -= 3;
  73    }
  74    if (rb > 0) {
  75       a = d; d = load3(row);
  76       d = _mm_add_epi8(d, a);
  77       store3(row, d);
  78
  79       row += 3;
  80       rb  -= 3;
  81    }
  82    PNG_UNUSED(prev)
  83 }
  84
  85 void png_read_filter_row_sub4_sse2(png_row_infop row_info, png_bytep row,
  86    png_const_bytep prev)
  87 {
  88    /* The Sub filter predicts each pixel as the previous pixel, a.
  89     * There is no pixel to the left of the first pixel.  It's encoded directly.
  90     * That works with our main loop if we just say that left pixel was zero.
  91     */
  92    size_t rb;
  93
  94    __m128i a, d = _mm_setzero_si128();
  95
  96    png_debug(1, "in png_read_filter_row_sub4_sse2");
  97
  98    rb = row_info->rowbytes+4;
  99    while (rb > 4) {
 100       a = d; d = load4(row);
 101       d = _mm_add_epi8(d, a);
 102       store4(row, d);
 103
 104       row += 4;
 105       rb  -= 4;
 106    }
 107    PNG_UNUSED(prev)
 108 }
 109
 110 void png_read_filter_row_avg3_sse2(png_row_infop row_info, png_bytep row,
 111    png_const_bytep prev)
 112 {
 113    /* The Avg filter predicts each pixel as the (truncated) average of a and b.
 114     * There's no pixel to the left of the first pixel.  Luckily, it's
 115     * predicted to be half of the pixel above it.  So again, this works
 116     * perfectly with our loop if we make sure a starts at zero.
 117     */
 118
 119    size_t rb;
 120
 121    const __m128i zero = _mm_setzero_si128();
 122
 123    __m128i    b;
 124    __m128i a, d = zero;
 125
 126    png_debug(1, "in png_read_filter_row_avg3_sse2");
 127    rb = row_info->rowbytes;
 128    while (rb >= 4) {
 129       __m128i avg;
 130              b = load4(prev);
 131       a = d; d = load4(row );
 132
 133       /* PNG requires a truncating average, so we can't just use _mm_avg_epu8 */
 134       avg = _mm_avg_epu8(a,b);
 135       /* ...but we can fix it up by subtracting off 1 if it rounded up. */
 136       avg = _mm_sub_epi8(avg, _mm_and_si128(_mm_xor_si128(a,b),
 137                                             _mm_set1_epi8(1)));
 138       d = _mm_add_epi8(d, avg);
 139       store3(row, d);
 140
 141       prev += 3;
 142       row  += 3;
 143       rb   -= 3;
 144    }
 145    if (rb > 0) {
 146       __m128i avg;
 147              b = load3(prev);
 148       a = d; d = load3(row );
 149
 150       /* PNG requires a truncating average, so we can't just use _mm_avg_epu8 */
 151       avg = _mm_avg_epu8(a,b);
 152       /* ...but we can fix it up by subtracting off 1 if it rounded up. */
 153       avg = _mm_sub_epi8(avg, _mm_and_si128(_mm_xor_si128(a,b),
 154                                             _mm_set1_epi8(1)));
 155
 156       d = _mm_add_epi8(d, avg);
 157       store3(row, d);
 158
 159       prev += 3;
 160       row  += 3;
 161       rb   -= 3;
 162    }
 163 }
 164
 165 void png_read_filter_row_avg4_sse2(png_row_infop row_info, png_bytep row,
 166    png_const_bytep prev)
 167 {
 168    /* The Avg filter predicts each pixel as the (truncated) average of a and b.
 169     * There's no pixel to the left of the first pixel.  Luckily, it's
 170     * predicted to be half of the pixel above it.  So again, this works
 171     * perfectly with our loop if we make sure a starts at zero.
 172     */
 173    size_t rb;
 174    const __m128i zero = _mm_setzero_si128();
 175    __m128i    b;
 176    __m128i a, d = zero;
 177
 178    png_debug(1, "in png_read_filter_row_avg4_sse2");
 179
 180    rb = row_info->rowbytes+4;
 181    while (rb > 4) {
 182       __m128i avg;
 183              b = load4(prev);
 184       a = d; d = load4(row );
 185
 186       /* PNG requires a truncating average, so we can't just use _mm_avg_epu8 */
 187       avg = _mm_avg_epu8(a,b);
 188       /* ...but we can fix it up by subtracting off 1 if it rounded up. */
 189       avg = _mm_sub_epi8(avg, _mm_and_si128(_mm_xor_si128(a,b),
 190                                             _mm_set1_epi8(1)));
 191
 192       d = _mm_add_epi8(d, avg);
 193       store4(row, d);
 194
 195       prev += 4;
 196       row  += 4;
 197       rb   -= 4;
 198    }
 199 }
 200
 201 /* Returns |x| for 16-bit lanes. */
 202 static __m128i abs_i16(__m128i x) {
 203 #if PNG_INTEL_SSE_IMPLEMENTATION >= 2
 204    return _mm_abs_epi16(x);
 205 #else
 206    /* Read this all as, return x<0 ? -x : x.
 207    * To negate two's complement, you flip all the bits then add 1.
 208     */
 209    __m128i is_negative = _mm_cmplt_epi16(x, _mm_setzero_si128());
 210
 211    /* Flip negative lanes. */
 212    x = _mm_xor_si128(x, is_negative);
 213
 214    /* +1 to negative lanes, else +0. */
 215    x = _mm_sub_epi16(x, is_negative);
 216    return x;
 217 #endif
 218 }
 219
 220 /* Bytewise c ? t : e. */
 221 static __m128i if_then_else(__m128i c, __m128i t, __m128i e) {
 222 #if PNG_INTEL_SSE_IMPLEMENTATION >= 3
 223    return _mm_blendv_epi8(e,t,c);
 224 #else
 225    return _mm_or_si128(_mm_and_si128(c, t), _mm_andnot_si128(c, e));
 226 #endif
 227 }
 228
 229 void png_read_filter_row_paeth3_sse2(png_row_infop row_info, png_bytep row,
 230    png_const_bytep prev)
 231 {
 232    /* Paeth tries to predict pixel d using the pixel to the left of it, a,
 233     * and two pixels from the previous row, b and c:
 234     *   prev: c b
 235     *   row:  a d
 236     * The Paeth function predicts d to be whichever of a, b, or c is nearest to
 237     * p=a+b-c.
 238     *
 239     * The first pixel has no left context, and so uses an Up filter, p = b.
 240     * This works naturally with our main loop's p = a+b-c if we force a and c
 241     * to zero.
 242     * Here we zero b and d, which become c and a respectively at the start of
 243     * the loop.
 244     */
 245    size_t rb;
 246    const __m128i zero = _mm_setzero_si128();
 247    __m128i c, b = zero,
 248            a, d = zero;
 249
 250    png_debug(1, "in png_read_filter_row_paeth3_sse2");
 251
 252    rb = row_info->rowbytes;
 253    while (rb >= 4) {
 254       /* It's easiest to do this math (particularly, deal with pc) with 16-bit
 255        * intermediates.
 256        */
 257       __m128i pa,pb,pc,smallest,nearest;
 258       c = b; b = _mm_unpacklo_epi8(load4(prev), zero);
 259       a = d; d = _mm_unpacklo_epi8(load4(row ), zero);
 260
 261       /* (p-a) == (a+b-c - a) == (b-c) */
 262
 263       pa = _mm_sub_epi16(b,c);
 264
 265       /* (p-b) == (a+b-c - b) == (a-c) */
 266       pb = _mm_sub_epi16(a,c);
 267
 268       /* (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c) */
 269       pc = _mm_add_epi16(pa,pb);
 270
 271       pa = abs_i16(pa);  /* |p-a| */
 272       pb = abs_i16(pb);  /* |p-b| */
 273       pc = abs_i16(pc);  /* |p-c| */
 274
 275       smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb));
 276
 277       /* Paeth breaks ties favoring a over b over c. */
 278       nearest  = if_then_else(_mm_cmpeq_epi16(smallest, pa), a,
 279                  if_then_else(_mm_cmpeq_epi16(smallest, pb), b,
 280                                                              c));
 281
 282       /* Note `_epi8`: we need addition to wrap modulo 255. */
 283       d = _mm_add_epi8(d, nearest);
 284       store3(row, _mm_packus_epi16(d,d));
 285
 286       prev += 3;
 287       row  += 3;
 288       rb   -= 3;
 289    }
 290    if (rb > 0) {
 291       /* It's easiest to do this math (particularly, deal with pc) with 16-bit
 292        * intermediates.
 293        */
 294       __m128i pa,pb,pc,smallest,nearest;
 295       c = b; b = _mm_unpacklo_epi8(load3(prev), zero);
 296       a = d; d = _mm_unpacklo_epi8(load3(row ), zero);
 297
 298       /* (p-a) == (a+b-c - a) == (b-c) */
 299       pa = _mm_sub_epi16(b,c);
 300
 301       /* (p-b) == (a+b-c - b) == (a-c) */
 302       pb = _mm_sub_epi16(a,c);
 303
 304       /* (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c) */
 305       pc = _mm_add_epi16(pa,pb);
 306
 307       pa = abs_i16(pa);  /* |p-a| */
 308       pb = abs_i16(pb);  /* |p-b| */
 309       pc = abs_i16(pc);  /* |p-c| */
 310
 311       smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb));
 312
 313       /* Paeth breaks ties favoring a over b over c. */
 314       nearest  = if_then_else(_mm_cmpeq_epi16(smallest, pa), a,
 315                          if_then_else(_mm_cmpeq_epi16(smallest, pb), b,
 316                                                                      c));
 317
 318       /* Note `_epi8`: we need addition to wrap modulo 255. */
 319       d = _mm_add_epi8(d, nearest);
 320       store3(row, _mm_packus_epi16(d,d));
 321
 322       prev += 3;
 323       row  += 3;
 324       rb   -= 3;
 325    }
 326 }
 327
 328 void png_read_filter_row_paeth4_sse2(png_row_infop row_info, png_bytep row,
 329    png_const_bytep prev)
 330 {
 331    /* Paeth tries to predict pixel d using the pixel to the left of it, a,
 332     * and two pixels from the previous row, b and c:
 333     *   prev: c b
 334     *   row:  a d
 335     * The Paeth function predicts d to be whichever of a, b, or c is nearest to
 336     * p=a+b-c.
 337     *
 338     * The first pixel has no left context, and so uses an Up filter, p = b.
 339     * This works naturally with our main loop's p = a+b-c if we force a and c
 340     * to zero.
 341     * Here we zero b and d, which become c and a respectively at the start of
 342     * the loop.
 343     */
 344    size_t rb;
 345    const __m128i zero = _mm_setzero_si128();
 346    __m128i pa,pb,pc,smallest,nearest;
 347    __m128i c, b = zero,
 348            a, d = zero;
 349
 350    png_debug(1, "in png_read_filter_row_paeth4_sse2");
 351
 352    rb = row_info->rowbytes+4;
 353    while (rb > 4) {
 354       /* It's easiest to do this math (particularly, deal with pc) with 16-bit
 355        * intermediates.
 356        */
 357       c = b; b = _mm_unpacklo_epi8(load4(prev), zero);
 358       a = d; d = _mm_unpacklo_epi8(load4(row ), zero);
 359
 360       /* (p-a) == (a+b-c - a) == (b-c) */
 361       pa = _mm_sub_epi16(b,c);
 362
 363       /* (p-b) == (a+b-c - b) == (a-c) */
 364       pb = _mm_sub_epi16(a,c);
 365
 366       /* (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c) */
 367       pc = _mm_add_epi16(pa,pb);
 368
 369       pa = abs_i16(pa);  /* |p-a| */
 370       pb = abs_i16(pb);  /* |p-b| */
 371       pc = abs_i16(pc);  /* |p-c| */
 372
 373       smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb));
 374
 375       /* Paeth breaks ties favoring a over b over c. */
 376       nearest  = if_then_else(_mm_cmpeq_epi16(smallest, pa), a,
 377                          if_then_else(_mm_cmpeq_epi16(smallest, pb), b,
 378                                                                      c));
 379
 380       /* Note `_epi8`: we need addition to wrap modulo 255. */
 381       d = _mm_add_epi8(d, nearest);
 382       store4(row, _mm_packus_epi16(d,d));
 383
 384       prev += 4;
 385       row  += 4;
 386       rb   -= 4;
 387    }
 388 }
 389
 390 #endif /* PNG_INTEL_SSE_IMPLEMENTATION > 0 */
 391 #endif /* READ */