2 /* filter_sse2_intrinsics.c - SSE2 optimized filter functions
4 * Copyright (c) 2018 Cosmin Truta
5 * Copyright (c) 2016-2017 Glenn Randers-Pehrson
6 * Written by Mike Klein and Matt Sarett
7 * Derived from arm/filter_neon_intrinsics.c
9 * This code is released under the libpng license.
10 * For conditions of distribution and use, see the disclaimer
11 * and license in png.h
14 #include "../pngpriv.h"
16 #ifdef PNG_READ_SUPPORTED
18 #if PNG_INTEL_SSE_IMPLEMENTATION > 0
20 #include <immintrin.h>
22 /* Functions in this file look at at most 3 pixels (a,b,c) to predict the
 * 4th (d).
23 * They're positioned like this:
 *    prev:  c b
 *    row:   a d
26 * The Sub filter predicts d=a, Avg d=(a+b)/2, and Paeth predicts d to be
27 * whichever of a, b, or c is closest to p=a+b-c.
/* Helper: copies sizeof(tmp) bytes from p into the scratch variable tmp with
 * memcpy (so p does not have to be aligned), then moves tmp into the low
 * 32 bits of an SSE register with _mm_cvtsi32_si128, which zeroes the
 * remaining bits of the register.
 * NOTE(review): the declaration of tmp and the closing brace are not visible
 * in this listing; tmp is presumably a 4-byte int - confirm.
 */
30 static __m128i load4(const void* p) {
32 memcpy(&tmp, p, sizeof(tmp));
33 return _mm_cvtsi32_si128(tmp);
/* Helper: extracts the low 32 bits of v with _mm_cvtsi128_si32 and copies
 * sizeof(int) bytes of that value to p with memcpy (so p does not have to be
 * aligned).
 * NOTE(review): the closing brace is not visible in this listing.
 */
36 static void store4(void* p, __m128i v) {
37 int tmp = _mm_cvtsi128_si32(v);
38 memcpy(p, &tmp, sizeof(int));
/* Helper: returns an SSE register whose low 32 bits hold tmp (the remaining
 * bits are zeroed by _mm_cvtsi32_si128).
 * NOTE(review): the lines that declare and fill tmp are not visible in this
 * listing. Going by the name and by its use in the 3-byte-pixel filters,
 * this presumably reads only 3 bytes from p - confirm against the complete
 * file.
 */
41 static __m128i load3(const void* p) {
44 return _mm_cvtsi32_si128(tmp);
/* Helper: extracts the low 32 bits of v into tmp with _mm_cvtsi128_si32.
 * NOTE(review): the statement that writes tmp to p is not visible in this
 * listing. Going by the name, it presumably writes only 3 bytes so that the
 * byte after the pixel is left untouched - confirm against the complete file.
 */
47 static void store3(void* p, __m128i v) {
48 int tmp = _mm_cvtsi128_si32(v);
/* Reverses the PNG Sub filter on a row; by its name this is the variant for
 * 3-byte pixels.
 *
 * Each step moves the previously reconstructed pixel d into a, loads the
 * next filtered bytes from row into d, and adds a to d with wrapping 8-bit
 * addition (_mm_add_epi8). d starts out as zero, so the first pixel is
 * effectively added to zero. rb is initialised from row_info->rowbytes.
 *
 * NOTE(review): the rest of the parameter list, the declaration of rb, the
 * loop headers, the stores back to row and the closing braces are not
 * visible in this listing. The step appears twice, once with load4 and once
 * with load3; presumably the load3 copy handles the final pixel so that no
 * byte past the end of the row is read - confirm against the complete file.
 */
52 void png_read_filter_row_sub3_sse2(png_row_infop row_info, png_bytep row,
55 /* The Sub filter predicts each pixel as the previous pixel, a.
56 * There is no pixel to the left of the first pixel. It's encoded directly.
57 * That works with our main loop if we just say that left pixel was zero.
61 __m128i a, d = _mm_setzero_si128();
63 png_debug(1, "in png_read_filter_row_sub3_sse2");
65 rb = row_info->rowbytes;
67 a = d; d = load4(row);
68 d = _mm_add_epi8(d, a);
75 a = d; d = load3(row);
76 d = _mm_add_epi8(d, a);
/* Reverses the PNG Sub filter on a row; by its name this is the variant for
 * 4-byte pixels.
 *
 * Each step moves the previously reconstructed pixel d into a, loads the
 * next 4 filtered bytes from row into d with load4, and adds a to d with
 * wrapping 8-bit addition (_mm_add_epi8). d starts out as zero, so the first
 * pixel is effectively added to zero.
 *
 * rb is initialised to row_info->rowbytes+4, unlike the 3-byte variant which
 * uses rowbytes unchanged.
 * NOTE(review): the rest of the parameter list, the declaration of rb, the
 * loop that consumes rb, the store back to row and the closing braces are
 * not visible in this listing, so the reason for the +4 cannot be confirmed
 * from here.
 */
85 void png_read_filter_row_sub4_sse2(png_row_infop row_info, png_bytep row,
88 /* The Sub filter predicts each pixel as the previous pixel, a.
89 * There is no pixel to the left of the first pixel. It's encoded directly.
90 * That works with our main loop if we just say that left pixel was zero.
94 __m128i a, d = _mm_setzero_si128();
96 png_debug(1, "in png_read_filter_row_sub4_sse2");
98 rb = row_info->rowbytes+4;
100 a = d; d = load4(row);
101 d = _mm_add_epi8(d, a);
/* Reverses the PNG Average filter on a row; by its name this is the variant
 * for 3-byte pixels. prev is the previously reconstructed row.
 *
 * Each step moves the previously reconstructed pixel d into a, loads the
 * next filtered bytes from row into d, computes the truncated bytewise
 * average of a and b, and adds it to d with wrapping 8-bit addition.
 * _mm_avg_epu8 rounds up, so the code subtracts a correction derived from
 * a XOR b, whose low bit is set exactly when a+b is odd (the only case in
 * which rounding up and truncating differ).
 *
 * NOTE(review): the local declarations, the loads of b (presumably from
 * prev), the constant that masks the XOR, the loop headers, the stores back
 * to row and the closing braces are not visible in this listing. The step
 * appears twice, once with load4 and once with load3; presumably the load3
 * copy handles the final pixel so that no byte past the end of the row is
 * read - confirm against the complete file.
 */
110 void png_read_filter_row_avg3_sse2(png_row_infop row_info, png_bytep row,
111 png_const_bytep prev)
113 /* The Avg filter predicts each pixel as the (truncated) average of a and b.
114 * There's no pixel to the left of the first pixel. Luckily, it's
115 * predicted to be half of the pixel above it. So again, this works
116 * perfectly with our loop if we make sure a starts at zero.
121 const __m128i zero = _mm_setzero_si128();
126 png_debug(1, "in png_read_filter_row_avg3_sse2");
127 rb = row_info->rowbytes;
131 a = d; d = load4(row );
133 /* PNG requires a truncating average, so we can't just use _mm_avg_epu8 */
134 avg = _mm_avg_epu8(a,b);
135 /* ...but we can fix it up by subtracting off 1 if it rounded up. */
136 avg = _mm_sub_epi8(avg, _mm_and_si128(_mm_xor_si128(a,b),
138 d = _mm_add_epi8(d, avg);
148 a = d; d = load3(row );
150 /* PNG requires a truncating average, so we can't just use _mm_avg_epu8 */
151 avg = _mm_avg_epu8(a,b);
152 /* ...but we can fix it up by subtracting off 1 if it rounded up. */
153 avg = _mm_sub_epi8(avg, _mm_and_si128(_mm_xor_si128(a,b),
156 d = _mm_add_epi8(d, avg);
/* Reverses the PNG Average filter on a row; by its name this is the variant
 * for 4-byte pixels. prev is the previously reconstructed row.
 *
 * Each step moves the previously reconstructed pixel d into a, loads the
 * next 4 filtered bytes from row into d with load4, computes the truncated
 * bytewise average of a and b, and adds it to d with wrapping 8-bit
 * addition. _mm_avg_epu8 rounds up, so the code subtracts a correction
 * derived from a XOR b, whose low bit is set exactly when a+b is odd (the
 * only case in which rounding up and truncating differ).
 *
 * rb is initialised to row_info->rowbytes+4, unlike the 3-byte variant which
 * uses rowbytes unchanged.
 * NOTE(review): the local declarations, the load of b (presumably from
 * prev), the constant that masks the XOR, the loop that consumes rb, the
 * store back to row and the closing braces are not visible in this listing.
 */
165 void png_read_filter_row_avg4_sse2(png_row_infop row_info, png_bytep row,
166 png_const_bytep prev)
168 /* The Avg filter predicts each pixel as the (truncated) average of a and b.
169 * There's no pixel to the left of the first pixel. Luckily, it's
170 * predicted to be half of the pixel above it. So again, this works
171 * perfectly with our loop if we make sure a starts at zero.
174 const __m128i zero = _mm_setzero_si128();
178 png_debug(1, "in png_read_filter_row_avg4_sse2");
180 rb = row_info->rowbytes+4;
184 a = d; d = load4(row );
186 /* PNG requires a truncating average, so we can't just use _mm_avg_epu8 */
187 avg = _mm_avg_epu8(a,b);
188 /* ...but we can fix it up by subtracting off 1 if it rounded up. */
189 avg = _mm_sub_epi8(avg, _mm_and_si128(_mm_xor_si128(a,b),
192 d = _mm_add_epi8(d, avg);
201 /* Returns |x| for 16-bit lanes.
 * When PNG_INTEL_SSE_IMPLEMENTATION >= 2 the single intrinsic
 * _mm_abs_epi16 is used. The other code shown builds the same result from a
 * comparison mask: is_negative is all-ones in every lane where x < 0 and
 * zero elsewhere, so the XOR inverts only the negative lanes and subtracting
 * the mask (which is -1 in those lanes) adds 1 to them.
 * NOTE(review): the preprocessor lines separating the two paths, the final
 * return and the closing brace are not visible in this listing.
 */
202 static __m128i abs_i16(__m128i x) {
203 #if PNG_INTEL_SSE_IMPLEMENTATION >= 2
204 return _mm_abs_epi16(x);
206 /* Read this all as, return x<0 ? -x : x.
207 * To negate two's complement, you flip all the bits then add 1.
209 __m128i is_negative = _mm_cmplt_epi16(x, _mm_setzero_si128());
211 /* Flip negative lanes. */
212 x = _mm_xor_si128(x, is_negative);
214 /* +1 to negative lanes, else +0. */
215 x = _mm_sub_epi16(x, is_negative);
220 /* Bytewise c ? t : e.
 * When PNG_INTEL_SSE_IMPLEMENTATION >= 3 this is a single
 * _mm_blendv_epi8(e,t,c), which picks t where the mask byte in c has its top
 * bit set and e otherwise. The other return shown is the bitwise select
 * (c AND t) OR (NOT c AND e); it gives the same bytewise result when every
 * byte of c is all-ones or all-zeros, as is the case for the _mm_cmpeq_epi16
 * masks passed by the Paeth functions in this file.
 * NOTE(review): the preprocessor lines separating the two paths and the
 * closing brace are not visible in this listing.
 */
221 static __m128i if_then_else(__m128i c, __m128i t, __m128i e) {
222 #if PNG_INTEL_SSE_IMPLEMENTATION >= 3
223 return _mm_blendv_epi8(e,t,c);
225 return _mm_or_si128(_mm_and_si128(c, t), _mm_andnot_si128(c, e));
/* Reverses the PNG Paeth filter on a row; by its name this is the variant
 * for 3-byte pixels. prev is the previously reconstructed row.
 *
 * Per step: the old b becomes c and the old d becomes a; the new b and d are
 * loaded from prev and row and widened from bytes to 16-bit lanes by
 * interleaving with zero. The distances |p-a|, |p-b| and |p-c| for
 * p = a+b-c are computed in 16 bits (pa = b-c, pb = a-c, pc = pa+pb before
 * taking absolute values). The candidate with the smallest distance is
 * selected through nested if_then_else calls that test pa first and pb
 * second, and is added to d with wrapping 8-bit addition. The result is
 * packed back to bytes and written with store3.
 *
 * NOTE(review): the local declarations, the loop headers, the final argument
 * of the nested if_then_else, the pointer and rb updates and the closing
 * braces are not visible in this listing. The step appears twice, once with
 * load4 and once with load3; presumably the load3 copy handles the final
 * pixel so that no byte past the end of either row is read - confirm against
 * the complete file.
 */
229 void png_read_filter_row_paeth3_sse2(png_row_infop row_info, png_bytep row,
230 png_const_bytep prev)
232 /* Paeth tries to predict pixel d using the pixel to the left of it, a,
233 * and two pixels from the previous row, b and c:
236 * The Paeth function predicts d to be whichever of a, b, or c is nearest to
239 * The first pixel has no left context, and so uses an Up filter, p = b.
240 * This works naturally with our main loop's p = a+b-c if we force a and c
242 * Here we zero b and d, which become c and a respectively at the start of
246 const __m128i zero = _mm_setzero_si128();
250 png_debug(1, "in png_read_filter_row_paeth3_sse2");
252 rb = row_info->rowbytes;
254 /* It's easiest to do this math (particularly, deal with pc) with 16-bit
257 __m128i pa,pb,pc,smallest,nearest;
258 c = b; b = _mm_unpacklo_epi8(load4(prev), zero);
259 a = d; d = _mm_unpacklo_epi8(load4(row ), zero);
261 /* (p-a) == (a+b-c - a) == (b-c) */
263 pa = _mm_sub_epi16(b,c);
265 /* (p-b) == (a+b-c - b) == (a-c) */
266 pb = _mm_sub_epi16(a,c);
268 /* (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c) */
269 pc = _mm_add_epi16(pa,pb);
271 pa = abs_i16(pa); /* |p-a| */
272 pb = abs_i16(pb); /* |p-b| */
273 pc = abs_i16(pc); /* |p-c| */
275 smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb));
277 /* Paeth breaks ties favoring a over b over c. */
278 nearest = if_then_else(_mm_cmpeq_epi16(smallest, pa), a,
279 if_then_else(_mm_cmpeq_epi16(smallest, pb), b,
282 /* Note `_epi8`: we need addition to wrap modulo 256. */
283 d = _mm_add_epi8(d, nearest);
284 store3(row, _mm_packus_epi16(d,d));
291 /* It's easiest to do this math (particularly, deal with pc) with 16-bit
294 __m128i pa,pb,pc,smallest,nearest;
295 c = b; b = _mm_unpacklo_epi8(load3(prev), zero);
296 a = d; d = _mm_unpacklo_epi8(load3(row ), zero);
298 /* (p-a) == (a+b-c - a) == (b-c) */
299 pa = _mm_sub_epi16(b,c);
301 /* (p-b) == (a+b-c - b) == (a-c) */
302 pb = _mm_sub_epi16(a,c);
304 /* (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c) */
305 pc = _mm_add_epi16(pa,pb);
307 pa = abs_i16(pa); /* |p-a| */
308 pb = abs_i16(pb); /* |p-b| */
309 pc = abs_i16(pc); /* |p-c| */
311 smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb));
313 /* Paeth breaks ties favoring a over b over c. */
314 nearest = if_then_else(_mm_cmpeq_epi16(smallest, pa), a,
315 if_then_else(_mm_cmpeq_epi16(smallest, pb), b,
318 /* Note `_epi8`: we need addition to wrap modulo 256. */
319 d = _mm_add_epi8(d, nearest);
320 store3(row, _mm_packus_epi16(d,d));
/* Reverses the PNG Paeth filter on a row; by its name this is the variant
 * for 4-byte pixels. prev is the previously reconstructed row.
 *
 * Per step: the old b becomes c and the old d becomes a; the new b and d are
 * loaded from prev and row with load4 and widened from bytes to 16-bit lanes
 * by interleaving with zero. The distances |p-a|, |p-b| and |p-c| for
 * p = a+b-c are computed in 16 bits (pa = b-c, pb = a-c, pc = pa+pb before
 * taking absolute values). The candidate with the smallest distance is
 * selected through nested if_then_else calls that test pa first and pb
 * second, and is added to d with wrapping 8-bit addition. The result is
 * packed back to bytes and written with store4.
 *
 * rb is initialised to row_info->rowbytes+4, unlike the 3-byte variant which
 * uses rowbytes unchanged.
 * NOTE(review): the declarations of a, b, c, d and rb, the loop that
 * consumes rb, the final argument of the nested if_then_else, the pointer
 * updates and the closing braces are not visible in this listing.
 */
328 void png_read_filter_row_paeth4_sse2(png_row_infop row_info, png_bytep row,
329 png_const_bytep prev)
331 /* Paeth tries to predict pixel d using the pixel to the left of it, a,
332 * and two pixels from the previous row, b and c:
335 * The Paeth function predicts d to be whichever of a, b, or c is nearest to
338 * The first pixel has no left context, and so uses an Up filter, p = b.
339 * This works naturally with our main loop's p = a+b-c if we force a and c
341 * Here we zero b and d, which become c and a respectively at the start of
345 const __m128i zero = _mm_setzero_si128();
346 __m128i pa,pb,pc,smallest,nearest;
350 png_debug(1, "in png_read_filter_row_paeth4_sse2");
352 rb = row_info->rowbytes+4;
354 /* It's easiest to do this math (particularly, deal with pc) with 16-bit
357 c = b; b = _mm_unpacklo_epi8(load4(prev), zero);
358 a = d; d = _mm_unpacklo_epi8(load4(row ), zero);
360 /* (p-a) == (a+b-c - a) == (b-c) */
361 pa = _mm_sub_epi16(b,c);
363 /* (p-b) == (a+b-c - b) == (a-c) */
364 pb = _mm_sub_epi16(a,c);
366 /* (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c) */
367 pc = _mm_add_epi16(pa,pb);
369 pa = abs_i16(pa); /* |p-a| */
370 pb = abs_i16(pb); /* |p-b| */
371 pc = abs_i16(pc); /* |p-c| */
373 smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb));
375 /* Paeth breaks ties favoring a over b over c. */
376 nearest = if_then_else(_mm_cmpeq_epi16(smallest, pa), a,
377 if_then_else(_mm_cmpeq_epi16(smallest, pb), b,
380 /* Note `_epi8`: we need addition to wrap modulo 256. */
381 d = _mm_add_epi8(d, nearest);
382 store4(row, _mm_packus_epi16(d,d));
390 #endif /* PNG_INTEL_SSE_IMPLEMENTATION > 0 */