using U8 = uint8_t __attribute__((ext_vector_type(4)));
// We polyfill a few routines that Clang doesn't build into ext_vector_types.
+ static F fma(F f, F m, F a) { return vfmaq_f32(a,f,m); }
static F min(F a, F b) { return vminq_f32(a,b); }
static F max(F a, F b) { return vmaxq_f32(a,b); }
- static F fma(F f, F m, F a) { return vfmaq_f32(a,f,m); }
static F rcp (F v) { auto e = vrecpeq_f32 (v); return vrecpsq_f32 (v,e ) * e; }
static F rsqrt(F v) { auto e = vrsqrteq_f32(v); return vrsqrtsq_f32(v,e*e) * e; }
- static F if_then_else(I32 c, F t, F e) { return vbslq_f32((U32)c,t,e); }
static U32 round(F v, F scale) { return vcvtnq_u32_f32(v*scale); }
+ static F if_then_else(I32 c, F t, F e) { return vbslq_f32((U32)c,t,e); }
+
static F gather(const float* p, U32 ix) { return {p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]]}; }
#elif defined(__ARM_NEON__)
#if defined(__thumb2__) || !defined(__ARM_ARCH_7A__) || !defined(__ARM_VFPV4__)
using U32 = uint32_t __attribute__((ext_vector_type(2)));
using U8 = uint8_t __attribute__((ext_vector_type(2)));
- static F min(F a, F b) { return vmin_f32(a,b); }
- static F max(F a, F b) { return vmax_f32(a,b); }
- static F fma(F f, F m, F a) { return vfma_f32(a,f,m); }
- static F rcp (F v) { auto e = vrecpe_f32 (v); return vrecps_f32 (v,e ) * e; }
- static F rsqrt(F v) { auto e = vrsqrte_f32(v); return vrsqrts_f32(v,e*e) * e; }
- static F if_then_else(I32 c, F t, F e) { return vbsl_f32((U32)c,t,e); }
- static U32 round(F v, F scale) { return vcvt_u32_f32(fma(v,scale,0.5f)); }
+ static F fma(F f, F m, F a) { return vfma_f32(a,f,m); }
+ static F min(F a, F b) { return vmin_f32(a,b); }
+ static F max(F a, F b) { return vmax_f32(a,b); }
+ static F rcp (F v) { auto e = vrecpe_f32 (v); return vrecps_f32 (v,e ) * e; }
+ static F rsqrt(F v) { auto e = vrsqrte_f32(v); return vrsqrts_f32(v,e*e) * e; }
+ static U32 round(F v, F scale) { return vcvt_u32_f32(fma(v,scale,0.5f)); }
+
+ static F if_then_else(I32 c, F t, F e) { return vbsl_f32((U32)c,t,e); }
static F gather(const float* p, U32 ix) { return {p[ix[0]], p[ix[1]]}; }
-#else
- #if !defined(__AVX2__) || !defined(__FMA__) || !defined(__F16C__)
- #error On x86, compile with -mavx2 -mfma -mf16c.
- #endif
+#elif defined(__AVX2__) && defined(__FMA__) && defined(__F16C__)
#include <immintrin.h>
// These are __m256 and __m256i, but friendlier and strongly-typed.
using U32 = uint32_t __attribute__((ext_vector_type(8)));
using U8 = uint8_t __attribute__((ext_vector_type(8)));
- static F min(F a, F b) { return _mm256_min_ps (a,b); }
- static F max(F a, F b) { return _mm256_max_ps (a,b); }
- static F fma(F f, F m, F a) { return _mm256_fmadd_ps(f,m,a);}
- static F rcp (F v) { return _mm256_rcp_ps (v); }
- static F rsqrt(F v) { return _mm256_rsqrt_ps (v); }
- static F if_then_else(I32 c, F t, F e) { return _mm256_blendv_ps(e,t,c); }
- static U32 round(F v, F scale) { return _mm256_cvtps_epi32(v*scale); }
+ static F fma(F f, F m, F a) { return _mm256_fmadd_ps(f,m,a);}
+ static F min(F a, F b) { return _mm256_min_ps(a,b); }
+ static F max(F a, F b) { return _mm256_max_ps(a,b); }
+ static F rcp (F v) { return _mm256_rcp_ps (v); }
+ static F rsqrt(F v) { return _mm256_rsqrt_ps(v); }
+ static U32 round(F v, F scale) { return _mm256_cvtps_epi32(v*scale); }
+
+ static F if_then_else(I32 c, F t, F e) { return _mm256_blendv_ps(e,t,c); }
static F gather(const float* p, U32 ix) { return _mm256_i32gather_ps(p, ix, 4); }
+#elif defined(__SSE2__)
+ #include <immintrin.h>
+
+ using F = float __attribute__((ext_vector_type(4)));
+ using I32 = int32_t __attribute__((ext_vector_type(4)));
+ using U32 = uint32_t __attribute__((ext_vector_type(4)));
+ using U8 = uint8_t __attribute__((ext_vector_type(4)));
+
+ static F fma(F f, F m, F a) { return f*m+a; }
+ static F min(F a, F b) { return _mm_min_ps(a,b); }
+ static F max(F a, F b) { return _mm_max_ps(a,b); }
+ static F rcp (F v) { return _mm_rcp_ps (v); }
+ static F rsqrt(F v) { return _mm_rsqrt_ps(v); }
+ static U32 round(F v, F scale) { return _mm_cvtps_epi32(v*scale); }
+
+ static F if_then_else(I32 c, F t, F e) {
+ #if defined(__SSE4_1__)
+ return _mm_blendv_ps(e,t,c);
+ #else
+ return _mm_or_ps(_mm_and_ps(c, t), _mm_andnot_ps(c, e));
+ #endif
+ }
+
+ static F gather(const float* p, U32 ix) { return {p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]]}; }
#endif
static F cast (U32 v) { return __builtin_convertvector((I32)v, F); }
g = {ga[0], ga[2]};
b = {rb[1], rb[3]};
a = {ga[1], ga[3]};
-#else
+#elif defined(__AVX2__) && defined(__FMA__) && defined(__F16C__)
auto _01 = _mm_loadu_si128(((__m128i*)ptr) + 0),
_23 = _mm_loadu_si128(((__m128i*)ptr) + 1),
_45 = _mm_loadu_si128(((__m128i*)ptr) + 2),
g = _mm256_cvtph_ps(_mm_unpackhi_epi64(rg0123, rg4567));
b = _mm256_cvtph_ps(_mm_unpacklo_epi64(ba0123, ba4567));
a = _mm256_cvtph_ps(_mm_unpackhi_epi64(ba0123, ba4567));
+#elif defined(__SSE2__)
+ auto _01 = _mm_loadu_si128(((__m128i*)ptr) + 0),
+ _23 = _mm_loadu_si128(((__m128i*)ptr) + 1);
+
+ auto _02 = _mm_unpacklo_epi16(_01, _23), // r0 r2 g0 g2 b0 b2 a0 a2
+ _13 = _mm_unpackhi_epi16(_01, _23); // r1 r3 g1 g3 b1 b3 a1 a3
+
+ auto rg = _mm_unpacklo_epi16(_02, _13), // r0 r1 r2 r3 g0 g1 g2 g3
+ ba = _mm_unpackhi_epi16(_02, _13); // b0 b1 b2 b3 a0 a1 a2 a3
+
+ auto half_to_float = [&](U32 h) {
+ return (F)(h << 13) // Line up the mantissa,
+ * (F)U32(k->_0x77800000); // then fix up the exponent.
+ };
+
+ r = half_to_float(_mm_unpacklo_epi16(rg, _mm_setzero_si128()));
+ g = half_to_float(_mm_unpackhi_epi16(rg, _mm_setzero_si128()));
+ b = half_to_float(_mm_unpacklo_epi16(ba, _mm_setzero_si128()));
+ a = half_to_float(_mm_unpackhi_epi16(ba, _mm_setzero_si128()));
#endif
}
vcvt_f16_f32(float32x4_t{g[0], a[0], g[1], a[1]}),
}};
vst2_f16((float16_t*)ptr, rb_ga);
-#else
+#elif defined(__AVX2__) && defined(__FMA__) && defined(__F16C__)
auto R = _mm256_cvtps_ph(r, _MM_FROUND_CUR_DIRECTION),
G = _mm256_cvtps_ph(g, _MM_FROUND_CUR_DIRECTION),
B = _mm256_cvtps_ph(b, _MM_FROUND_CUR_DIRECTION),
_mm_storeu_si128((__m128i*)ptr + 1, _mm_unpackhi_epi32(rg0123, ba0123));
_mm_storeu_si128((__m128i*)ptr + 2, _mm_unpacklo_epi32(rg4567, ba4567));
_mm_storeu_si128((__m128i*)ptr + 3, _mm_unpackhi_epi32(rg4567, ba4567));
+#elif defined(__SSE2__)
+ auto float_to_half = [&](F f) {
+ return (U32)(f * (F)U32(k->_0x07800000)) // Fix up the exponent,
+ >> 13; // then line up the mantissa.
+ };
+ U32 R = float_to_half(r),
+ G = float_to_half(g),
+ B = float_to_half(b),
+ A = float_to_half(a);
+ U32 rg = R | _mm_slli_si128(G,2),
+ ba = B | _mm_slli_si128(A,2);
+ _mm_storeu_si128((__m128i*)ptr + 0, _mm_unpacklo_epi32(rg, ba));
+ _mm_storeu_si128((__m128i*)ptr + 1, _mm_unpackhi_epi32(rg, ba));
#endif
}