libswrKNL_la_CXXFLAGS = \
$(PTHREAD_CFLAGS) \
$(SWR_KNL_CXXFLAGS) \
- -DKNOB_ARCH=KNOB_ARCH_AVX512 -DAVX512F_STRICT \
+ -DKNOB_ARCH=KNOB_ARCH_AVX512 -DSIMD_ARCH_KNIGHTS \
$(COMMON_CXXFLAGS)
libswrKNL_la_SOURCES = \
rasterizer/common/simdlib_128_avx.inl \
rasterizer/common/simdlib_128_avx2.inl \
rasterizer/common/simdlib_128_avx512.inl \
+ rasterizer/common/simdlib_128_avx512_core.inl \
+ rasterizer/common/simdlib_128_avx512_knights.inl \
rasterizer/common/simdlib_256_avx.inl \
rasterizer/common/simdlib_256_avx2.inl \
rasterizer/common/simdlib_256_avx512.inl \
+ rasterizer/common/simdlib_256_avx512_core.inl \
+ rasterizer/common/simdlib_256_avx512_knights.inl \
rasterizer/common/simdlib_512_avx512.inl \
+ rasterizer/common/simdlib_512_avx512_core.inl \
+ rasterizer/common/simdlib_512_avx512_knights.inl \
rasterizer/common/simdlib_512_avx512_masks.inl \
+ rasterizer/common/simdlib_512_avx512_masks_core.inl \
+ rasterizer/common/simdlib_512_avx512_masks_knights.inl \
rasterizer/common/simdlib_512_emu.inl \
rasterizer/common/simdlib_512_emu_masks.inl \
rasterizer/common/simdlib_interface.hpp \
{
#define __SIMD_LIB_AVX512_HPP__
#include "simdlib_128_avx512.inl"
+#if defined(SIMD_ARCH_KNIGHTS)
+#include "simdlib_128_avx512_knights.inl"
+#else // optimize for core
+#include "simdlib_128_avx512_core.inl"
+#endif // defined(SIMD_ARCH_KNIGHTS)
#undef __SIMD_LIB_AVX512_HPP__
}; // struct AVX512Impl
#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX512
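// For illustration (not part of the change): the core/knights dispatch above
// repeats for each register width below. Both overlay files are listed in the
// SOURCES variables, and the per-target SIMD_ARCH_KNIGHTS define from
// Makefile.am selects at preprocessing time which overlay a given library
// actually compiles.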
{
#define __SIMD_LIB_AVX512_HPP__
#include "simdlib_256_avx512.inl"
+#if defined(SIMD_ARCH_KNIGHTS)
+#include "simdlib_256_avx512_knights.inl"
+#else // optimize for core
+#include "simdlib_256_avx512_core.inl"
+#endif // defined(SIMD_ARCH_KNIGHTS)
#undef __SIMD_LIB_AVX512_HPP__
}; // struct AVX512Impl
#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX512
#if SIMD_ARCH >= SIMD_ARCH_AVX512
- struct AVX512Impl
+ struct AVX512Impl : AVXImplBase<SIMD256Impl::AVX512Impl>
{
#define __SIMD_LIB_AVX512_HPP__
#include "simdlib_512_avx512.inl"
#include "simdlib_512_avx512_masks.inl"
+#if defined(SIMD_ARCH_KNIGHTS)
+#include "simdlib_512_avx512_knights.inl"
+#include "simdlib_512_avx512_masks_knights.inl"
+#else // optimize for core
+#include "simdlib_512_avx512_core.inl"
+#include "simdlib_512_avx512_masks_core.inl"
+#endif // defined(SIMD_ARCH_KNIGHTS)
#undef __SIMD_LIB_AVX512_HPP__
- }; // struct AVX512Impl
+ }; // struct AVX512ImplBase
#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX512
struct Traits : SIMDImpl::Traits
}
#define SIMD_WRAPPER_3(op) SIMD_WRAPPER_3_(op, op, __mmask16(0xf))
-#define SIMD_DWRAPPER_1_(op, intrin, mask) \
- static SIMDINLINE Double SIMDCALL op(Double a) \
- {\
- return __conv(_mm512_maskz_##intrin((mask), __conv(a)));\
- }
-#if !defined(AVX512F_STRICT)
-#define SIMD_DWRAPPER_1(op) SIMD_DWRAPPER_1_(op, op, __mmask8(0x3))
-#endif
-
-#define SIMD_DWRAPPER_1I_(op, intrin, mask) \
- template<int ImmT> \
- static SIMDINLINE Double SIMDCALL op(Double a) \
- {\
- return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT));\
- }
-#if !defined(AVX512F_STRICT)
-#define SIMD_DWRAPPER_1I(op) SIMD_DWRAPPER_1I_(op, op, __mmask8(0x3))
-#endif
-
-#define SIMD_DWRAPPER_2_(op, intrin, mask) \
- static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
- {\
- return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b)));\
- }
-#if !defined(AVX512F_STRICT)
-#define SIMD_DWRAPPER_2(op) SIMD_DWRAPPER_2_(op, op, __mmask8(0x3))
-#endif
-
#define SIMD_DWRAPPER_2I(op) \
template<int ImmT>\
static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
return __conv(_mm512_maskz_##intrin((mask), __conv(a)));\
}
#define SIMD_IWRAPPER_1_32(op) SIMD_IWRAPPER_1_(op, op, __mmask16(0xf))
-#if !defined(AVX512F_STRICT)
-#define SIMD_IWRAPPER_1_8(op) SIMD_IWRAPPER_1_(op, op, __mmask64(0xffffull))
-#define SIMD_IWRAPPER_1_16(op) SIMD_IWRAPPER_1_(op, op, __mmask32(0xff))
-#define SIMD_IWRAPPER_1_64(op) SIMD_IWRAPPER_1_(op, op, __mmask8(0x3))
-#endif
#define SIMD_IWRAPPER_1I_(op, intrin, mask) \
template<int ImmT> \
return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT));\
}
#define SIMD_IWRAPPER_1I_32(op) SIMD_IWRAPPER_1I_(op, op, __mmask16(0xf))
-#if !defined(AVX512F_STRICT)
-#define SIMD_IWRAPPER_1I_8(op) SIMD_IWRAPPER_1I_(op, op, __mmask64(0xffffull))
-#define SIMD_IWRAPPER_1I_16(op) SIMD_IWRAPPER_1I_(op, op, __mmask32(0xff))
-#define SIMD_IWRAPPER_1I_64(op) SIMD_IWRAPPER_1I_(op, op, __mmask8(0x3))
-#endif
#define SIMD_IWRAPPER_2_(op, intrin, mask) \
static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b)));\
}
#define SIMD_IWRAPPER_2_32(op) SIMD_IWRAPPER_2_(op, op, __mmask16(0xf))
-#if !defined(AVX512F_STRICT)
-#define SIMD_IWRAPPER_2_8(op) SIMD_IWRAPPER_2_(op, op, __mmask64(0xffffull))
-#define SIMD_IWRAPPER_2_16(op) SIMD_IWRAPPER_2_(op, op, __mmask32(0xff))
-#define SIMD_IWRAPPER_2_64(op) SIMD_IWRAPPER_2_(op, op, __mmask8(0x3))
-#endif
#define SIMD_IWRAPPER_2I(op) \
template<int ImmT>\
SIMD_IWRAPPER_2_32(min_epu32); // return (a < b) ? a : b (uint32)
SIMD_IWRAPPER_2_32(mul_epi32); // return a * b (int32)
-#if !defined(AVX512F_STRICT)
-
-SIMD_IWRAPPER_2_8(add_epi8); // return a + b (int8)
-SIMD_IWRAPPER_2_8(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
-
-#endif
+// SIMD_IWRAPPER_2_8(add_epi8); // return a + b (int8)
+// SIMD_IWRAPPER_2_8(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
// return (a * b) & 0xFFFFFFFF
//
SIMD_IWRAPPER_2_32(mullo_epi32);
SIMD_IWRAPPER_2_32(sub_epi32); // return a - b (int32)
-#if !defined(AVX512F_STRICT)
-
-SIMD_IWRAPPER_2_64(sub_epi64); // return a - b (int64)
-SIMD_IWRAPPER_2_8(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
-
-#endif
+// SIMD_IWRAPPER_2_64(sub_epi64); // return a - b (int64)
+// SIMD_IWRAPPER_2_8(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
//-----------------------------------------------------------------------
// Logical operations
//-----------------------------------------------------------------------
// Blend / shuffle / permute operations
//-----------------------------------------------------------------------
-#if !defined(AVX512F_STRICT)
-
-SIMD_IWRAPPER_2_8(packs_epi16); // int16 --> int8 See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
-SIMD_IWRAPPER_2_16(packs_epi32); // int32 --> int16 See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
-SIMD_IWRAPPER_2_8(packus_epi16); // uint16 --> uint8 See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
-SIMD_IWRAPPER_2_16(packus_epi32); // uint32 --> uint16 See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
-
-#endif
+// SIMD_IWRAPPER_2_8(packs_epi16); // int16 --> int8 See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
+// SIMD_IWRAPPER_2_16(packs_epi32); // int32 --> int16 See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
+// SIMD_IWRAPPER_2_8(packus_epi16); // uint16 --> uint8 See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
+// SIMD_IWRAPPER_2_16(packus_epi32); // uint32 --> uint16 See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
// SIMD_IWRAPPER_2_(permute_epi32, permutevar8x32_epi32);
//static SIMDINLINE Float SIMDCALL permute_ps(Float a, Integer swiz) // return a[swiz[i]] for each 32-bit lane i (float)
SIMD_IWRAPPER_2_32(unpackhi_epi32);
SIMD_IWRAPPER_2_32(unpacklo_epi32);
-#if !defined(AVX512F_STRICT)
-
-SIMD_IWRAPPER_2_16(unpackhi_epi16);
-SIMD_IWRAPPER_2_64(unpackhi_epi64);
-SIMD_IWRAPPER_2_8(unpackhi_epi8);
-SIMD_IWRAPPER_2_16(unpacklo_epi16);
-SIMD_IWRAPPER_2_64(unpacklo_epi64);
-SIMD_IWRAPPER_2_8(unpacklo_epi8);
-
-#endif
+// SIMD_IWRAPPER_2_16(unpackhi_epi16);
+// SIMD_IWRAPPER_2_64(unpackhi_epi64);
+// SIMD_IWRAPPER_2_8(unpackhi_epi8);
+// SIMD_IWRAPPER_2_16(unpacklo_epi16);
+// SIMD_IWRAPPER_2_64(unpacklo_epi64);
+// SIMD_IWRAPPER_2_8(unpacklo_epi8);
//-----------------------------------------------------------------------
// Load / store operations
static_cast<int>(ScaleT)));
}
-#if !defined(AVX512F_STRICT)
-
-static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer a)
-{
- __mmask64 m = 0xffffull;
- return static_cast<uint32_t>(
- _mm512_mask_test_epi8_mask(m, __conv(a), _mm512_set1_epi8(0x80)));
-}
-
-#endif
+// static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer a)
+// {
+// __mmask64 m = 0xffffull;
+// return static_cast<uint32_t>(
+// _mm512_mask_test_epi8_mask(m, __conv(a), _mm512_set1_epi8(0x80)));
+// }
static SIMDINLINE void SIMDCALL maskstore_ps(float *p, Integer mask, Float src)
{
_mm512_mask_storeu_epi32(p, __mmask16(0xf), __conv(a));
}
+static SIMDINLINE Float SIMDCALL vmask_ps(int32_t mask)
+{
+ return castsi_ps(__conv(_mm512_maskz_set1_epi32(__mmask16(mask & 0xf), -1)));
+}
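// For illustration (not part of the change): what vmask_ps computes, written
// against raw AVX512F intrinsics. The helper name is hypothetical.
#include <immintrin.h>
#include <cstdint>
static inline __m512 vmask_ps_sketch(int32_t mask)
{
    // Broadcast -1 (all bits set) into the mask-enabled low four 32-bit
    // lanes and zero the rest, then view the integer lanes as floats.
    // vmask_ps_sketch(0x5) -> lanes {~0, 0, ~0, 0, 0, ...} as floats.
    return _mm512_castsi512_ps(
        _mm512_maskz_set1_epi32(__mmask16(mask & 0xf), -1));
}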
+
//=======================================================================
// Legacy interface (available only in SIMD256 width)
//=======================================================================
--- /dev/null
+/****************************************************************************
+* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+****************************************************************************/
+#if !defined(__SIMD_LIB_AVX512_HPP__)
+#error Do not include this file directly, use "simdlib.hpp" instead.
+#endif
+
+//============================================================================
+// SIMD128 AVX (512) implementation
+//
+// Since this implementation inherits from the AVX (2) implementation,
+// the only operations below are the ones that replace AVX (2) operations.
+// These use native AVX512 instructions with masking to enable a larger
+// register set.
+//============================================================================
+
+#define SIMD_WRAPPER_1_(op, intrin, mask) \
+ static SIMDINLINE Float SIMDCALL op(Float a) \
+ {\
+ return __conv(_mm512_maskz_##intrin((mask), __conv(a)));\
+ }
+#define SIMD_WRAPPER_1(op) SIMD_WRAPPER_1_(op, op, __mmask16(0xf))
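// For illustration (not part of the change): assuming sqrt_ps is wrapped this
// way, SIMD_WRAPPER_1(sqrt_ps) pastes together
//
//     static SIMDINLINE Float SIMDCALL sqrt_ps(Float a)
//     {
//         return __conv(_mm512_maskz_sqrt_ps(__mmask16(0xf), __conv(a)));
//     }
//
// where the zero-masked AVX512F intrinsic touches only the low four 32-bit
// lanes, letting 128-bit operations run on the full 512-bit register file.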
+
+#define SIMD_WRAPPER_1I_(op, intrin, mask) \
+ template<int ImmT> \
+ static SIMDINLINE Float SIMDCALL op(Float a) \
+ {\
+ return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT));\
+ }
+#define SIMD_WRAPPER_1I(op) SIMD_WRAPPER_1I_(op, op, __mmask16(0xf))
+
+#define SIMD_WRAPPER_2_(op, intrin, mask) \
+ static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
+ {\
+ return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b)));\
+ }
+#define SIMD_WRAPPER_2(op) SIMD_WRAPPER_2_(op, op, __mmask16(0xf))
+
+#define SIMD_WRAPPER_2I(op) \
+ template<int ImmT>\
+ static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
+ {\
+ return __conv(_mm512_maskz_##op(0xf, __conv(a), __conv(b), ImmT));\
+ }
+
+#define SIMD_WRAPPER_3_(op, intrin, mask) \
+ static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c) \
+ {\
+ return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b), __conv(c)));\
+ }
+#define SIMD_WRAPPER_3(op) SIMD_WRAPPER_3_(op, op, __mmask16(0xf))
+
+#define SIMD_DWRAPPER_1_(op, intrin, mask) \
+ static SIMDINLINE Double SIMDCALL op(Double a) \
+ {\
+ return __conv(_mm512_maskz_##intrin((mask), __conv(a)));\
+ }
+#define SIMD_DWRAPPER_1(op) SIMD_DWRAPPER_1_(op, op, __mmask8(0x3))
+
+#define SIMD_DWRAPPER_1I_(op, intrin, mask) \
+ template<int ImmT> \
+ static SIMDINLINE Double SIMDCALL op(Double a) \
+ {\
+ return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT));\
+ }
+#define SIMD_DWRAPPER_1I(op) SIMD_DWRAPPER_1I_(op, op, __mmask8(0x3))
+
+#define SIMD_DWRAPPER_2_(op, intrin, mask) \
+ static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
+ {\
+ return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b)));\
+ }
+#define SIMD_DWRAPPER_2(op) SIMD_DWRAPPER_2_(op, op, __mmask8(0x3))
+
+#define SIMD_DWRAPPER_2I(op) \
+ template<int ImmT>\
+ static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
+ {\
+ return __conv(_mm512_maskz_##op(0x3, __conv(a), __conv(b), ImmT));\
+ }
+
+#define SIMD_IWRAPPER_1_(op, intrin, mask) \
+ static SIMDINLINE Integer SIMDCALL op(Integer a) \
+ {\
+ return __conv(_mm512_maskz_##intrin((mask), __conv(a)));\
+ }
+#define SIMD_IWRAPPER_1_8(op) SIMD_IWRAPPER_1_(op, op, __mmask64(0xffffull))
+#define SIMD_IWRAPPER_1_16(op) SIMD_IWRAPPER_1_(op, op, __mmask32(0xff))
+#define SIMD_IWRAPPER_1_64(op) SIMD_IWRAPPER_1_(op, op, __mmask8(0x3))
+
+#define SIMD_IWRAPPER_1I_(op, intrin, mask) \
+ template<int ImmT> \
+ static SIMDINLINE Integer SIMDCALL op(Integer a) \
+ {\
+ return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT));\
+ }
+#define SIMD_IWRAPPER_1I_8(op) SIMD_IWRAPPER_1I_(op, op, __mmask64(0xffffull))
+#define SIMD_IWRAPPER_1I_16(op) SIMD_IWRAPPER_1I_(op, op, __mmask32(0xff))
+#define SIMD_IWRAPPER_1I_64(op) SIMD_IWRAPPER_1I_(op, op, __mmask8(0x3))
+
+#define SIMD_IWRAPPER_2_(op, intrin, mask) \
+ static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
+ {\
+ return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b)));\
+ }
+#define SIMD_IWRAPPER_2_8(op) SIMD_IWRAPPER_2_(op, op, __mmask64(0xffffull))
+#define SIMD_IWRAPPER_2_16(op) SIMD_IWRAPPER_2_(op, op, __mmask32(0xff))
+#define SIMD_IWRAPPER_2_64(op) SIMD_IWRAPPER_2_(op, op, __mmask8(0x3))
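// For illustration (not part of the change): each mask constant selects
// exactly the low 128 bits at its element width.
static_assert(16 * 8  == 128, "__mmask64(0xffffull): 16 byte lanes");
static_assert( 8 * 16 == 128, "__mmask32(0xff): 8 word lanes");
static_assert( 2 * 64 == 128, "__mmask8(0x3): 2 qword lanes");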
+
+#define SIMD_IWRAPPER_2I(op) \
+ template<int ImmT>\
+ static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
+ {\
+ return __conv(_mm512_maskz_##op(0xf, __conv(a), __conv(b), ImmT));\
+ }
+
+SIMD_IWRAPPER_2_8(add_epi8); // return a + b (int8)
+SIMD_IWRAPPER_2_8(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
+SIMD_IWRAPPER_2_64(sub_epi64); // return a - b (int64)
+SIMD_IWRAPPER_2_8(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
+SIMD_IWRAPPER_2_8(packs_epi16); // int16 --> int8 See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
+SIMD_IWRAPPER_2_16(packs_epi32); // int32 --> int16 See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
+SIMD_IWRAPPER_2_8(packus_epi16); // uint16 --> uint8 See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
+SIMD_IWRAPPER_2_16(packus_epi32); // uint32 --> uint16 See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
+SIMD_IWRAPPER_2_16(unpackhi_epi16);
+SIMD_IWRAPPER_2_64(unpackhi_epi64);
+SIMD_IWRAPPER_2_8(unpackhi_epi8);
+SIMD_IWRAPPER_2_16(unpacklo_epi16);
+SIMD_IWRAPPER_2_64(unpacklo_epi64);
+SIMD_IWRAPPER_2_8(unpacklo_epi8);
+
+static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer a)
+{
+ __mmask64 m = 0xffffull;
+ return static_cast<uint32_t>(
+ _mm512_mask_test_epi8_mask(m, __conv(a), _mm512_set1_epi8(0x80)));
+}
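// For illustration (not part of the change): the wrapper above reproduces
// SSE2 movemask semantics on the low 128 bits. In raw intrinsics, with a
// hypothetical check helper:
#include <immintrin.h>
#include <cassert>
#include <cstdint>
static inline void movemask_epi8_equivalence(__m128i v)
{
    uint32_t sse = static_cast<uint32_t>(_mm_movemask_epi8(v));
    __m512i  z   = _mm512_castsi128_si512(v);   // low 128 bits hold v
    uint32_t avx = static_cast<uint32_t>(_mm512_mask_test_epi8_mask(
        __mmask64(0xffffull), z, _mm512_set1_epi8(char(0x80))));
    assert(sse == avx);  // both give the 16-bit mask of byte sign bits
}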
+
+#undef SIMD_WRAPPER_1_
+#undef SIMD_WRAPPER_1
+#undef SIMD_WRAPPER_1I_
+#undef SIMD_WRAPPER_1I
+#undef SIMD_WRAPPER_2_
+#undef SIMD_WRAPPER_2
+#undef SIMD_WRAPPER_2I
+#undef SIMD_WRAPPER_3_
+#undef SIMD_WRAPPER_3
+#undef SIMD_DWRAPPER_1_
+#undef SIMD_DWRAPPER_1
+#undef SIMD_DWRAPPER_1I_
+#undef SIMD_DWRAPPER_1I
+#undef SIMD_DWRAPPER_2_
+#undef SIMD_DWRAPPER_2
+#undef SIMD_DWRAPPER_2I
+#undef SIMD_IWRAPPER_1_
+#undef SIMD_IWRAPPER_1_8
+#undef SIMD_IWRAPPER_1_16
+#undef SIMD_IWRAPPER_1_32
+#undef SIMD_IWRAPPER_1_64
+#undef SIMD_IWRAPPER_1I_
+#undef SIMD_IWRAPPER_1I_8
+#undef SIMD_IWRAPPER_1I_16
+#undef SIMD_IWRAPPER_1I_32
+#undef SIMD_IWRAPPER_1I_64
+#undef SIMD_IWRAPPER_2_
+#undef SIMD_IWRAPPER_2_8
+#undef SIMD_IWRAPPER_2_16
+#undef SIMD_IWRAPPER_2_32
+#undef SIMD_IWRAPPER_2_64
+#undef SIMD_IWRAPPER_2I
+//#undef SIMD_IWRAPPER_2I_8
+//#undef SIMD_IWRAPPER_2I_16
+//#undef SIMD_IWRAPPER_2I_32
+//#undef SIMD_IWRAPPER_2I_64
--- /dev/null
+/****************************************************************************
+* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+****************************************************************************/
+#if !defined(__SIMD_LIB_AVX512_HPP__)
+#error Do not include this file directly, use "simdlib.hpp" instead.
+#endif
+
+//============================================================================
+// SIMD128 AVX (512) implementation for Knights Family
+//
+// Since this implementation inherits from the base AVX512 implementation,
+// the only operations below are the ones that replace AVX512F / AVX512CD
+// operations.
+// These use native AVX512 instructions with masking to enable a larger
+// register set.
+//============================================================================
+
}
#define SIMD_WRAPPER_3(op) SIMD_WRAPPER_3_(op, op, __mmask16(0xff))
-#define SIMD_DWRAPPER_1_(op, intrin, mask) \
- static SIMDINLINE Double SIMDCALL op(Double a) \
- {\
- return __conv(_mm512_maskz_##intrin((mask), __conv(a)));\
- }
-#if !defined(AVX512F_STRICT)
-#define SIMD_DWRAPPER_1(op) SIMD_DWRAPPER_1_(op, op, __mmask8(0xf))
-#endif
-
-#define SIMD_DWRAPPER_1I_(op, intrin, mask) \
- template<int ImmT> \
- static SIMDINLINE Double SIMDCALL op(Double a) \
- {\
- return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT));\
- }
-#if !defined(AVX512F_STRICT)
-#define SIMD_DWRAPPER_1I(op) SIMD_DWRAPPER_1I_(op, op, __mmask8(0xf))
-#endif
-
-#define SIMD_DWRAPPER_2_(op, intrin, mask) \
- static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
- {\
- return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b)));\
- }
-#if !defined(AVX512F_STRICT)
-#define SIMD_DWRAPPER_2(op) SIMD_DWRAPPER_2_(op, op, __mmask8(0xf))
-#endif
-
#define SIMD_DWRAPPER_2I(op) \
template<int ImmT>\
static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
return __conv(_mm512_maskz_##intrin((mask), __conv(a)));\
}
#define SIMD_IWRAPPER_1_32(op) SIMD_IWRAPPER_1_(op, op, __mmask16(0xff))
-#if !defined(AVX512F_STRICT)
-#define SIMD_IWRAPPER_1_8(op) SIMD_IWRAPPER_1_(op, op, __mmask64(0xffffffffull))
-#define SIMD_IWRAPPER_1_16(op) SIMD_IWRAPPER_1_(op, op, __mmask32(0xffff))
-#define SIMD_IWRAPPER_1_64(op) SIMD_IWRAPPER_1_(op, op, __mmask8(0xf))
-#endif
#define SIMD_IWRAPPER_1I_(op, intrin, mask) \
template<int ImmT> \
return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT));\
}
#define SIMD_IWRAPPER_1I_32(op) SIMD_IWRAPPER_1I_(op, op, __mmask16(0xff))
-#if !defined(AVX512F_STRICT)
-#define SIMD_IWRAPPER_1I_8(op) SIMD_IWRAPPER_1I_(op, op, __mmask64(0xffffffffull))
-#define SIMD_IWRAPPER_1I_16(op) SIMD_IWRAPPER_1I_(op, op, __mmask32(0xffff))
-#define SIMD_IWRAPPER_1I_64(op) SIMD_IWRAPPER_1I_(op, op, __mmask8(0xf))
-#endif
#define SIMD_IWRAPPER_2_(op, intrin, mask) \
static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b)));\
}
#define SIMD_IWRAPPER_2_32(op) SIMD_IWRAPPER_2_(op, op, __mmask16(0xff))
-#if !defined(AVX512F_STRICT)
-#define SIMD_IWRAPPER_2_8(op) SIMD_IWRAPPER_2_(op, op, __mmask64(0xffffffffull))
-#define SIMD_IWRAPPER_2_16(op) SIMD_IWRAPPER_2_(op, op, __mmask32(0xffff))
-#define SIMD_IWRAPPER_2_64(op) SIMD_IWRAPPER_2_(op, op, __mmask8(0xf))
-#endif
#define SIMD_IWRAPPER_2I(op) \
template<int ImmT>\
SIMD_IWRAPPER_2_32(min_epu32); // return (a < b) ? a : b (uint32)
SIMD_IWRAPPER_2_32(mul_epi32); // return a * b (int32)
-#if !defined(AVX512F_STRICT)
-
-SIMD_IWRAPPER_2_8(add_epi8); // return a + b (int8)
-SIMD_IWRAPPER_2_8(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
-
-#endif
+// SIMD_IWRAPPER_2_8(add_epi8); // return a + b (int8)
+// SIMD_IWRAPPER_2_8(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
// return (a * b) & 0xFFFFFFFF
//
SIMD_IWRAPPER_2_32(mullo_epi32);
SIMD_IWRAPPER_2_32(sub_epi32); // return a - b (int32)
-#if !defined(AVX512F_STRICT)
-
-SIMD_IWRAPPER_2_64(sub_epi64); // return a - b (int64)
-SIMD_IWRAPPER_2_8(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
-
-#endif
+// SIMD_IWRAPPER_2_64(sub_epi64); // return a - b (int64)
+// SIMD_IWRAPPER_2_8(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
//-----------------------------------------------------------------------
// Logical operations
//-----------------------------------------------------------------------
// Blend / shuffle / permute operations
//-----------------------------------------------------------------------
-#if !defined(AVX512F_STRICT)
-
-SIMD_IWRAPPER_2_8(packs_epi16); // int16 --> int8 See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
-SIMD_IWRAPPER_2_16(packs_epi32); // int32 --> int16 See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
-SIMD_IWRAPPER_2_8(packus_epi16); // uint16 --> uint8 See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
-SIMD_IWRAPPER_2_16(packus_epi32); // uint32 --> uint16 See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
-
-#endif
+// SIMD_IWRAPPER_2_8(packs_epi16); // int16 --> int8 See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
+// SIMD_IWRAPPER_2_16(packs_epi32); // int32 --> int16 See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
+// SIMD_IWRAPPER_2_8(packus_epi16); // uint16 --> uint8 See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
+// SIMD_IWRAPPER_2_16(packus_epi32); // uint32 --> uint16 See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
// SIMD_IWRAPPER_2_(permute_epi32, permutevar8x32_epi32);
SIMD_IWRAPPER_2_32(unpackhi_epi32);
SIMD_IWRAPPER_2_32(unpacklo_epi32);
-#if !defined(AVX512F_STRICT)
-
-SIMD_IWRAPPER_2_16(unpackhi_epi16);
-SIMD_IWRAPPER_2_64(unpackhi_epi64);
-SIMD_IWRAPPER_2_8(unpackhi_epi8);
-SIMD_IWRAPPER_2_16(unpacklo_epi16);
-SIMD_IWRAPPER_2_64(unpacklo_epi64);
-SIMD_IWRAPPER_2_8(unpacklo_epi8);
-
-#endif
+// SIMD_IWRAPPER_2_16(unpackhi_epi16);
+// SIMD_IWRAPPER_2_64(unpackhi_epi64);
+// SIMD_IWRAPPER_2_8(unpackhi_epi8);
+// SIMD_IWRAPPER_2_16(unpacklo_epi16);
+// SIMD_IWRAPPER_2_64(unpacklo_epi64);
+// SIMD_IWRAPPER_2_8(unpacklo_epi8);
//-----------------------------------------------------------------------
// Load / store operations
static_cast<int>(ScaleT)));
}
-#if !defined(AVX512F_STRICT)
-
-static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer a)
-{
- __mmask64 m = 0xffffffffull;
- return static_cast<uint32_t>(
- _mm512_mask_test_epi8_mask(m, __conv(a), _mm512_set1_epi8(0x80)));
-}
-
-#endif
+// static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer a)
+// {
+// __mmask64 m = 0xffffffffull;
+// return static_cast<uint32_t>(
+// _mm512_mask_test_epi8_mask(m, __conv(a), _mm512_set1_epi8(0x80)));
+// }
static SIMDINLINE void SIMDCALL maskstore_ps(float *p, Integer mask, Float src)
{
_mm512_mask_storeu_epi32(p, __mmask16(0xff), __conv(a));
}
+static SIMDINLINE Float SIMDCALL vmask_ps(int32_t mask)
+{
+ return castsi_ps(__conv(_mm512_maskz_set1_epi32(__mmask16(mask & 0xff), -1)));
+}
+
//=======================================================================
// Legacy interface (available only in SIMD256 width)
//=======================================================================
#undef SIMD_WRAPPER_2I
#undef SIMD_WRAPPER_3_
#undef SIMD_WRAPPER_3
-#undef SIMD_DWRAPPER_1_
-#undef SIMD_DWRAPPER_1
-#undef SIMD_DWRAPPER_1I_
-#undef SIMD_DWRAPPER_1I
-#undef SIMD_DWRAPPER_2_
-#undef SIMD_DWRAPPER_2
-#undef SIMD_DWRAPPER_2I
#undef SIMD_IWRAPPER_1_
-#undef SIMD_IWRAPPER_1_8
-#undef SIMD_IWRAPPER_1_16
#undef SIMD_IWRAPPER_1_32
-#undef SIMD_IWRAPPER_1_64
#undef SIMD_IWRAPPER_1I_
-#undef SIMD_IWRAPPER_1I_8
-#undef SIMD_IWRAPPER_1I_16
#undef SIMD_IWRAPPER_1I_32
-#undef SIMD_IWRAPPER_1I_64
#undef SIMD_IWRAPPER_2_
-#undef SIMD_IWRAPPER_2_8
-#undef SIMD_IWRAPPER_2_16
#undef SIMD_IWRAPPER_2_32
-#undef SIMD_IWRAPPER_2_64
#undef SIMD_IWRAPPER_2I
-//#undef SIMD_IWRAPPER_2I_8
-//#undef SIMD_IWRAPPER_2I_16
-//#undef SIMD_IWRAPPER_2I_32
-//#undef SIMD_IWRAPPER_2I_64
--- /dev/null
+/****************************************************************************
+* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+****************************************************************************/
+#if !defined(__SIMD_LIB_AVX512_HPP__)
+#error Do not include this file directly, use "simdlib.hpp" instead.
+#endif
+
+//============================================================================
+// SIMD256 AVX (512) implementation for Core processors
+//
+// Since this implementation inherits from the AVX (2) implementation,
+// the only operations below are the ones that replace AVX (2) operations.
+// These use native AVX512 instructions with masking to enable a larger
+// register set.
+//============================================================================
+
+#define SIMD_DWRAPPER_1_(op, intrin, mask) \
+ static SIMDINLINE Double SIMDCALL op(Double a) \
+ {\
+ return __conv(_mm512_maskz_##intrin((mask), __conv(a)));\
+ }
+#define SIMD_DWRAPPER_1(op) SIMD_DWRAPPER_1_(op, op, __mmask8(0xf))
+
+#define SIMD_DWRAPPER_1I_(op, intrin, mask) \
+ template<int ImmT> \
+ static SIMDINLINE Double SIMDCALL op(Double a) \
+ {\
+ return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT));\
+ }
+#define SIMD_DWRAPPER_1I(op) SIMD_DWRAPPER_1I_(op, op, __mmask8(0xf))
+
+#define SIMD_DWRAPPER_2_(op, intrin, mask) \
+ static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
+ {\
+ return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b)));\
+ }
+#define SIMD_DWRAPPER_2(op) SIMD_DWRAPPER_2_(op, op, __mmask8(0xf))
+
+#define SIMD_IWRAPPER_1_(op, intrin, mask) \
+ static SIMDINLINE Integer SIMDCALL op(Integer a) \
+ {\
+ return __conv(_mm512_maskz_##intrin((mask), __conv(a)));\
+ }
+#define SIMD_IWRAPPER_1_8(op) SIMD_IWRAPPER_1_(op, op, __mmask64(0xffffffffull))
+#define SIMD_IWRAPPER_1_16(op) SIMD_IWRAPPER_1_(op, op, __mmask32(0xffff))
+#define SIMD_IWRAPPER_1_64(op) SIMD_IWRAPPER_1_(op, op, __mmask8(0xf))
+
+#define SIMD_IWRAPPER_1I_(op, intrin, mask) \
+ template<int ImmT> \
+ static SIMDINLINE Integer SIMDCALL op(Integer a) \
+ {\
+ return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT));\
+ }
+#define SIMD_IWRAPPER_1I_8(op) SIMD_IWRAPPER_1I_(op, op, __mmask64(0xffffffffull))
+#define SIMD_IWRAPPER_1I_16(op) SIMD_IWRAPPER_1I_(op, op, __mmask32(0xffff))
+#define SIMD_IWRAPPER_1I_64(op) SIMD_IWRAPPER_1I_(op, op, __mmask8(0xf))
+
+#define SIMD_IWRAPPER_2_(op, intrin, mask) \
+ static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
+ {\
+ return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b)));\
+ }
+#define SIMD_IWRAPPER_2_8(op) SIMD_IWRAPPER_2_(op, op, __mmask64(0xffffffffull))
+#define SIMD_IWRAPPER_2_16(op) SIMD_IWRAPPER_2_(op, op, __mmask32(0xffff))
+#define SIMD_IWRAPPER_2_64(op) SIMD_IWRAPPER_2_(op, op, __mmask8(0xf))
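// For illustration (not part of the change): as in the SIMD128 overlay, each
// mask constant here covers exactly the low 256 bits at its element width.
static_assert(32 * 8  == 256, "__mmask64(0xffffffffull): 32 byte lanes");
static_assert(16 * 16 == 256, "__mmask32(0xffff): 16 word lanes");
static_assert( 4 * 64 == 256, "__mmask8(0xf): 4 qword lanes");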
+
+
+SIMD_IWRAPPER_2_8(add_epi8); // return a + b (int8)
+SIMD_IWRAPPER_2_8(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
+SIMD_IWRAPPER_2_64(sub_epi64); // return a - b (int64)
+SIMD_IWRAPPER_2_8(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
+SIMD_IWRAPPER_2_8(packs_epi16); // int16 --> int8 See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
+SIMD_IWRAPPER_2_16(packs_epi32); // int32 --> int16 See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
+SIMD_IWRAPPER_2_8(packus_epi16); // uint16 --> uint8 See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
+SIMD_IWRAPPER_2_16(packus_epi32); // uint32 --> uint16 See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
+SIMD_IWRAPPER_2_16(unpackhi_epi16);
+SIMD_IWRAPPER_2_64(unpackhi_epi64);
+SIMD_IWRAPPER_2_8(unpackhi_epi8);
+SIMD_IWRAPPER_2_16(unpacklo_epi16);
+SIMD_IWRAPPER_2_64(unpacklo_epi64);
+SIMD_IWRAPPER_2_8(unpacklo_epi8);
+
+static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer a)
+{
+ __mmask64 m = 0xffffffffull;
+ return static_cast<uint32_t>(
+ _mm512_mask_test_epi8_mask(m, __conv(a), _mm512_set1_epi8(0x80)));
+}
+
+#undef SIMD_DWRAPPER_1_
+#undef SIMD_DWRAPPER_1
+#undef SIMD_DWRAPPER_1I_
+#undef SIMD_DWRAPPER_1I
+#undef SIMD_DWRAPPER_2_
+#undef SIMD_DWRAPPER_2
+#undef SIMD_DWRAPPER_2I
+#undef SIMD_IWRAPPER_1_
+#undef SIMD_IWRAPPER_1_8
+#undef SIMD_IWRAPPER_1_16
+#undef SIMD_IWRAPPER_1_64
+#undef SIMD_IWRAPPER_1I_
+#undef SIMD_IWRAPPER_1I_8
+#undef SIMD_IWRAPPER_1I_16
+#undef SIMD_IWRAPPER_1I_64
+#undef SIMD_IWRAPPER_2_
+#undef SIMD_IWRAPPER_2_8
+#undef SIMD_IWRAPPER_2_16
+#undef SIMD_IWRAPPER_2_64
--- /dev/null
+/****************************************************************************
+* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+****************************************************************************/
+#if !defined(__SIMD_LIB_AVX512_HPP__)
+#error Do not include this file directly, use "simdlib.hpp" instead.
+#endif
+
+//============================================================================
+// SIMD256 AVX (512) implementation for Knights Family
+//
+// Since this implementation inherits from the AVX (2) implementation,
+// the only operations below are the ones that replace AVX (2) operations.
+// These use native AVX512 instructions with masking to enable a larger
+// register set.
+//============================================================================
+
#endif
#if defined(__GNUC__) && !defined( __clang__) && !defined(__INTEL_COMPILER)
-// gcc missing these intrinsics
+// gcc as of 7.1 was missing these intrinsics
#ifndef _mm512_cmpneq_ps_mask
#define _mm512_cmpneq_ps_mask(a,b) _mm512_cmp_ps_mask((a),(b),_CMP_NEQ_UQ)
#endif
#ifndef _mm512_cmplt_pd_mask
#define _mm512_cmplt_pd_mask(a,b) _mm512_cmp_pd_mask((a),(b),_CMP_LT_OS)
#endif
+
#endif
//============================================================================
-// SIMD16 AVX512 (F) implementation
+// SIMD16 AVX512 (F) implementation (compatible with Knights and Core
+// processors)
//
-// TODO: Optimize for KNL / KNH or for SKX??
-// For now probably optimizing more for KNL as that's where
-// immediate customers are.
//============================================================================
static const int TARGET_SIMD_WIDTH = 16;
}
#define SIMD_IWRAPPER_2I(op) SIMD_IWRAPPER_2I_(op, op)
-#define SIMD_EMU_IWRAPPER_2(op) \
- static SIMDINLINE \
- Integer SIMDCALL op(Integer a, Integer b)\
- {\
- return Integer\
- {\
- SIMD256T::op(a.v8[0], b.v8[0]),\
- SIMD256T::op(a.v8[1], b.v8[1]),\
- };\
- }
-
private:
- static SIMDINLINE Integer vmask(__mmask8 m)
- {
- return _mm512_maskz_set1_epi64(m, -1LL);
- }
static SIMDINLINE Integer vmask(__mmask16 m)
{
return _mm512_maskz_set1_epi32(m, -1);
}
- static SIMDINLINE Integer vmask(__mmask32 m)
- {
- return _mm512_maskz_set1_epi16(m, -1);
- }
- static SIMDINLINE Integer vmask(__mmask64 m)
- {
- return _mm512_maskz_set1_epi8(m, -1);
- }
public:
//-----------------------------------------------------------------------
SIMD_IWRAPPER_2_(or_si, or_si512); // return a | b (int)
SIMD_IWRAPPER_2_(xor_si, xor_si512); // return a ^ b (int)
-#if defined(AVX512F_STRICT)
-
-SIMD_WRAPPERI_2_(and_ps, and_epi32); // return a & b (float treated as int)
-SIMD_WRAPPERI_2_(andnot_ps, andnot_epi32); // return (~a) & b (float treated as int)
-SIMD_WRAPPERI_2_(or_ps, or_epi32); // return a | b (float treated as int)
-SIMD_WRAPPERI_2_(xor_ps, xor_epi32); // return a ^ b (float treated as int)
-
-#else
-
-SIMD_WRAPPER_2(and_ps); // return a & b (float treated as int)
-SIMD_WRAPPER_2(andnot_ps); // return (~a) & b (float treated as int)
-SIMD_WRAPPER_2(or_ps); // return a | b (float treated as int)
-SIMD_WRAPPER_2(xor_ps); // return a ^ b (float treated as int)
-
-#endif
+// SIMD_WRAPPER_2(and_ps); // return a & b (float treated as int)
+// SIMD_WRAPPER_2(andnot_ps); // return (~a) & b (float treated as int)
+// SIMD_WRAPPER_2(or_ps); // return a | b (float treated as int)
+// SIMD_WRAPPER_2(xor_ps); // return a ^ b (float treated as int)
//-----------------------------------------------------------------------
SIMD_IWRAPPER_2(sllv_epi32);
SIMD_IWRAPPER_1I(srai_epi32); // return a >> ImmT (int32)
SIMD_IWRAPPER_1I(srli_epi32); // return a >> ImmT (uint32)
+
+#if 0
+SIMD_IWRAPPER_1I_(srli_si, srli_si512); // return a >> (ImmT*8) (uint)
+
+template<int ImmT> // same as srli_si, but with Float cast to int
+static SIMDINLINE Float SIMDCALL srlisi_ps(Float a)
+{
+ return castsi_ps(srli_si<ImmT>(castps_si(a)));
+}
+#endif
+
SIMD_IWRAPPER_2(srlv_epi32);
//-----------------------------------------------------------------------
return _mm512_inserti64x4(a, b, imm);
}
-#if !defined(AVX512F_STRICT)
-SIMD_IWRAPPER_2(packs_epi16); // See documentation for _mm512_packs_epi16 and _mm512_packs_epi16
-SIMD_IWRAPPER_2(packs_epi32); // See documentation for _mm512_packs_epi32 and _mm512_packs_epi32
-SIMD_IWRAPPER_2(packus_epi16); // See documentation for _mm512_packus_epi16 and _mm512_packus_epi16
-SIMD_IWRAPPER_2(packus_epi32); // See documentation for _mm512_packus_epi32 and _mm512_packus_epi32
-#else
-SIMD_EMU_IWRAPPER_2(packs_epi16)
-SIMD_EMU_IWRAPPER_2(packs_epi32)
-SIMD_EMU_IWRAPPER_2(packus_epi16)
-SIMD_EMU_IWRAPPER_2(packus_epi32)
-#endif
+// SIMD_IWRAPPER_2(packs_epi16); // See documentation for _mm512_packs_epi16 and _mm512_packs_epi16
+// SIMD_IWRAPPER_2(packs_epi32); // See documentation for _mm512_packs_epi32 and _mm512_packs_epi32
+// SIMD_IWRAPPER_2(packus_epi16); // See documentation for _mm512_packus_epi16 and _mm512_packus_epi16
+// SIMD_IWRAPPER_2(packus_epi32); // See documentation for _mm512_packus_epi32 and _mm512_packus_epi32
static SIMDINLINE Integer SIMDCALL permute_epi32(Integer a, Integer swiz) // return a[swiz[i]] for each 32-bit lane i (float)
{
#undef SIMD_IWRAPPER_2
#undef SIMD_IWRAPPER_2_
#undef SIMD_IWRAPPER_2I
-#undef SIMD_EMU_IWRAPPER_2
+
--- /dev/null
+/****************************************************************************
+* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+****************************************************************************/
+#if !defined(__SIMD_LIB_AVX512_HPP__)
+#error Do not include this file directly, use "simdlib.hpp" instead.
+#endif
+
+//============================================================================
+// SIMD16 AVX512 (F) implementation for Core processors
+//
+//============================================================================
+
+#define SIMD_WRAPPER_1_(op, intrin) \
+ static SIMDINLINE Float SIMDCALL op(Float a) \
+ {\
+ return intrin(a);\
+ }
+
+#define SIMD_WRAPPER_1(op) \
+ SIMD_WRAPPER_1_(op, _mm512_##op)
+
+#define SIMD_WRAPPER_2_(op, intrin) \
+ static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
+ {\
+ return _mm512_##intrin(a, b);\
+ }
+#define SIMD_WRAPPER_2(op) SIMD_WRAPPER_2_(op, op)
+
+#define SIMD_WRAPPERI_2_(op, intrin) \
+ static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
+ {\
+ return _mm512_castsi512_ps(_mm512_##intrin(\
+ _mm512_castps_si512(a), _mm512_castps_si512(b)));\
+ }
+
+#define SIMD_DWRAPPER_2(op) \
+ static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
+ {\
+ return _mm512_##op(a, b);\
+ }
+
+#define SIMD_WRAPPER_2I_(op, intrin) \
+ template<int ImmT>\
+ static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
+ {\
+ return _mm512_##intrin(a, b, ImmT);\
+ }
+#define SIMD_WRAPPER_2I(op) SIMD_WRAPPER_2I_(op, op)
+
+#define SIMD_DWRAPPER_2I_(op, intrin) \
+ template<int ImmT>\
+ static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
+ {\
+ return _mm512_##intrin(a, b, ImmT);\
+ }
+#define SIMD_DWRAPPER_2I(op) SIMD_DWRAPPER_2I_(op, op)
+
+#define SIMD_WRAPPER_3(op) \
+ static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c) \
+ {\
+ return _mm512_##op(a, b, c);\
+ }
+
+#define SIMD_IWRAPPER_1(op) \
+ static SIMDINLINE Integer SIMDCALL op(Integer a) \
+ {\
+ return _mm512_##op(a);\
+ }
+#define SIMD_IWRAPPER_1_8(op) \
+ static SIMDINLINE Integer SIMDCALL op(SIMD256Impl::Integer a) \
+ {\
+ return _mm512_##op(a);\
+ }
+
+#define SIMD_IWRAPPER_1_4(op) \
+ static SIMDINLINE Integer SIMDCALL op(SIMD128Impl::Integer a) \
+ {\
+ return _mm512_##op(a);\
+ }
+
+#define SIMD_IWRAPPER_1I_(op, intrin) \
+ template<int ImmT> \
+ static SIMDINLINE Integer SIMDCALL op(Integer a) \
+ {\
+ return intrin(a, ImmT);\
+ }
+#define SIMD_IWRAPPER_1I(op) SIMD_IWRAPPER_1I_(op, _mm512_##op)
+
+#define SIMD_IWRAPPER_2_(op, intrin) \
+ static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
+ {\
+ return _mm512_##intrin(a, b);\
+ }
+#define SIMD_IWRAPPER_2(op) SIMD_IWRAPPER_2_(op, op)
+
+#define SIMD_IWRAPPER_2_CMP(op, cmp) \
+ static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
+ {\
+ return cmp(a, b);\
+ }
+
+#define SIMD_IFWRAPPER_2(op, intrin) \
+ static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
+ {\
+ return castps_si(_mm512_##intrin(castsi_ps(a), castsi_ps(b)) );\
+ }
+
+#define SIMD_IWRAPPER_2I_(op, intrin) \
+ template<int ImmT>\
+ static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
+ {\
+ return _mm512_##intrin(a, b, ImmT);\
+ }
+#define SIMD_IWRAPPER_2I(op) SIMD_IWRAPPER_2I_(op, op)
+
+private:
+ static SIMDINLINE Integer vmask(__mmask8 m)
+ {
+ return _mm512_maskz_set1_epi64(m, -1LL);
+ }
+ static SIMDINLINE Integer vmask(__mmask32 m)
+ {
+ return _mm512_maskz_set1_epi16(m, -1);
+ }
+ static SIMDINLINE Integer vmask(__mmask64 m)
+ {
+ return _mm512_maskz_set1_epi8(m, -1);
+ }
+
+public:
+SIMD_WRAPPER_2(and_ps); // return a & b (float treated as int)
+SIMD_WRAPPER_2(andnot_ps); // return (~a) & b (float treated as int)
+SIMD_WRAPPER_2(or_ps); // return a | b (float treated as int)
+SIMD_WRAPPER_2(xor_ps); // return a ^ b (float treated as int)
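// Note (not part of the change): SIMD_WRAPPER_2(and_ps) pastes directly to
// _mm512_and_ps. The float-typed logical intrinsics come from AVX512DQ,
// which Core (Skylake-X class) parts provide; the Knights overlay instead
// routes these ops through AVX512F integer logicals (see below).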
+
+SIMD_IWRAPPER_2(packs_epi16); // See documentation for _mm512_packs_epi16 and _mm512_packs_epi16
+SIMD_IWRAPPER_2(packs_epi32); // See documentation for _mm512_packs_epi32 and _mm512_packs_epi32
+SIMD_IWRAPPER_2(packus_epi16); // See documentation for _mm512_packus_epi16 and _mm512_packus_epi16
+SIMD_IWRAPPER_2(packus_epi32); // See documentation for _mm512_packus_epi32 and _mm512_packus_epi32
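// Note (not part of the change): these 512-bit pack intrinsics are AVX512BW
// (again Core-only relative to Knights) and, like their 256-bit AVX2
// counterparts, saturate and interleave within each 128-bit lane rather than
// across the whole register; callers needing linear element order must
// permute afterwards.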
+
+#undef SIMD_WRAPPER_1_
+#undef SIMD_WRAPPER_1
+#undef SIMD_WRAPPER_2
+#undef SIMD_WRAPPER_2_
+#undef SIMD_WRAPPERI_2_
+#undef SIMD_DWRAPPER_2
+#undef SIMD_DWRAPPER_2I
+#undef SIMD_WRAPPER_2I_
+#undef SIMD_WRAPPER_2I
+#undef SIMD_WRAPPER_3
+#undef SIMD_DWRAPPER_2I_
+#undef SIMD_IWRAPPER_1
+#undef SIMD_IWRAPPER_1_8
+#undef SIMD_IWRAPPER_1_4
+#undef SIMD_IWRAPPER_1I
+#undef SIMD_IWRAPPER_1I_
+#undef SIMD_IWRAPPER_2
+#undef SIMD_IWRAPPER_2_
+#undef SIMD_IWRAPPER_2_CMP
+#undef SIMD_IFWRAPPER_2
+#undef SIMD_IWRAPPER_2I
+#undef SIMD_IWRAPPER_2I_
+
--- /dev/null
+/****************************************************************************
+* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+****************************************************************************/
+#if !defined(__SIMD_LIB_AVX512_HPP__)
+#error Do not include this file directly, use "simdlib.hpp" instead.
+#endif
+
+//============================================================================
+// SIMD16 AVX512 (F) implementation for Knights Family Processors
+//
+//============================================================================
+
+static const int TARGET_SIMD_WIDTH = 16;
+using SIMD256T = SIMD256Impl::AVX2Impl;
+
+#define SIMD_WRAPPER_1_(op, intrin) \
+ static SIMDINLINE Float SIMDCALL op(Float a) \
+ {\
+ return intrin(a);\
+ }
+
+#define SIMD_WRAPPER_1(op) \
+ SIMD_WRAPPER_1_(op, _mm512_##op)
+
+#define SIMD_WRAPPER_2_(op, intrin) \
+ static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
+ {\
+ return _mm512_##intrin(a, b);\
+ }
+#define SIMD_WRAPPER_2(op) SIMD_WRAPPER_2_(op, op)
+
+#define SIMD_WRAPPERI_2_(op, intrin) \
+ static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
+ {\
+ return _mm512_castsi512_ps(_mm512_##intrin(\
+ _mm512_castps_si512(a), _mm512_castps_si512(b)));\
+ }
+
+#define SIMD_DWRAPPER_2(op) \
+ static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
+ {\
+ return _mm512_##op(a, b);\
+ }
+
+#define SIMD_WRAPPER_2I_(op, intrin) \
+ template<int ImmT>\
+ static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
+ {\
+ return _mm512_##intrin(a, b, ImmT);\
+ }
+#define SIMD_WRAPPER_2I(op) SIMD_WRAPPER_2I_(op, op)
+
+#define SIMD_DWRAPPER_2I_(op, intrin) \
+ template<int ImmT>\
+ static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
+ {\
+ return _mm512_##intrin(a, b, ImmT);\
+ }
+#define SIMD_DWRAPPER_2I(op) SIMD_DWRAPPER_2I_(op, op)
+
+#define SIMD_WRAPPER_3(op) \
+ static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c) \
+ {\
+ return _mm512_##op(a, b, c);\
+ }
+
+#define SIMD_IWRAPPER_1(op) \
+ static SIMDINLINE Integer SIMDCALL op(Integer a) \
+ {\
+ return _mm512_##op(a);\
+ }
+#define SIMD_IWRAPPER_1_8(op) \
+ static SIMDINLINE Integer SIMDCALL op(SIMD256Impl::Integer a) \
+ {\
+ return _mm512_##op(a);\
+ }
+
+#define SIMD_IWRAPPER_1_4(op) \
+ static SIMDINLINE Integer SIMDCALL op(SIMD128Impl::Integer a) \
+ {\
+ return _mm512_##op(a);\
+ }
+
+#define SIMD_IWRAPPER_1I_(op, intrin) \
+ template<int ImmT> \
+ static SIMDINLINE Integer SIMDCALL op(Integer a) \
+ {\
+ return intrin(a, ImmT);\
+ }
+#define SIMD_IWRAPPER_1I(op) SIMD_IWRAPPER_1I_(op, _mm512_##op)
+
+#define SIMD_IWRAPPER_2_(op, intrin) \
+ static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
+ {\
+ return _mm512_##intrin(a, b);\
+ }
+#define SIMD_IWRAPPER_2(op) SIMD_IWRAPPER_2_(op, op)
+
+#define SIMD_IWRAPPER_2_CMP(op, cmp) \
+ static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
+ {\
+ return cmp(a, b);\
+ }
+
+#define SIMD_IFWRAPPER_2(op, intrin) \
+ static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
+ {\
+ return castps_si(_mm512_##intrin(castsi_ps(a), castsi_ps(b)) );\
+ }
+
+#define SIMD_IWRAPPER_2I_(op, intrin) \
+ template<int ImmT>\
+ static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
+ {\
+ return _mm512_##intrin(a, b, ImmT);\
+ }
+#define SIMD_IWRAPPER_2I(op) SIMD_IWRAPPER_2I_(op, op)
+
+private:
+ static SIMDINLINE Integer vmask(__mmask8 m)
+ {
+ return _mm512_maskz_set1_epi64(m, -1LL);
+ }
+ static SIMDINLINE Integer vmask(__mmask16 m)
+ {
+ return _mm512_maskz_set1_epi32(m, -1);
+ }
+ static SIMDINLINE Integer vmask(__mmask32 m)
+ {
+ return _mm512_maskz_set1_epi16(m, -1);
+ }
+ static SIMDINLINE Integer vmask(__mmask64 m)
+ {
+ return _mm512_maskz_set1_epi8(m, -1);
+ }
+
+public:
+SIMD_WRAPPERI_2_(and_ps, and_epi32); // return a & b (float treated as int)
+SIMD_WRAPPERI_2_(andnot_ps, andnot_epi32); // return (~a) & b (float treated as int)
+SIMD_WRAPPERI_2_(or_ps, or_epi32); // return a | b (float treated as int)
+SIMD_WRAPPERI_2_(xor_ps, xor_epi32); // return a ^ b (float treated as int)
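// For illustration (not part of the change): expansion of
// SIMD_WRAPPERI_2_(and_ps, and_epi32). _mm512_and_ps requires AVX512DQ,
// which Knights parts lack, so the operands round-trip through the integer
// domain via AVX512F's _mm512_and_epi32:
//
//     static SIMDINLINE Float SIMDCALL and_ps(Float a, Float b)
//     {
//         return _mm512_castsi512_ps(_mm512_and_epi32(
//             _mm512_castps_si512(a), _mm512_castps_si512(b)));
//     }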
+
+#undef SIMD_WRAPPER_1_
+#undef SIMD_WRAPPER_1
+#undef SIMD_WRAPPER_2
+#undef SIMD_WRAPPER_2_
+#undef SIMD_WRAPPERI_2_
+#undef SIMD_DWRAPPER_2
+#undef SIMD_DWRAPPER_2I
+#undef SIMD_WRAPPER_2I_
+#undef SIMD_WRAPPER_2I
+#undef SIMD_WRAPPER_3
+#undef SIMD_DWRAPPER_2I_
+#undef SIMD_IWRAPPER_1
+#undef SIMD_IWRAPPER_1_8
+#undef SIMD_IWRAPPER_1_4
+#undef SIMD_IWRAPPER_1I
+#undef SIMD_IWRAPPER_1I_
+#undef SIMD_IWRAPPER_2
+#undef SIMD_IWRAPPER_2_
+#undef SIMD_IWRAPPER_2_CMP
+#undef SIMD_IFWRAPPER_2
+#undef SIMD_IWRAPPER_2I
+#undef SIMD_IWRAPPER_2I_
+
--- /dev/null
+/****************************************************************************
+* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+****************************************************************************/
+#if !defined(__SIMD_LIB_AVX512_HPP__)
+#error Do not include this file directly, use "simdlib.hpp" instead.
+#endif
+
+// Implement mask-enabled SIMD functions
--- /dev/null
+/****************************************************************************
+* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+****************************************************************************/
+#if !defined(__SIMD_LIB_AVX512_HPP__)
+#error Do not include this file directly, use "simdlib.hpp" instead.
+#endif
+
+// Implement mask-enabled SIMD functions
namespace SIMD512Impl
{
-#if !defined(__AVX512F__)
+#if !(defined(__AVX512F__) || defined(_MM_K0_REG))
// Define AVX512 types if not included via immintrin.h.
// All data members of these types are ONLY to viewed
// in a debugger. Do NOT access them via code!
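// Note (not part of the change): the added _MM_K0_REG test presumably covers
// toolchains (e.g. MSVC) whose headers define the AVX512 register types
// without defining __AVX512F__; when either symbol is present the real types
// exist and these debugger-only placeholders must not be declared.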