From 0e49963212fb85e4fb83c3d4003907e232f151bd Mon Sep 17 00:00:00 2001 From: Alok Hota Date: Tue, 19 Jun 2018 17:22:32 -0500 Subject: [PATCH] swr/rast: AVX512 support compiled in by default - Emulation of AVX512 built into SIMDLIB - Remove associated macros - Remove knobs controlling AVX512 and let emulation handle it - Refactor variable names for SIMD16 Reviewed-by: Bruce Cherniak --- src/gallium/drivers/swr/rasterizer/common/os.h | 2 - .../drivers/swr/rasterizer/common/simd16intrin.h | 4 - .../drivers/swr/rasterizer/common/simdintrin.h | 2 - src/gallium/drivers/swr/rasterizer/core/context.h | 2 - .../swr/rasterizer/core/format_conversion.h | 287 +++++---------- .../drivers/swr/rasterizer/core/format_types.h | 165 ++++----- .../drivers/swr/rasterizer/core/format_utils.h | 392 +++++++++------------ src/gallium/drivers/swr/rasterizer/core/knobs.h | 17 +- src/gallium/drivers/swr/rasterizer/core/state.h | 14 +- src/gallium/drivers/swr/rasterizer/core/utils.h | 2 - .../drivers/swr/rasterizer/memory/StoreTile.h | 4 +- .../swr/rasterizer/memory/TilingFunctions.h | 2 - 12 files changed, 333 insertions(+), 560 deletions(-) diff --git a/src/gallium/drivers/swr/rasterizer/common/os.h b/src/gallium/drivers/swr/rasterizer/common/os.h index b00beeb..e812da3 100644 --- a/src/gallium/drivers/swr/rasterizer/common/os.h +++ b/src/gallium/drivers/swr/rasterizer/common/os.h @@ -265,9 +265,7 @@ typedef MEGABYTE GIGABYTE[1024]; #define OSALIGNLINE(RWORD) OSALIGN(RWORD, 64) #define OSALIGNSIMD(RWORD) OSALIGN(RWORD, KNOB_SIMD_BYTES) -#if ENABLE_AVX512_SIMD16 #define OSALIGNSIMD16(RWORD) OSALIGN(RWORD, KNOB_SIMD16_BYTES) -#endif #include "common/swr_assert.h" diff --git a/src/gallium/drivers/swr/rasterizer/common/simd16intrin.h b/src/gallium/drivers/swr/rasterizer/common/simd16intrin.h index b08fb2e..5964edf 100644 --- a/src/gallium/drivers/swr/rasterizer/common/simd16intrin.h +++ b/src/gallium/drivers/swr/rasterizer/common/simd16intrin.h @@ -24,8 +24,6 @@ #ifndef __SWR_SIMD16INTRIN_H__ #define __SWR_SIMD16INTRIN_H__ -#if ENABLE_AVX512_SIMD16 - #if KNOB_SIMD16_WIDTH == 16 typedef SIMD512 SIMD16; #else @@ -167,6 +165,4 @@ typedef SIMD512 SIMD16; #define _simd16_mask2int(mask) int(mask) #define _simd16_vmask_ps SIMD16::vmask_ps -#endif // ENABLE_AVX512_SIMD16 - #endif //__SWR_SIMD16INTRIN_H_ diff --git a/src/gallium/drivers/swr/rasterizer/common/simdintrin.h b/src/gallium/drivers/swr/rasterizer/common/simdintrin.h index 8ffda3f..5eae34e 100644 --- a/src/gallium/drivers/swr/rasterizer/common/simdintrin.h +++ b/src/gallium/drivers/swr/rasterizer/common/simdintrin.h @@ -341,8 +341,6 @@ static SIMDINLINE simdscalar _simd_abs_ps(simdscalar const& a) return _simd_castsi_ps(_simd_and_si(ai, _simd_set1_epi32(0x7fffffff))); } -#if ENABLE_AVX512_SIMD16 #include "simd16intrin.h" -#endif // ENABLE_AVX512_SIMD16 #endif //__SWR_SIMDINTRIN_H__ diff --git a/src/gallium/drivers/swr/rasterizer/core/context.h b/src/gallium/drivers/swr/rasterizer/core/context.h index efbddb0..a818255 100644 --- a/src/gallium/drivers/swr/rasterizer/core/context.h +++ b/src/gallium/drivers/swr/rasterizer/core/context.h @@ -230,7 +230,6 @@ typedef void (*PFN_PROCESS_PRIMS)(DRAW_CONTEXT* pDC, simdscalari const& viewportIdx, simdscalari const& rtIdx); -#if ENABLE_AVX512_SIMD16 // function signature for pipeline stages that execute after primitive assembly typedef void(SIMDCALL* PFN_PROCESS_PRIMS_SIMD16)(DRAW_CONTEXT* pDC, PA_STATE& pa, @@ -241,7 +240,6 @@ typedef void(SIMDCALL* PFN_PROCESS_PRIMS_SIMD16)(DRAW_CONTEXT* pDC, simd16scalari const& viewportIdx, simd16scalari const& rtIdx); -#endif OSALIGNLINE(struct) API_STATE { // Vertex Buffers diff --git a/src/gallium/drivers/swr/rasterizer/core/format_conversion.h b/src/gallium/drivers/swr/rasterizer/core/format_conversion.h index 90bf118..247ba0b 100644 --- a/src/gallium/drivers/swr/rasterizer/core/format_conversion.h +++ b/src/gallium/drivers/swr/rasterizer/core/format_conversion.h @@ -33,15 +33,17 @@ /// SOA RGBA32_FLOAT format. /// @param pSrc - source data in SOA form /// @param dst - output data in SOA form -template -INLINE void LoadSOA(const uint8_t* pSrc, simdvector& dst) +template +INLINE void SIMDCALL LoadSOA(const uint8_t* pSrc, Vec4& dst) { // fast path for float32 if ((FormatTraits::GetType(0) == SWR_TYPE_FLOAT) && (FormatTraits::GetBPC(0) == 32)) { - auto lambda = [&](int comp) { - simdscalar vComp = _simd_load_ps((const float*)(pSrc + comp * sizeof(simdscalar))); + auto lambda = [&](int comp) + { + Float vComp = + SIMD_T::load_ps(reinterpret_cast(pSrc + comp * sizeof(Float))); dst.v[FormatTraits::swizzle(comp)] = vComp; }; @@ -50,9 +52,11 @@ INLINE void LoadSOA(const uint8_t* pSrc, simdvector& dst) return; } - auto lambda = [&](int comp) { + auto lambda = [&](int comp) + { // load SIMD components - simdscalar vComp = FormatTraits::loadSOA(comp, pSrc); + Float vComp; + FormatTraits::loadSOA(comp, pSrc, vComp); // unpack vComp = FormatTraits::unpack(comp, vComp); @@ -60,250 +64,119 @@ INLINE void LoadSOA(const uint8_t* pSrc, simdvector& dst) // convert if (FormatTraits::isNormalized(comp)) { - vComp = _simd_cvtepi32_ps(_simd_castps_si(vComp)); - vComp = _simd_mul_ps(vComp, _simd_set1_ps(FormatTraits::toFloat(comp))); + vComp = SIMD_T::cvtepi32_ps(SIMD_T::castps_si(vComp)); + vComp = SIMD_T::mul_ps(vComp, SIMD_T::set1_ps(FormatTraits::toFloat(comp))); } dst.v[FormatTraits::swizzle(comp)] = vComp; - pSrc += (FormatTraits::GetBPC(comp) * KNOB_SIMD_WIDTH) / 8; + // is there a better way to get this from the SIMD traits? + const uint32_t SIMD_WIDTH = sizeof(typename SIMD_T::Float) / sizeof(float); + + pSrc += (FormatTraits::GetBPC(comp) * SIMD_WIDTH) / 8; }; UnrollerL<0, FormatTraits::numComps, 1>::step(lambda); } +template +INLINE void SIMDCALL LoadSOA(const uint8_t* pSrc, simdvector& dst) +{ + LoadSOA(pSrc, dst); +} + +template +INLINE void SIMDCALL LoadSOA(const uint8_t* pSrc, simd16vector& dst) +{ + LoadSOA(pSrc, dst); +} + ////////////////////////////////////////////////////////////////////////// /// @brief Clamps the given component based on the requirements on the /// Format template arg /// @param vComp - SIMD vector of floats /// @param Component - component -template -INLINE simdscalar Clamp(simdscalar const& vC, uint32_t Component) +template +INLINE Float SIMDCALL Clamp(Float const& v, uint32_t Component) { - simdscalar vComp = vC; + Float vComp = v; if (FormatTraits::isNormalized(Component)) { if (FormatTraits::GetType(Component) == SWR_TYPE_UNORM) { - vComp = _simd_max_ps(vComp, _simd_setzero_ps()); + vComp = SIMD_T::max_ps(vComp, SIMD_T::setzero_ps()); } if (FormatTraits::GetType(Component) == SWR_TYPE_SNORM) { - vComp = _simd_max_ps(vComp, _simd_set1_ps(-1.0f)); + vComp = SIMD_T::max_ps(vComp, SIMD_T::set1_ps(-1.0f)); } - vComp = _simd_min_ps(vComp, _simd_set1_ps(1.0f)); + vComp = SIMD_T::min_ps(vComp, SIMD_T::set1_ps(1.0f)); } else if (FormatTraits::GetBPC(Component) < 32) { if (FormatTraits::GetType(Component) == SWR_TYPE_UINT) { - int iMax = (1 << FormatTraits::GetBPC(Component)) - 1; - int iMin = 0; - simdscalari vCompi = _simd_castps_si(vComp); - vCompi = _simd_max_epu32(vCompi, _simd_set1_epi32(iMin)); - vCompi = _simd_min_epu32(vCompi, _simd_set1_epi32(iMax)); - vComp = _simd_castsi_ps(vCompi); + int iMax = (1 << FormatTraits::GetBPC(Component)) - 1; + int iMin = 0; + Integer vCompi = SIMD_T::castps_si(vComp); + vCompi = SIMD_T::max_epu32(vCompi, SIMD_T::set1_epi32(iMin)); + vCompi = SIMD_T::min_epu32(vCompi, SIMD_T::set1_epi32(iMax)); + vComp = SIMD_T::castsi_ps(vCompi); } else if (FormatTraits::GetType(Component) == SWR_TYPE_SINT) { - int iMax = (1 << (FormatTraits::GetBPC(Component) - 1)) - 1; - int iMin = -1 - iMax; - simdscalari vCompi = _simd_castps_si(vComp); - vCompi = _simd_max_epi32(vCompi, _simd_set1_epi32(iMin)); - vCompi = _simd_min_epi32(vCompi, _simd_set1_epi32(iMax)); - vComp = _simd_castsi_ps(vCompi); + int iMax = (1 << (FormatTraits::GetBPC(Component) - 1)) - 1; + int iMin = -1 - iMax; + Integer vCompi = SIMD_T::castps_si(vComp); + vCompi = SIMD_T::max_epi32(vCompi, SIMD_T::set1_epi32(iMin)); + vCompi = SIMD_T::min_epi32(vCompi, SIMD_T::set1_epi32(iMax)); + vComp = SIMD_T::castsi_ps(vCompi); } } return vComp; } -////////////////////////////////////////////////////////////////////////// -/// @brief Normalize the given component based on the requirements on the -/// Format template arg -/// @param vComp - SIMD vector of floats -/// @param Component - component template -INLINE simdscalar Normalize(simdscalar const& vC, uint32_t Component) +INLINE simdscalar SIMDCALL Clamp(simdscalar const& v, uint32_t Component) { - simdscalar vComp = vC; - if (FormatTraits::isNormalized(Component)) - { - vComp = _simd_mul_ps(vComp, _simd_set1_ps(FormatTraits::fromFloat(Component))); - vComp = _simd_castsi_ps(_simd_cvtps_epi32(vComp)); - } - return vComp; -} - -////////////////////////////////////////////////////////////////////////// -/// @brief Convert and store simdvector of pixels in SOA -/// RGBA32_FLOAT to SOA format -/// @param src - source data in SOA form -/// @param dst - output data in SOA form -template -INLINE void StoreSOA(const simdvector& src, uint8_t* pDst) -{ - // fast path for float32 - if ((FormatTraits::GetType(0) == SWR_TYPE_FLOAT) && - (FormatTraits::GetBPC(0) == 32)) - { - for (uint32_t comp = 0; comp < FormatTraits::numComps; ++comp) - { - simdscalar vComp = src.v[FormatTraits::swizzle(comp)]; - - // Gamma-correct - if (FormatTraits::isSRGB) - { - if (comp < 3) // Input format is always RGBA32_FLOAT. - { - vComp = FormatTraits::convertSrgb(comp, vComp); - } - } - - _simd_store_ps((float*)(pDst + comp * sizeof(simdscalar)), vComp); - } - return; - } - - auto lambda = [&](int comp) { - simdscalar vComp = src.v[FormatTraits::swizzle(comp)]; - - // Gamma-correct - if (FormatTraits::isSRGB) - { - if (comp < 3) // Input format is always RGBA32_FLOAT. - { - vComp = FormatTraits::convertSrgb(comp, vComp); - } - } - - // clamp - vComp = Clamp(vComp, comp); - - // normalize - vComp = Normalize(vComp, comp); - - // pack - vComp = FormatTraits::pack(comp, vComp); - - // store - FormatTraits::storeSOA(comp, pDst, vComp); - - pDst += (FormatTraits::GetBPC(comp) * KNOB_SIMD_WIDTH) / 8; - }; - - UnrollerL<0, FormatTraits::numComps, 1>::step(lambda); + return Clamp(v, Component); } -#if ENABLE_AVX512_SIMD16 -////////////////////////////////////////////////////////////////////////// -/// @brief Load SIMD packed pixels in SOA format and converts to -/// SOA RGBA32_FLOAT format. -/// @param pSrc - source data in SOA form -/// @param dst - output data in SOA form -template -INLINE void SIMDCALL LoadSOA(const uint8_t* pSrc, simd16vector& dst) +template +INLINE simd16scalar SIMDCALL Clamp(simd16scalar const& v, uint32_t Component) { - // fast path for float32 - if ((FormatTraits::GetType(0) == SWR_TYPE_FLOAT) && - (FormatTraits::GetBPC(0) == 32)) - { - auto lambda = [&](int comp) { - simd16scalar vComp = - _simd16_load_ps(reinterpret_cast(pSrc + comp * sizeof(simd16scalar))); - - dst.v[FormatTraits::swizzle(comp)] = vComp; - }; - - UnrollerL<0, FormatTraits::numComps, 1>::step(lambda); - return; - } - - auto lambda = [&](int comp) { - // load SIMD components - simd16scalar vComp = FormatTraits::loadSOA_16(comp, pSrc); - - // unpack - vComp = FormatTraits::unpack(comp, vComp); - - // convert - if (FormatTraits::isNormalized(comp)) - { - vComp = _simd16_cvtepi32_ps(_simd16_castps_si(vComp)); - vComp = _simd16_mul_ps(vComp, _simd16_set1_ps(FormatTraits::toFloat(comp))); - } - - dst.v[FormatTraits::swizzle(comp)] = vComp; - - pSrc += (FormatTraits::GetBPC(comp) * KNOB_SIMD16_WIDTH) / 8; - }; - - UnrollerL<0, FormatTraits::numComps, 1>::step(lambda); + return Clamp(v, Component); } ////////////////////////////////////////////////////////////////////////// -/// @brief Clamps the given component based on the requirements on the +/// @brief Normalize the given component based on the requirements on the /// Format template arg /// @param vComp - SIMD vector of floats /// @param Component - component -template -INLINE simd16scalar SIMDCALL Clamp(simd16scalar const& v, uint32_t Component) +template +INLINE Float SIMDCALL Normalize(Float const& vComp, uint32_t Component) { - simd16scalar vComp = v; + Float r = vComp; if (FormatTraits::isNormalized(Component)) { - if (FormatTraits::GetType(Component) == SWR_TYPE_UNORM) - { - vComp = _simd16_max_ps(vComp, _simd16_setzero_ps()); - } - - if (FormatTraits::GetType(Component) == SWR_TYPE_SNORM) - { - vComp = _simd16_max_ps(vComp, _simd16_set1_ps(-1.0f)); - } - vComp = _simd16_min_ps(vComp, _simd16_set1_ps(1.0f)); - } - else if (FormatTraits::GetBPC(Component) < 32) - { - if (FormatTraits::GetType(Component) == SWR_TYPE_UINT) - { - int iMax = (1 << FormatTraits::GetBPC(Component)) - 1; - int iMin = 0; - simd16scalari vCompi = _simd16_castps_si(vComp); - vCompi = _simd16_max_epu32(vCompi, _simd16_set1_epi32(iMin)); - vCompi = _simd16_min_epu32(vCompi, _simd16_set1_epi32(iMax)); - vComp = _simd16_castsi_ps(vCompi); - } - else if (FormatTraits::GetType(Component) == SWR_TYPE_SINT) - { - int iMax = (1 << (FormatTraits::GetBPC(Component) - 1)) - 1; - int iMin = -1 - iMax; - simd16scalari vCompi = _simd16_castps_si(vComp); - vCompi = _simd16_max_epi32(vCompi, _simd16_set1_epi32(iMin)); - vCompi = _simd16_min_epi32(vCompi, _simd16_set1_epi32(iMax)); - vComp = _simd16_castsi_ps(vCompi); - } + r = SIMD_T::mul_ps(r, SIMD_T::set1_ps(FormatTraits::fromFloat(Component))); + r = SIMD_T::castsi_ps(SIMD_T::cvtps_epi32(r)); } + return r; +} - return vComp; +template +INLINE simdscalar SIMDCALL Normalize(simdscalar const& vComp, uint32_t Component) +{ + return Normalize(vComp, Component); } -////////////////////////////////////////////////////////////////////////// -/// @brief Normalize the given component based on the requirements on the -/// Format template arg -/// @param vComp - SIMD vector of floats -/// @param Component - component template INLINE simd16scalar SIMDCALL Normalize(simd16scalar const& vComp, uint32_t Component) { - simd16scalar r = vComp; - if (FormatTraits::isNormalized(Component)) - { - r = _simd16_mul_ps(r, _simd16_set1_ps(FormatTraits::fromFloat(Component))); - r = _simd16_castsi_ps(_simd16_cvtps_epi32(r)); - } - return r; + return Normalize(vComp, Component); } ////////////////////////////////////////////////////////////////////////// @@ -311,8 +184,8 @@ INLINE simd16scalar SIMDCALL Normalize(simd16scalar const& vComp, uint32_t Compo /// RGBA32_FLOAT to SOA format /// @param src - source data in SOA form /// @param dst - output data in SOA form -template -INLINE void SIMDCALL StoreSOA(const simd16vector& src, uint8_t* pDst) +template +INLINE void SIMDCALL StoreSOA(const Vec4& src, uint8_t* pDst) { // fast path for float32 if ((FormatTraits::GetType(0) == SWR_TYPE_FLOAT) && @@ -320,7 +193,7 @@ INLINE void SIMDCALL StoreSOA(const simd16vector& src, uint8_t* pDst) { for (uint32_t comp = 0; comp < FormatTraits::numComps; ++comp) { - simd16scalar vComp = src.v[FormatTraits::swizzle(comp)]; + Float vComp = src.v[FormatTraits::swizzle(comp)]; // Gamma-correct if (FormatTraits::isSRGB) @@ -331,13 +204,13 @@ INLINE void SIMDCALL StoreSOA(const simd16vector& src, uint8_t* pDst) } } - _simd16_store_ps(reinterpret_cast(pDst + comp * sizeof(simd16scalar)), vComp); + SIMD_T::store_ps(reinterpret_cast(pDst + comp * sizeof(simd16scalar)), vComp); } return; } auto lambda = [&](int comp) { - simd16scalar vComp = src.v[FormatTraits::swizzle(comp)]; + Float vComp = src.v[FormatTraits::swizzle(comp)]; // Gamma-correct if (FormatTraits::isSRGB) @@ -349,10 +222,10 @@ INLINE void SIMDCALL StoreSOA(const simd16vector& src, uint8_t* pDst) } // clamp - vComp = Clamp(vComp, comp); + vComp = Clamp(vComp, comp); // normalize - vComp = Normalize(vComp, comp); + vComp = Normalize(vComp, comp); // pack vComp = FormatTraits::pack(comp, vComp); @@ -360,10 +233,24 @@ INLINE void SIMDCALL StoreSOA(const simd16vector& src, uint8_t* pDst) // store FormatTraits::storeSOA(comp, pDst, vComp); - pDst += (FormatTraits::GetBPC(comp) * KNOB_SIMD16_WIDTH) / 8; + // is there a better way to get this from the SIMD traits? + const uint32_t SIMD_WIDTH = sizeof(typename SIMD_T::Float) / sizeof(float); + + pDst += (FormatTraits::GetBPC(comp) * SIMD_WIDTH) / 8; }; UnrollerL<0, FormatTraits::numComps, 1>::step(lambda); } -#endif +template +INLINE void SIMDCALL StoreSOA(const simdvector& src, uint8_t* pDst) +{ + StoreSOA(src, pDst); +} + +template +INLINE void SIMDCALL StoreSOA(const simd16vector& src, uint8_t* pDst) +{ + StoreSOA(src, pDst); +} + diff --git a/src/gallium/drivers/swr/rasterizer/core/format_types.h b/src/gallium/drivers/swr/rasterizer/core/format_types.h index 518da82..7d7dd84 100644 --- a/src/gallium/drivers/swr/rasterizer/core/format_types.h +++ b/src/gallium/drivers/swr/rasterizer/core/format_types.h @@ -36,17 +36,17 @@ template struct PackTraits { - static const uint32_t MyNumBits = NumBits; + static const uint32_t MyNumBits = NumBits; + static simdscalar loadSOA(const uint8_t* pSrc) = delete; static void storeSOA(uint8_t* pDst, simdscalar const& src) = delete; static simdscalar unpack(simdscalar& in) = delete; static simdscalar pack(simdscalar& in) = delete; -#if ENABLE_AVX512_SIMD16 - static simd16scalar loadSOA_16(const uint8_t* pSrc) = delete; + + static simd16scalar loadSOA_16(const uint8_t* pSrc) = delete; static void SIMDCALL storeSOA(uint8_t* pDst, simd16scalar const& src) = delete; static simd16scalar unpack(simd16scalar& in) = delete; static simd16scalar pack(simd16scalar& in) = delete; -#endif }; ////////////////////////////////////////////////////////////////////////// @@ -61,12 +61,11 @@ struct PackTraits<0, false> static void storeSOA(uint8_t* pDst, simdscalar const& src) { return; } static simdscalar unpack(simdscalar& in) { return _simd_setzero_ps(); } static simdscalar pack(simdscalar& in) { return _simd_setzero_ps(); } -#if ENABLE_AVX512_SIMD16 - static simd16scalar loadSOA_16(const uint8_t* pSrc) { return _simd16_setzero_ps(); } + + static simd16scalar loadSOA_16(const uint8_t* pSrc) { return _simd16_setzero_ps(); } static void SIMDCALL storeSOA(uint8_t* pDst, simd16scalar const& src) { return; } static simd16scalar unpack(simd16scalar& in) { return _simd16_setzero_ps(); } static simd16scalar pack(simd16scalar& in) { return _simd16_setzero_ps(); } -#endif }; ////////////////////////////////////////////////////////////////////////// @@ -131,7 +130,6 @@ struct PackTraits<8, false> #error Unsupported vector width #endif } -#if ENABLE_AVX512_SIMD16 static simd16scalar loadSOA_16(const uint8_t* pSrc) { @@ -163,40 +161,31 @@ struct PackTraits<8, false> static simd16scalar pack(simd16scalar& in) { + // clang-format off + simd16scalari result = _simd16_setzero_si(); - simdscalari inlo = - _simd_castps_si(_simd16_extract_ps(in, 0)); // r0 r1 r2 r3 r4 r5 r6 r7 (32b) - simdscalari inhi = _simd_castps_si(_simd16_extract_ps(in, 1)); // r8 r9 rA rB rC rD rE rF + simdscalari inlo = _simd_castps_si(_simd16_extract_ps(in, 0)); // r0 r1 r2 r3 r4 r5 r6 r7 (32b) + simdscalari inhi = _simd_castps_si(_simd16_extract_ps(in, 1)); // r8 r9 rA rB rC rD rE rF - simdscalari permlo = - _simd_permute2f128_si(inlo, inhi, 0x20); // r0 r1 r2 r3 r8 r9 rA rB (32b) - simdscalari permhi = - _simd_permute2f128_si(inlo, inhi, 0x31); // r4 r5 r6 r7 rC rD rE rF (32b) + simdscalari permlo = _simd_permute2f128_si(inlo, inhi, 0x20); // r0 r1 r2 r3 r8 r9 rA rB (32b) + simdscalari permhi = _simd_permute2f128_si(inlo, inhi, 0x31); // r4 r5 r6 r7 rC rD rE rF (32b) - simdscalari pack = _simd_packus_epi32( - permlo, permhi); // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF (16b) + simdscalari pack = _simd_packus_epi32(permlo, permhi); // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF (16b) const simdscalari zero = _simd_setzero_si(); - permlo = _simd_permute2f128_si( - pack, - zero, - 0x20); // (2, 0) // r0 r1 r2 r3 r4 r5 r6 r7 00 00 00 00 00 00 00 00 (16b) - permhi = _simd_permute2f128_si( - pack, - zero, - 0x31); // (3, 1) // r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 (16b) + permlo = _simd_permute2f128_si(pack, zero, 0x20); // (2, 0) // r0 r1 r2 r3 r4 r5 r6 r7 00 00 00 00 00 00 00 00 (16b) + permhi = _simd_permute2f128_si(pack, zero, 0x31); // (3, 1) // r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 (16b) - pack = _simd_packus_epi16(permlo, - permhi); // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF 00 00 - // 00 00 00 00 00 00 00 00 00 00 00 00 00 00 (8b) + pack = _simd_packus_epi16(permlo, permhi); // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 (8b) result = _simd16_insert_si(result, pack, 0); return _simd16_castsi_ps(result); + + // clang-format on } -#endif }; ////////////////////////////////////////////////////////////////////////// @@ -262,7 +251,6 @@ struct PackTraits<8, true> #error Unsupported vector width #endif } -#if ENABLE_AVX512_SIMD16 static simd16scalar loadSOA_16(const uint8_t* pSrc) { @@ -294,40 +282,31 @@ struct PackTraits<8, true> static simd16scalar pack(simd16scalar& in) { + // clang-format off + simd16scalari result = _simd16_setzero_si(); - simdscalari inlo = - _simd_castps_si(_simd16_extract_ps(in, 0)); // r0 r1 r2 r3 r4 r5 r6 r7 (32b) - simdscalari inhi = _simd_castps_si(_simd16_extract_ps(in, 1)); // r8 r9 rA rB rC rD rE rF + simdscalari inlo = _simd_castps_si(_simd16_extract_ps(in, 0)); // r0 r1 r2 r3 r4 r5 r6 r7 (32b) + simdscalari inhi = _simd_castps_si(_simd16_extract_ps(in, 1)); // r8 r9 rA rB rC rD rE rF - simdscalari permlo = - _simd_permute2f128_si(inlo, inhi, 0x20); // r0 r1 r2 r3 r8 r9 rA rB (32b) - simdscalari permhi = - _simd_permute2f128_si(inlo, inhi, 0x31); // r4 r5 r6 r7 rC rD rE rF (32b) + simdscalari permlo = _simd_permute2f128_si(inlo, inhi, 0x20); // r0 r1 r2 r3 r8 r9 rA rB (32b) + simdscalari permhi = _simd_permute2f128_si(inlo, inhi, 0x31); // r4 r5 r6 r7 rC rD rE rF (32b) - simdscalari pack = _simd_packs_epi32( - permlo, permhi); // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF (16b) + simdscalari pack = _simd_packs_epi32(permlo, permhi); // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF (16b) const simdscalari zero = _simd_setzero_si(); - permlo = _simd_permute2f128_si( - pack, - zero, - 0x20); // (2, 0) // r0 r1 r2 r3 r4 r5 r6 r7 00 00 00 00 00 00 00 00 (16b) - permhi = _simd_permute2f128_si( - pack, - zero, - 0x31); // (3, 1) // r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 (16b) + permlo = _simd_permute2f128_si(pack, zero, 0x20); // (2, 0) // r0 r1 r2 r3 r4 r5 r6 r7 00 00 00 00 00 00 00 00 (16b) + permhi = _simd_permute2f128_si(pack, zero, 0x31); // (3, 1) // r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 (16b) - pack = - _simd_packs_epi16(permlo, permhi); // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF 00 - // 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 (8b) + pack = _simd_packs_epi16(permlo, permhi); // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 (8b) result = _simd16_insert_si(result, pack, 0); return _simd16_castsi_ps(result); + + // clang-format on } -#endif }; ////////////////////////////////////////////////////////////////////////// @@ -391,7 +370,6 @@ struct PackTraits<16, false> #error Unsupported vector width #endif } -#if ENABLE_AVX512_SIMD16 static simd16scalar loadSOA_16(const uint8_t* pSrc) { @@ -418,24 +396,19 @@ struct PackTraits<16, false> static simd16scalar pack(simd16scalar& in) { + // clang-format off + const simd16scalari zero = _simd16_setzero_si(); - simd16scalari permlo = _simd16_permute2f128_si( - _simd16_castps_si(in), - zero, - 0x08); // (0, 0, 2, 0) // r0 r1 r2 r3 r8 r9 rA rB 00 00 00 00 00 00 00 00 (32b) - simd16scalari permhi = _simd16_permute2f128_si( - _simd16_castps_si(in), - zero, - 0x0D); // (0, 0, 3, 1) // r4 r5 r6 r7 rC rD rE rF 00 00 00 00 00 00 00 00 + simd16scalari permlo = _simd16_permute2f128_si(_simd16_castps_si(in), zero, 0x08); // (0, 0, 2, 0) // r0 r1 r2 r3 r8 r9 rA rB 00 00 00 00 00 00 00 00 (32b) + simd16scalari permhi = _simd16_permute2f128_si(_simd16_castps_si(in), zero, 0x0D); // (0, 0, 3, 1) // r4 r5 r6 r7 rC rD rE rF 00 00 00 00 00 00 00 00 - simd16scalari result = _simd16_packus_epi32( - permlo, permhi); // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 - // 00 00 00 00 00 00 00 00 00 (16b) + simd16scalari result = _simd16_packus_epi32(permlo, permhi); // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 (16b) return _simd16_castsi_ps(result); + + // clang-format on } -#endif }; ////////////////////////////////////////////////////////////////////////// @@ -500,7 +473,6 @@ struct PackTraits<16, true> #error Unsupported vector width #endif } -#if ENABLE_AVX512_SIMD16 static simd16scalar loadSOA_16(const uint8_t* pSrc) { @@ -527,24 +499,19 @@ struct PackTraits<16, true> static simd16scalar pack(simd16scalar& in) { + // clang-format off + const simd16scalari zero = _simd16_setzero_si(); - simd16scalari permlo = _simd16_permute2f128_si( - _simd16_castps_si(in), - zero, - 0x08); // (0, 0, 2, 0) // r0 r1 r2 r3 r8 r9 rA rB 00 00 00 00 00 00 00 00 (32b) - simd16scalari permhi = _simd16_permute2f128_si( - _simd16_castps_si(in), - zero, - 0x0D); // (0, 0, 3, 1) // r4 r5 r6 r7 rC rD rE rF 00 00 00 00 00 00 00 00 + simd16scalari permlo = _simd16_permute2f128_si(_simd16_castps_si(in), zero, 0x08); // (0, 0, 2, 0) // r0 r1 r2 r3 r8 r9 rA rB 00 00 00 00 00 00 00 00 (32b) + simd16scalari permhi = _simd16_permute2f128_si(_simd16_castps_si(in), zero, 0x0D); // (0, 0, 3, 1) // r4 r5 r6 r7 rC rD rE rF 00 00 00 00 00 00 00 00 - simd16scalari result = _simd16_packs_epi32( - permlo, permhi); // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 - // 00 00 00 00 00 00 00 00 00 (16b) + simd16scalari result = _simd16_packs_epi32(permlo, permhi); // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 (16b) return _simd16_castsi_ps(result); + + // clang-format on } -#endif }; ////////////////////////////////////////////////////////////////////////// @@ -562,7 +529,6 @@ struct PackTraits<32, false> } static simdscalar unpack(simdscalar& in) { return in; } static simdscalar pack(simdscalar& in) { return in; } -#if ENABLE_AVX512_SIMD16 static simd16scalar loadSOA_16(const uint8_t* pSrc) { @@ -577,7 +543,6 @@ struct PackTraits<32, false> static simd16scalar unpack(simd16scalar& in) { return in; } static simd16scalar pack(simd16scalar& in) { return in; } -#endif }; ////////////////////////////////////////////////////////////////////////// @@ -957,7 +922,6 @@ static inline __m128 ConvertFloatToSRGB2(__m128& Src) return Result; } -#if ENABLE_AVX512_SIMD16 template inline static simd16scalar SIMDCALL fastpow(simd16scalar const& value) { @@ -1058,7 +1022,7 @@ static inline simd16scalar ConvertFloatToSRGB2(const simd16scalar& value) // only native AVX512 can directly use the computed mask for the blend operation result = _mm512_mask_blend_ps(mask, result2, result); #else - result = _simd16_blendv_ps( + result = _simd16_blendv_ps( result2, result, _simd16_cmplt_ps(value, _simd16_set1_ps(0.0031308f))); #endif } @@ -1066,7 +1030,6 @@ static inline simd16scalar ConvertFloatToSRGB2(const simd16scalar& value) return result; } -#endif ////////////////////////////////////////////////////////////////////////// /// TypeTraits - Format type traits specialization for FLOAT16 ////////////////////////////////////////////////////////////////////////// @@ -1202,7 +1165,6 @@ struct TypeTraits : PackTraits<16> SWR_NOT_IMPL; // @todo return _simd_setzero_ps(); } -#if ENABLE_AVX512_SIMD16 static simd16scalar pack(const simd16scalar& in) { @@ -1235,7 +1197,6 @@ struct TypeTraits : PackTraits<16> SWR_NOT_IMPL; // @todo return _simd16_setzero_ps(); } -#endif }; ////////////////////////////////////////////////////////////////////////// @@ -1263,10 +1224,8 @@ struct TypeTraits : PackTraits<32> #endif return in; } -#if ENABLE_AVX512_SIMD16 static inline simd16scalar convertSrgb(simd16scalar& in) { return ConvertFloatToSRGB2(in); } -#endif }; ////////////////////////////////////////////////////////////////////////// @@ -1467,21 +1426,25 @@ struct ComponentTraits return TypeTraits::fromFloat(); } - INLINE static simdscalar loadSOA(uint32_t comp, const uint8_t* pSrc) + INLINE static void loadSOA(uint32_t comp, const uint8_t* pSrc, simdscalar& dst) { switch (comp) { case 0: - return TypeTraits::loadSOA(pSrc); + dst = TypeTraits::loadSOA(pSrc); + return; case 1: - return TypeTraits::loadSOA(pSrc); + dst = TypeTraits::loadSOA(pSrc); + return; case 2: - return TypeTraits::loadSOA(pSrc); + dst = TypeTraits::loadSOA(pSrc); + return; case 3: - return TypeTraits::loadSOA(pSrc); + dst = TypeTraits::loadSOA(pSrc); + return; } SWR_INVALID("Invalid component: %d", comp); - return TypeTraits::loadSOA(pSrc); + dst = TypeTraits::loadSOA(pSrc); } INLINE static void storeSOA(uint32_t comp, uint8_t* pDst, simdscalar const& src) @@ -1570,23 +1533,26 @@ struct ComponentTraits SWR_INVALID("Invalid component: %d", comp); return TypeTraits::convertSrgb(in); } -#if ENABLE_AVX512_SIMD16 - INLINE static simd16scalar loadSOA_16(uint32_t comp, const uint8_t* pSrc) + INLINE static void SIMDCALL loadSOA(uint32_t comp, const uint8_t* pSrc, simd16scalar& dst) { switch (comp) { case 0: - return TypeTraits::loadSOA_16(pSrc); + dst = TypeTraits::loadSOA_16(pSrc); + return; case 1: - return TypeTraits::loadSOA_16(pSrc); + dst = TypeTraits::loadSOA_16(pSrc); + return; case 2: - return TypeTraits::loadSOA_16(pSrc); + dst = TypeTraits::loadSOA_16(pSrc); + return; case 3: - return TypeTraits::loadSOA_16(pSrc); + dst = TypeTraits::loadSOA_16(pSrc); + return; } SWR_INVALID("Invalid component: %d", comp); - return TypeTraits::loadSOA_16(pSrc); + dst = TypeTraits::loadSOA_16(pSrc); } INLINE static void SIMDCALL storeSOA(uint32_t comp, uint8_t* pDst, simd16scalar const& src) @@ -1660,5 +1626,4 @@ struct ComponentTraits SWR_INVALID("Invalid component: %d", comp); return TypeTraits::convertSrgb(in); } -#endif }; diff --git a/src/gallium/drivers/swr/rasterizer/core/format_utils.h b/src/gallium/drivers/swr/rasterizer/core/format_utils.h index b51755d..7c0b62f 100644 --- a/src/gallium/drivers/swr/rasterizer/core/format_utils.h +++ b/src/gallium/drivers/swr/rasterizer/core/format_utils.h @@ -136,7 +136,6 @@ void vTranspose4x8(simd4scalar (&vDst)[8], vDst[7] = _simd_extractf128_ps(r02r1xhihi, 1); } -#if ENABLE_AVX512_SIMD16 INLINE void vTranspose4x16(simd16scalar (&dst)[4], const simd16scalar& src0, @@ -145,22 +144,9 @@ void vTranspose4x16(simd16scalar (&dst)[4], const simd16scalar& src3) { const simd16scalari perm = - _simd16_set_epi32(15, - 11, - 7, - 3, - 14, - 10, - 6, - 2, - 13, - 9, - 5, - 1, - 12, - 8, - 4, - 0); // pre-permute input to setup the right order after all the unpacking + _simd16_set_epi32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0); + + // pre-permute input to setup the right order after all the unpacking simd16scalar pre0 = _simd16_permute_ps(src0, perm); // r simd16scalar pre1 = _simd16_permute_ps(src1, perm); // g @@ -178,7 +164,6 @@ void vTranspose4x16(simd16scalar (&dst)[4], dst[3] = _simd16_unpackhi_ps(rbhi, gahi); } -#endif INLINE void vTranspose8x8(simdscalar (&vDst)[8], const simdscalar& vMask0, @@ -253,13 +238,11 @@ struct TransposeSingleComponent { memcpy(pDst, pSrc, (bpp * KNOB_SIMD_WIDTH) / 8); } -#if ENABLE_AVX512_SIMD16 - INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) + INLINE static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) { memcpy(pDst, pSrc, (bpp * KNOB_SIMD16_WIDTH) / 8); } -#endif }; ////////////////////////////////////////////////////////////////////////// @@ -315,34 +298,35 @@ struct Transpose8_8_8_8 #error Unsupported vector width #endif } -#if ENABLE_AVX512_SIMD16 - INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) + INLINE static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) { - simd4scalari src0 = - SIMD128::load_si(reinterpret_cast(pSrc)); // rrrrrrrrrrrrrrrr - simd4scalari src1 = - SIMD128::load_si(reinterpret_cast(pSrc) + 1); // gggggggggggggggg - simd4scalari src2 = - SIMD128::load_si(reinterpret_cast(pSrc) + 2); // bbbbbbbbbbbbbbbb - simd4scalari src3 = - SIMD128::load_si(reinterpret_cast(pSrc) + 3); // aaaaaaaaaaaaaaaa +#if KNOB_SIMD16_WIDTH == 16 + // clang-format off + + simd4scalari src0 = SIMD128::load_si(reinterpret_cast(pSrc)); // rrrrrrrrrrrrrrrr + simd4scalari src1 = SIMD128::load_si(reinterpret_cast(pSrc) + 1); // gggggggggggggggg + simd4scalari src2 = SIMD128::load_si(reinterpret_cast(pSrc) + 2); // bbbbbbbbbbbbbbbb + simd4scalari src3 = SIMD128::load_si(reinterpret_cast(pSrc) + 3); // aaaaaaaaaaaaaaaa simd16scalari cvt0 = _simd16_cvtepu8_epi32(src0); simd16scalari cvt1 = _simd16_cvtepu8_epi32(src1); simd16scalari cvt2 = _simd16_cvtepu8_epi32(src2); simd16scalari cvt3 = _simd16_cvtepu8_epi32(src3); - simd16scalari shl1 = _simd16_slli_epi32(cvt1, 8); + simd16scalari shl1 = _simd16_slli_epi32(cvt1, 8); simd16scalari shl2 = _simd16_slli_epi32(cvt2, 16); simd16scalari shl3 = _simd16_slli_epi32(cvt3, 24); simd16scalari dst = _simd16_or_si(_simd16_or_si(cvt0, shl1), _simd16_or_si(shl2, shl3)); - _simd16_store_si(reinterpret_cast(pDst), - dst); // rgbargbargbargbargbargbargbargbargbargbargbargbargbargbargbargba - } + _simd16_store_si(reinterpret_cast(pDst), dst); // rgbargbargbargbargbargbargbargbargbargbargbargbargbargbargbargba + + // clang-format on +#else +#error Unsupported vector width #endif + } }; ////////////////////////////////////////////////////////////////////////// @@ -355,10 +339,7 @@ struct Transpose8_8_8 /// @param pSrc - source data in SOA form /// @param pDst - output data in AOS form INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; -#if ENABLE_AVX512_SIMD16 - - INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete; -#endif + INLINE static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete; }; ////////////////////////////////////////////////////////////////////////// @@ -383,14 +364,14 @@ struct Transpose8_8 #error Unsupported vector width #endif } -#if ENABLE_AVX512_SIMD16 - INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) + INLINE static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) { - simd4scalari src0 = - SIMD128::load_si(reinterpret_cast(pSrc)); // rrrrrrrrrrrrrrrr - simd4scalari src1 = - SIMD128::load_si(reinterpret_cast(pSrc) + 1); // gggggggggggggggg +#if KNOB_SIMD16_WIDTH == 16 + // clang-format off + + simd4scalari src0 = SIMD128::load_si(reinterpret_cast(pSrc)); // rrrrrrrrrrrrrrrr + simd4scalari src1 = SIMD128::load_si(reinterpret_cast(pSrc) + 1); // gggggggggggggggg simdscalari cvt0 = _simd_cvtepu8_epi16(src0); simdscalari cvt1 = _simd_cvtepu8_epi16(src1); @@ -399,10 +380,13 @@ struct Transpose8_8 simdscalari dst = _simd_or_si(cvt0, shl1); - _simd_store_si(reinterpret_cast(pDst), - dst); // rgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrg - } + _simd_store_si(reinterpret_cast(pDst), dst); // rgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrg + + // clang-format on +#else +#error Unsupported vector width #endif + } }; ////////////////////////////////////////////////////////////////////////// @@ -436,10 +420,12 @@ struct Transpose32_32_32_32 #error Unsupported vector width #endif } -#if ENABLE_AVX512_SIMD16 - INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) + INLINE static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) { +#if KNOB_SIMD16_WIDTH == 16 + // clang-format off + simd16scalar src0 = _simd16_load_ps(reinterpret_cast(pSrc)); simd16scalar src1 = _simd16_load_ps(reinterpret_cast(pSrc) + 16); simd16scalar src2 = _simd16_load_ps(reinterpret_cast(pSrc) + 32); @@ -449,12 +435,16 @@ struct Transpose32_32_32_32 vTranspose4x16(dst, src0, src1, src2, src3); - _simd16_store_ps(reinterpret_cast(pDst) + 0, dst[0]); + _simd16_store_ps(reinterpret_cast(pDst) + 0, dst[0]); _simd16_store_ps(reinterpret_cast(pDst) + 16, dst[1]); _simd16_store_ps(reinterpret_cast(pDst) + 32, dst[2]); _simd16_store_ps(reinterpret_cast(pDst) + 48, dst[3]); - } + + // clang-format on +#else +#error Unsupported vector width #endif + } }; ////////////////////////////////////////////////////////////////////////// @@ -487,10 +477,12 @@ struct Transpose32_32_32 #error Unsupported vector width #endif } -#if ENABLE_AVX512_SIMD16 - INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) + INLINE static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) { +#if KNOB_SIMD16_WIDTH == 16 + // clang-format off + simd16scalar src0 = _simd16_load_ps(reinterpret_cast(pSrc)); simd16scalar src1 = _simd16_load_ps(reinterpret_cast(pSrc) + 16); simd16scalar src2 = _simd16_load_ps(reinterpret_cast(pSrc) + 32); @@ -500,12 +492,16 @@ struct Transpose32_32_32 vTranspose4x16(dst, src0, src1, src2, src3); - _simd16_store_ps(reinterpret_cast(pDst) + 0, dst[0]); + _simd16_store_ps(reinterpret_cast(pDst) + 0, dst[0]); _simd16_store_ps(reinterpret_cast(pDst) + 16, dst[1]); _simd16_store_ps(reinterpret_cast(pDst) + 32, dst[2]); _simd16_store_ps(reinterpret_cast(pDst) + 48, dst[3]); - } + + // clang-format on +#else +#error Unsupported vector width #endif + } }; ////////////////////////////////////////////////////////////////////////// @@ -540,42 +536,32 @@ struct Transpose32_32 #error Unsupported vector width #endif } -#if ENABLE_AVX512_SIMD16 - INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) + INLINE static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) { - simd16scalar src0 = - _simd16_load_ps(reinterpret_cast(pSrc)); // rrrrrrrrrrrrrrrr - simd16scalar src1 = - _simd16_load_ps(reinterpret_cast(pSrc) + 16); // gggggggggggggggg - - simd16scalar tmp0 = - _simd16_unpacklo_ps(src0, src1); // r0 g0 r1 g1 r4 g4 r5 g5 r8 g8 r9 g9 rC gC rD gD - simd16scalar tmp1 = - _simd16_unpackhi_ps(src0, src1); // r2 g2 r3 g3 r6 g6 r7 g7 rA gA rB gB rE gE rF gF - - simd16scalar per0 = _simd16_permute2f128_ps( - tmp0, - tmp1, - 0x44); // (1, 0, 1, 0) // r0 g0 r1 g1 r4 g4 r5 g5 r2 g2 r3 g3 r6 g6 r7 g7 - simd16scalar per1 = _simd16_permute2f128_ps( - tmp0, - tmp1, - 0xEE); // (3, 2, 3, 2) // r8 g8 r9 g9 rC gC rD gD rA gA rB gB rE gE rF gF - - simd16scalar dst0 = _simd16_permute2f128_ps( - per0, - per0, - 0xD8); // (3, 1, 2, 0) // r0 g0 r1 g1 r2 g2 r3 g3 r4 g4 r5 g5 r6 g6 r7 g7 - simd16scalar dst1 = _simd16_permute2f128_ps( - per1, - per1, - 0xD8); // (3, 1, 2, 0) // r8 g8 r9 g9 rA gA rB gB rC gC rD gD rE gE rF gF - - _simd16_store_ps(reinterpret_cast(pDst) + 0, dst0); // rgrgrgrgrgrgrgrg - _simd16_store_ps(reinterpret_cast(pDst) + 16, dst1); // rgrgrgrgrgrgrgrg - } +#if KNOB_SIMD16_WIDTH == 16 + // clang-format off + + simd16scalar src0 = _simd16_load_ps(reinterpret_cast(pSrc)); // rrrrrrrrrrrrrrrr + simd16scalar src1 = _simd16_load_ps(reinterpret_cast(pSrc) + 16); // gggggggggggggggg + + simd16scalar tmp0 = _simd16_unpacklo_ps(src0, src1); // r0 g0 r1 g1 r4 g4 r5 g5 r8 g8 r9 g9 rC gC rD gD + simd16scalar tmp1 = _simd16_unpackhi_ps(src0, src1); // r2 g2 r3 g3 r6 g6 r7 g7 rA gA rB gB rE gE rF gF + + simd16scalar per0 = _simd16_permute2f128_ps(tmp0, tmp1, 0x44); // (1, 0, 1, 0) // r0 g0 r1 g1 r4 g4 r5 g5 r2 g2 r3 g3 r6 g6 r7 g7 + simd16scalar per1 = _simd16_permute2f128_ps(tmp0, tmp1, 0xEE); // (3, 2, 3, 2) // r8 g8 r9 g9 rC gC rD gD rA gA rB gB rE gE rF gF + + simd16scalar dst0 = _simd16_permute2f128_ps(per0, per0, 0xD8); // (3, 1, 2, 0) // r0 g0 r1 g1 r2 g2 r3 g3 r4 g4 r5 g5 r6 g6 r7 g7 + simd16scalar dst1 = _simd16_permute2f128_ps(per1, per1, 0xD8); // (3, 1, 2, 0) // r8 g8 r9 g9 rA gA rB gB rC gC rD gD rE gE rF gF + + _simd16_store_ps(reinterpret_cast(pDst) + 0, dst0); // rgrgrgrgrgrgrgrg + _simd16_store_ps(reinterpret_cast(pDst) + 16, dst1); // rgrgrgrgrgrgrgrg + + // clang-format on +#else +#error Unsupported vector width #endif + } }; ////////////////////////////////////////////////////////////////////////// @@ -616,44 +602,42 @@ struct Transpose16_16_16_16 #error Unsupported vector width #endif } -#if ENABLE_AVX512_SIMD16 - INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) + INLINE static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) { - simdscalari src0 = - _simd_load_si(reinterpret_cast(pSrc)); // rrrrrrrrrrrrrrrr - simdscalari src1 = - _simd_load_si(reinterpret_cast(pSrc) + 1); // gggggggggggggggg - simdscalari src2 = - _simd_load_si(reinterpret_cast(pSrc) + 2); // bbbbbbbbbbbbbbbb - simdscalari src3 = - _simd_load_si(reinterpret_cast(pSrc) + 3); // aaaaaaaaaaaaaaaa - - simdscalari pre0 = _simd_unpacklo_epi16(src0, src1); // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB - simdscalari pre1 = _simd_unpackhi_epi16(src0, src1); // rg4 rg5 rg6 rg7 rgC rgD rgE rgF - simdscalari pre2 = _simd_unpacklo_epi16(src2, src3); // ba0 ba1 ba3 ba3 ba8 ba9 baA baB - simdscalari pre3 = _simd_unpackhi_epi16(src2, src3); // ba4 ba5 ba6 ba7 baC baD baE baF - - simdscalari tmp0 = _simd_unpacklo_epi32(pre0, pre2); // rbga0 rbga1 rbga8 rbga9 - simdscalari tmp1 = _simd_unpackhi_epi32(pre0, pre2); // rbga2 rbga3 rbgaA rbgaB - simdscalari tmp2 = _simd_unpacklo_epi32(pre1, pre3); // rbga4 rbga5 rgbaC rbgaD - simdscalari tmp3 = _simd_unpackhi_epi32(pre1, pre3); // rbga6 rbga7 rbgaE rbgaF - - simdscalari dst0 = _simd_permute2f128_si( - tmp0, tmp1, 0x20); // (2, 0) // rbga0 rbga1 rbga2 rbga3 - simdscalari dst1 = _simd_permute2f128_si( - tmp2, tmp3, 0x20); // (2, 0) // rbga4 rbga5 rbga6 rbga7 - simdscalari dst2 = _simd_permute2f128_si( - tmp0, tmp1, 0x31); // (3, 1) // rbga8 rbga9 rbgaA rbgaB - simdscalari dst3 = _simd_permute2f128_si( - tmp2, tmp3, 0x31); // (3, 1) // rbgaC rbgaD rbgaE rbgaF - - _simd_store_si(reinterpret_cast(pDst) + 0, dst0); // rgbargbargbargba - _simd_store_si(reinterpret_cast(pDst) + 1, dst1); // rgbargbargbargba - _simd_store_si(reinterpret_cast(pDst) + 2, dst2); // rgbargbargbargba - _simd_store_si(reinterpret_cast(pDst) + 3, dst3); // rgbargbargbargba - } +#if KNOB_SIMD16_WIDTH == 16 + // clang-format off + + simdscalari src0 = _simd_load_si(reinterpret_cast(pSrc)); // rrrrrrrrrrrrrrrr + simdscalari src1 = _simd_load_si(reinterpret_cast(pSrc) + 1); // gggggggggggggggg + simdscalari src2 = _simd_load_si(reinterpret_cast(pSrc) + 2); // bbbbbbbbbbbbbbbb + simdscalari src3 = _simd_load_si(reinterpret_cast(pSrc) + 3); // aaaaaaaaaaaaaaaa + + simdscalari pre0 = _simd_unpacklo_epi16(src0, src1); // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB + simdscalari pre1 = _simd_unpackhi_epi16(src0, src1); // rg4 rg5 rg6 rg7 rgC rgD rgE rgF + simdscalari pre2 = _simd_unpacklo_epi16(src2, src3); // ba0 ba1 ba3 ba3 ba8 ba9 baA baB + simdscalari pre3 = _simd_unpackhi_epi16(src2, src3); // ba4 ba5 ba6 ba7 baC baD baE baF + + simdscalari tmp0 = _simd_unpacklo_epi32(pre0, pre2); // rbga0 rbga1 rbga8 rbga9 + simdscalari tmp1 = _simd_unpackhi_epi32(pre0, pre2); // rbga2 rbga3 rbgaA rbgaB + simdscalari tmp2 = _simd_unpacklo_epi32(pre1, pre3); // rbga4 rbga5 rgbaC rbgaD + simdscalari tmp3 = _simd_unpackhi_epi32(pre1, pre3); // rbga6 rbga7 rbgaE rbgaF + + simdscalari dst0 = _simd_permute2f128_si(tmp0, tmp1, 0x20); // (2, 0) // rbga0 rbga1 rbga2 rbga3 + simdscalari dst1 = _simd_permute2f128_si(tmp2, tmp3, 0x20); // (2, 0) // rbga4 rbga5 rbga6 rbga7 + simdscalari dst2 = _simd_permute2f128_si(tmp0, tmp1, 0x31); // (3, 1) // rbga8 rbga9 rbgaA rbgaB + simdscalari dst3 = _simd_permute2f128_si(tmp2, tmp3, 0x31); // (3, 1) // rbgaC rbgaD rbgaE rbgaF + + _simd_store_si(reinterpret_cast(pDst) + 0, dst0); // rgbargbargbargba + _simd_store_si(reinterpret_cast(pDst) + 1, dst1); // rgbargbargbargba + _simd_store_si(reinterpret_cast(pDst) + 2, dst2); // rgbargbargbargba + _simd_store_si(reinterpret_cast(pDst) + 3, dst3); // rgbargbargbargba + + // clang-format on +#else +#error Unsupported vector width #endif + } }; ////////////////////////////////////////////////////////////////////////// @@ -693,43 +677,42 @@ struct Transpose16_16_16 #error Unsupported vector width #endif } -#if ENABLE_AVX512_SIMD16 - INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) + INLINE static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) { - simdscalari src0 = - _simd_load_si(reinterpret_cast(pSrc)); // rrrrrrrrrrrrrrrr - simdscalari src1 = - _simd_load_si(reinterpret_cast(pSrc) + 1); // gggggggggggggggg - simdscalari src2 = - _simd_load_si(reinterpret_cast(pSrc) + 2); // bbbbbbbbbbbbbbbb - simdscalari src3 = _simd_setzero_si(); // aaaaaaaaaaaaaaaa - - simdscalari pre0 = _simd_unpacklo_epi16(src0, src1); // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB - simdscalari pre1 = _simd_unpackhi_epi16(src0, src1); // rg4 rg5 rg6 rg7 rgC rgD rgE rgF - simdscalari pre2 = _simd_unpacklo_epi16(src2, src3); // ba0 ba1 ba3 ba3 ba8 ba9 baA baB - simdscalari pre3 = _simd_unpackhi_epi16(src2, src3); // ba4 ba5 ba6 ba7 baC baD baE baF - - simdscalari tmp0 = _simd_unpacklo_epi32(pre0, pre2); // rbga0 rbga1 rbga8 rbga9 - simdscalari tmp1 = _simd_unpackhi_epi32(pre0, pre2); // rbga2 rbga3 rbgaA rbgaB - simdscalari tmp2 = _simd_unpacklo_epi32(pre1, pre3); // rbga4 rbga5 rgbaC rbgaD - simdscalari tmp3 = _simd_unpackhi_epi32(pre1, pre3); // rbga6 rbga7 rbgaE rbgaF - - simdscalari dst0 = _simd_permute2f128_si( - tmp0, tmp1, 0x20); // (2, 0) // rbga0 rbga1 rbga2 rbga3 - simdscalari dst1 = _simd_permute2f128_si( - tmp2, tmp3, 0x20); // (2, 0) // rbga4 rbga5 rbga6 rbga7 - simdscalari dst2 = _simd_permute2f128_si( - tmp0, tmp1, 0x31); // (3, 1) // rbga8 rbga9 rbgaA rbgaB - simdscalari dst3 = _simd_permute2f128_si( - tmp2, tmp3, 0x31); // (3, 1) // rbgaC rbgaD rbgaE rbgaF - - _simd_store_si(reinterpret_cast(pDst) + 0, dst0); // rgbargbargbargba - _simd_store_si(reinterpret_cast(pDst) + 1, dst1); // rgbargbargbargba - _simd_store_si(reinterpret_cast(pDst) + 2, dst2); // rgbargbargbargba - _simd_store_si(reinterpret_cast(pDst) + 3, dst3); // rgbargbargbargba - } +#if KNOB_SIMD16_WIDTH == 16 + // clang-format off + + simdscalari src0 = _simd_load_si(reinterpret_cast(pSrc)); // rrrrrrrrrrrrrrrr + simdscalari src1 = _simd_load_si(reinterpret_cast(pSrc) + 1); // gggggggggggggggg + simdscalari src2 = _simd_load_si(reinterpret_cast(pSrc) + 2); // bbbbbbbbbbbbbbbb + simdscalari src3 = _simd_setzero_si(); // aaaaaaaaaaaaaaaa + + simdscalari pre0 = _simd_unpacklo_epi16(src0, src1); // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB + simdscalari pre1 = _simd_unpackhi_epi16(src0, src1); // rg4 rg5 rg6 rg7 rgC rgD rgE rgF + simdscalari pre2 = _simd_unpacklo_epi16(src2, src3); // ba0 ba1 ba3 ba3 ba8 ba9 baA baB + simdscalari pre3 = _simd_unpackhi_epi16(src2, src3); // ba4 ba5 ba6 ba7 baC baD baE baF + + simdscalari tmp0 = _simd_unpacklo_epi32(pre0, pre2); // rbga0 rbga1 rbga8 rbga9 + simdscalari tmp1 = _simd_unpackhi_epi32(pre0, pre2); // rbga2 rbga3 rbgaA rbgaB + simdscalari tmp2 = _simd_unpacklo_epi32(pre1, pre3); // rbga4 rbga5 rgbaC rbgaD + simdscalari tmp3 = _simd_unpackhi_epi32(pre1, pre3); // rbga6 rbga7 rbgaE rbgaF + + simdscalari dst0 = _simd_permute2f128_si(tmp0, tmp1, 0x20); // (2, 0) // rbga0 rbga1 rbga2 rbga3 + simdscalari dst1 = _simd_permute2f128_si(tmp2, tmp3, 0x20); // (2, 0) // rbga4 rbga5 rbga6 rbga7 + simdscalari dst2 = _simd_permute2f128_si(tmp0, tmp1, 0x31); // (3, 1) // rbga8 rbga9 rbgaA rbgaB + simdscalari dst3 = _simd_permute2f128_si(tmp2, tmp3, 0x31); // (3, 1) // rbgaC rbgaD rbgaE rbgaF + + _simd_store_si(reinterpret_cast(pDst) + 0, dst0); // rgbargbargbargba + _simd_store_si(reinterpret_cast(pDst) + 1, dst1); // rgbargbargbargba + _simd_store_si(reinterpret_cast(pDst) + 2, dst2); // rgbargbargbargba + _simd_store_si(reinterpret_cast(pDst) + 3, dst3); // rgbargbargbargba + + // clang-format on +#else +#error Unsupported vector width #endif + } }; ////////////////////////////////////////////////////////////////////////// @@ -761,27 +744,29 @@ struct Transpose16_16 #error Unsupported vector width #endif } -#if ENABLE_AVX512_SIMD16 - INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) + INLINE static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) { - simdscalari src0 = - _simd_load_si(reinterpret_cast(pSrc)); // rrrrrrrrrrrrrrrr - simdscalari src1 = - _simd_load_si(reinterpret_cast(pSrc) + 1); // gggggggggggggggg +#if KNOB_SIMD16_WIDTH == 16 + // clang-format off - simdscalari tmp0 = _simd_unpacklo_epi16(src0, src1); // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB - simdscalari tmp1 = _simd_unpackhi_epi16(src0, src1); // rg4 rg5 rg6 rg7 rgC rgD rgE rgF + simdscalari src0 = _simd_load_si(reinterpret_cast(pSrc)); // rrrrrrrrrrrrrrrr + simdscalari src1 = _simd_load_si(reinterpret_cast(pSrc) + 1); // gggggggggggggggg - simdscalari dst0 = _simd_permute2f128_si( - tmp0, tmp1, 0x20); // (2, 0) // rg0 rg1 rg2 rg3 rg4 rg5 rg6 rg7 - simdscalari dst1 = _simd_permute2f128_si( - tmp0, tmp1, 0x31); // (3, 1) // rg8 rg9 rgA rgB rgC rgD rgE rgF + simdscalari tmp0 = _simd_unpacklo_epi16(src0, src1); // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB + simdscalari tmp1 = _simd_unpackhi_epi16(src0, src1); // rg4 rg5 rg6 rg7 rgC rgD rgE rgF - _simd_store_si(reinterpret_cast(pDst) + 0, dst0); // rgrgrgrgrgrgrgrg - _simd_store_si(reinterpret_cast(pDst) + 1, dst1); // rgrgrgrgrgrgrgrg - } + simdscalari dst0 = _simd_permute2f128_si(tmp0, tmp1, 0x20); // (2, 0) // rg0 rg1 rg2 rg3 rg4 rg5 rg6 rg7 + simdscalari dst1 = _simd_permute2f128_si(tmp0, tmp1, 0x31); // (3, 1) // rg8 rg9 rgA rgB rgC rgD rgE rgF + + _simd_store_si(reinterpret_cast(pDst) + 0, dst0); // rgrgrgrgrgrgrgrg + _simd_store_si(reinterpret_cast(pDst) + 1, dst1); // rgrgrgrgrgrgrgrg + + // clang-format on +#else +#error Unsupported vector width #endif + } }; ////////////////////////////////////////////////////////////////////////// @@ -794,10 +779,7 @@ struct Transpose24_8 /// @param pSrc - source data in SOA form /// @param pDst - output data in AOS form static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; -#if ENABLE_AVX512_SIMD16 - - static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete; -#endif + static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete; }; ////////////////////////////////////////////////////////////////////////// @@ -810,10 +792,7 @@ struct Transpose32_8_24 /// @param pSrc - source data in SOA form /// @param pDst - output data in AOS form static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; -#if ENABLE_AVX512_SIMD16 - - static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete; -#endif + static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete; }; ////////////////////////////////////////////////////////////////////////// @@ -826,10 +805,7 @@ struct Transpose4_4_4_4 /// @param pSrc - source data in SOA form /// @param pDst - output data in AOS form static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; -#if ENABLE_AVX512_SIMD16 - - static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete; -#endif + static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete; }; ////////////////////////////////////////////////////////////////////////// @@ -842,10 +818,7 @@ struct Transpose5_6_5 /// @param pSrc - source data in SOA form /// @param pDst - output data in AOS form static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; -#if ENABLE_AVX512_SIMD16 - - static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete; -#endif + static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete; }; ////////////////////////////////////////////////////////////////////////// @@ -858,10 +831,7 @@ struct Transpose9_9_9_5 /// @param pSrc - source data in SOA form /// @param pDst - output data in AOS form static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; -#if ENABLE_AVX512_SIMD16 - - static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete; -#endif + static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete; }; ////////////////////////////////////////////////////////////////////////// @@ -874,10 +844,7 @@ struct Transpose5_5_5_1 /// @param pSrc - source data in SOA form /// @param pDst - output data in AOS form static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; -#if ENABLE_AVX512_SIMD16 - - static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete; -#endif + static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete; }; ////////////////////////////////////////////////////////////////////////// @@ -890,6 +857,7 @@ struct Transpose1_5_5_5 /// @param pSrc - source data in SOA form /// @param pDst - output data in AOS form static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; + static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete; }; ////////////////////////////////////////////////////////////////////////// @@ -902,10 +870,7 @@ struct Transpose10_10_10_2 /// @param pSrc - source data in SOA form /// @param pDst - output data in AOS form static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; -#if ENABLE_AVX512_SIMD16 - - static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete; -#endif + static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete; }; ////////////////////////////////////////////////////////////////////////// @@ -918,10 +883,7 @@ struct Transpose11_11_10 /// @param pSrc - source data in SOA form /// @param pDst - output data in AOS form static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; -#if ENABLE_AVX512_SIMD16 - - static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete; -#endif + static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete; }; ////////////////////////////////////////////////////////////////////////// @@ -934,10 +896,7 @@ struct Transpose64 /// @param pSrc - source data in SOA form /// @param pDst - output data in AOS form static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; -#if ENABLE_AVX512_SIMD16 - - static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete; -#endif + static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete; }; ////////////////////////////////////////////////////////////////////////// @@ -950,10 +909,7 @@ struct Transpose64_64 /// @param pSrc - source data in SOA form /// @param pDst - output data in AOS form static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; -#if ENABLE_AVX512_SIMD16 - - static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete; -#endif + static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete; }; ////////////////////////////////////////////////////////////////////////// @@ -966,10 +922,7 @@ struct Transpose64_64_64 /// @param pSrc - source data in SOA form /// @param pDst - output data in AOS form static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; -#if ENABLE_AVX512_SIMD16 - - static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete; -#endif + static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete; }; ////////////////////////////////////////////////////////////////////////// @@ -982,8 +935,5 @@ struct Transpose64_64_64_64 /// @param pSrc - source data in SOA form /// @param pDst - output data in AOS form static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; -#if ENABLE_AVX512_SIMD16 - - static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete; -#endif + static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete; }; diff --git a/src/gallium/drivers/swr/rasterizer/core/knobs.h b/src/gallium/drivers/swr/rasterizer/core/knobs.h index 8cccbf4..92fbf88 100644 --- a/src/gallium/drivers/swr/rasterizer/core/knobs.h +++ b/src/gallium/drivers/swr/rasterizer/core/knobs.h @@ -53,35 +53,22 @@ #if (KNOB_ARCH == KNOB_ARCH_AVX) #define KNOB_ARCH_ISA AVX #define KNOB_ARCH_STR "AVX" -#define KNOB_SIMD_WIDTH 8 -#define KNOB_SIMD_BYTES 32 #elif (KNOB_ARCH == KNOB_ARCH_AVX2) #define KNOB_ARCH_ISA AVX2 #define KNOB_ARCH_STR "AVX2" -#define KNOB_SIMD_WIDTH 8 -#define KNOB_SIMD_BYTES 32 #elif (KNOB_ARCH == KNOB_ARCH_AVX512) #define KNOB_ARCH_ISA AVX512F #define KNOB_ARCH_STR "AVX512" -#define KNOB_SIMD_WIDTH 8 -#define KNOB_SIMD_BYTES 32 #else #error "Unknown architecture" #endif -#if ENABLE_AVX512_SIMD16 +#define KNOB_SIMD_WIDTH 8 +#define KNOB_SIMD_BYTES 32 #define KNOB_SIMD16_WIDTH 16 #define KNOB_SIMD16_BYTES 64 -#if (KNOB_ARCH == KNOB_ARCH_AVX512) -#define ENABLE_AVX512_EMULATION 0 -#else -#define ENABLE_AVX512_EMULATION 1 -#endif - -#endif - #define MAX_KNOB_ARCH_STR_LEN sizeof("AVX512_PLUS_PADDING") /////////////////////////////////////////////////////////////////////////////// diff --git a/src/gallium/drivers/swr/rasterizer/core/state.h b/src/gallium/drivers/swr/rasterizer/core/state.h index 04fad69..3f81232 100644 --- a/src/gallium/drivers/swr/rasterizer/core/state.h +++ b/src/gallium/drivers/swr/rasterizer/core/state.h @@ -201,14 +201,11 @@ struct simdvertex simdvector attrib[SWR_VTX_NUM_SLOTS]; }; -#if ENABLE_AVX512_SIMD16 struct simd16vertex { simd16vector attrib[SWR_VTX_NUM_SLOTS]; }; -#endif - template struct SIMDVERTEX_T { @@ -429,11 +426,12 @@ struct SWR_CS_CONTEXT // enums enum SWR_TILE_MODE { - SWR_TILE_NONE = 0x0, // Linear mode (no tiling) - SWR_TILE_MODE_WMAJOR, // W major tiling - SWR_TILE_MODE_XMAJOR, // X major tiling - SWR_TILE_MODE_YMAJOR, // Y major tiling - SWR_TILE_SWRZ, // SWR-Z tiling + SWR_TILE_NONE = 0x0, // Linear mode (no tiling) + SWR_TILE_MODE_WMAJOR, // W major tiling + SWR_TILE_MODE_XMAJOR, // X major tiling + SWR_TILE_MODE_YMAJOR, // Y major tiling + SWR_TILE_SWRZ, // SWR-Z tiling + SWR_TILE_MODE_COUNT }; diff --git a/src/gallium/drivers/swr/rasterizer/core/utils.h b/src/gallium/drivers/swr/rasterizer/core/utils.h index e008cc8..9b48377 100644 --- a/src/gallium/drivers/swr/rasterizer/core/utils.h +++ b/src/gallium/drivers/swr/rasterizer/core/utils.h @@ -44,7 +44,6 @@ struct simdBBox simdscalari xmax; }; -#if ENABLE_AVX512_SIMD16 struct simd16BBox { simd16scalari ymin; @@ -52,7 +51,6 @@ struct simd16BBox simd16scalari xmin; simd16scalari xmax; }; -#endif template struct SIMDBBOX_T diff --git a/src/gallium/drivers/swr/rasterizer/memory/StoreTile.h b/src/gallium/drivers/swr/rasterizer/memory/StoreTile.h index 407cefa..02c6df0 100644 --- a/src/gallium/drivers/swr/rasterizer/memory/StoreTile.h +++ b/src/gallium/drivers/swr/rasterizer/memory/StoreTile.h @@ -355,7 +355,7 @@ struct ConvertPixelsSOAtoAOS StoreSOA(src, soaTile); // Convert from SOA --> AOS - FormatTraits::TransposeT::Transpose_16(soaTile, aosTile); + FormatTraits::TransposeT::Transpose_simd16(soaTile, aosTile); // Store data into destination StorePixels::bpp, NumDests>::Store(aosTile, ppDsts); @@ -382,7 +382,7 @@ struct ConvertPixelsSOAtoAOS OSALIGNSIMD16(uint8_t) aosTile[MAX_RASTER_TILE_BYTES]; // Convert from SOA --> AOS - FormatTraits::TransposeT::Transpose_16(pSrc, aosTile); + FormatTraits::TransposeT::Transpose_simd16(pSrc, aosTile); // Store data into destination StorePixels::bpp, NumDests>::Store(aosTile, ppDsts); diff --git a/src/gallium/drivers/swr/rasterizer/memory/TilingFunctions.h b/src/gallium/drivers/swr/rasterizer/memory/TilingFunctions.h index abb0c53..cd29550 100644 --- a/src/gallium/drivers/swr/rasterizer/memory/TilingFunctions.h +++ b/src/gallium/drivers/swr/rasterizer/memory/TilingFunctions.h @@ -153,7 +153,6 @@ struct SimdTile } }; -#if ENABLE_AVX512_SIMD16 ////////////////////////////////////////////////////////////////////////// /// SimdTile 8x2 for AVX-512 ////////////////////////////////////////////////////////////////////////// @@ -253,7 +252,6 @@ struct SimdTile_16 } }; -#endif ////////////////////////////////////////////////////////////////////////// /// @brief Computes lod offset for 1D surface at specified lod. /// @param baseWidth - width of basemip (mip 0). -- 2.7.4