swr: Pass SIMD vector arguments (Float, Double, Integer, simdscalar, simd4scalar) by const reference instead of by value in method/function signatures and wrapper macros.
Reviewed-by: Bruce Cherniak <bruce.cherniak@intel.com>
#define _simd_vmask_ps SIMD::vmask_ps
template<int mask> SIMDINLINE
-SIMD128::Integer _simd_blend4_epi32(SIMD128::Integer a, SIMD128::Integer b)
+SIMD128::Integer _simd_blend4_epi32(SIMD128::Integer const &a, SIMD128::Integer const &b)
{
return SIMD128::castps_si(SIMD128::blend_ps<mask>(SIMD128::castsi_ps(a), SIMD128::castsi_ps(b)));
}
//////////////////////////////////////////////////////////////////////////
/// @brief Compute plane equation vA * vX + vB * vY + vC
-SIMDINLINE simdscalar vplaneps(simdscalar vA, simdscalar vB, simdscalar vC, simdscalar &vX, simdscalar &vY)
+SIMDINLINE simdscalar vplaneps(simdscalar const &vA, simdscalar const &vB, simdscalar const &vC, simdscalar const &vX, simdscalar const &vY)
{
simdscalar vOut = _simd_fmadd_ps(vA, vX, vC);
vOut = _simd_fmadd_ps(vB, vY, vOut);
//////////////////////////////////////////////////////////////////////////
/// @brief Compute plane equation vA * vX + vB * vY + vC
-SIMDINLINE simd4scalar vplaneps(simd4scalar vA, simd4scalar vB, simd4scalar vC, simd4scalar &vX, simd4scalar &vY)
+SIMDINLINE simd4scalar vplaneps(simd4scalar const &vA, simd4scalar const &vB, simd4scalar const &vC, simd4scalar const &vX, simd4scalar const &vY)
{
simd4scalar vOut = _simd128_fmadd_ps(vA, vX, vC);
vOut = _simd128_fmadd_ps(vB, vY, vOut);
/// @param vJ - barycentric J
/// @param pInterpBuffer - pointer to attribute barycentric coeffs
template<UINT Attrib, UINT Comp, UINT numComponents = 4>
-static SIMDINLINE simdscalar InterpolateComponent(simdscalar vI, simdscalar vJ, const float *pInterpBuffer)
+static SIMDINLINE simdscalar InterpolateComponent(simdscalar const &vI, simdscalar const &vJ, const float *pInterpBuffer)
{
const float *pInterpA = &pInterpBuffer[Attrib * 3 * numComponents + 0 + Comp];
const float *pInterpB = &pInterpBuffer[Attrib * 3 * numComponents + numComponents + Comp];
/// @param vJ - barycentric J
/// @param pInterpBuffer - pointer to attribute barycentric coeffs
template<UINT Attrib, UINT Comp, UINT numComponents = 4>
-static SIMDINLINE simd4scalar InterpolateComponent(simd4scalar vI, simd4scalar vJ, const float *pInterpBuffer)
+static SIMDINLINE simd4scalar InterpolateComponent(simd4scalar const &vI, simd4scalar const &vJ, const float *pInterpBuffer)
{
const float *pInterpA = &pInterpBuffer[Attrib * 3 * numComponents + 0 + Comp];
const float *pInterpB = &pInterpBuffer[Attrib * 3 * numComponents + numComponents + Comp];
return vplaneps(vA, vB, vC, vI, vJ);
}
-static SIMDINLINE simd4scalar _simd128_abs_ps(simd4scalar a)
+static SIMDINLINE simd4scalar _simd128_abs_ps(simd4scalar const &a)
{
simd4scalari ai = SIMD128::castps_si(a);
return SIMD128::castsi_ps(SIMD128::and_si(ai, SIMD128::set1_epi32(0x7fffffff)));
}
-static SIMDINLINE simdscalar _simd_abs_ps(simdscalar a)
+static SIMDINLINE simdscalar _simd_abs_ps(simdscalar const &a)
{
simdscalari ai = _simd_castps_si(a);
return _simd_castsi_ps(_simd_and_si(ai, _simd_set1_epi32(0x7fffffff)));
}
static SIMDINLINE
- void vec4_set1_vps(Vec4& r, Float s)
+ void vec4_set1_vps(Vec4& r, Float const &s)
{
r[0] = s;
r[1] = s;
}
static SIMDINLINE
- void vec4_mul_ps(Vec4& r, const Vec4& v, Float s)
+ void vec4_mul_ps(Vec4& r, const Vec4& v, Float const &s)
{
r[0] = SIMD::mul_ps(v[0], s);
r[1] = SIMD::mul_ps(v[1], s);
}
static SIMDINLINE
- void vec4_add_ps(Vec4& r, const Vec4& v0, Float s)
+ void vec4_add_ps(Vec4& r, const Vec4& v0, Float const &s)
{
r[0] = SIMD::add_ps(v0[0], s);
r[1] = SIMD::add_ps(v0[1], s);
}
static SIMDINLINE
- void vec4_min_ps(Vec4& r, const Vec4& v0, Float s)
+ void vec4_min_ps(Vec4& r, const Vec4& v0, Float const &s)
{
r[0] = SIMD::min_ps(v0[0], s);
r[1] = SIMD::min_ps(v0[1], s);
}
static SIMDINLINE
- void vec4_max_ps(Vec4& r, const Vec4& v0, Float s)
+ void vec4_max_ps(Vec4& r, const Vec4& v0, Float const &s)
{
r[0] = SIMD::max_ps(v0[0], s);
r[1] = SIMD::max_ps(v0[1], s);
//============================================================================
#define SIMD_WRAPPER_1(op) \
- static SIMDINLINE Float SIMDCALL op(Float a) \
+ static SIMDINLINE Float SIMDCALL op(Float const &a) \
{\
return _mm256_##op(a);\
}
#define SIMD_WRAPPER_2(op) \
- static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
+ static SIMDINLINE Float SIMDCALL op(Float const &a, Float const &b) \
{\
return _mm256_##op(a, b);\
}
#define SIMD_DWRAPPER_2(op) \
- static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
+ static SIMDINLINE Double SIMDCALL op(Double const &a, Double const &b) \
{\
return _mm256_##op(a, b);\
}
#define SIMD_WRAPPER_2I(op) \
template<int ImmT>\
- static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
+ static SIMDINLINE Float SIMDCALL op(Float const &a, Float const &b) \
{\
return _mm256_##op(a, b, ImmT);\
}
#define SIMD_DWRAPPER_2I(op) \
template<int ImmT>\
- static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
+ static SIMDINLINE Double SIMDCALL op(Double const &a, Double const &b) \
{\
return _mm256_##op(a, b, ImmT);\
}
#define SIMD_WRAPPER_3(op) \
- static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c) \
+ static SIMDINLINE Float SIMDCALL op(Float const &a, Float const &b, Float const &c) \
{\
return _mm256_##op(a, b, c);\
}
#define SIMD_IWRAPPER_1(op) \
- static SIMDINLINE Integer SIMDCALL op(Integer a) \
+ static SIMDINLINE Integer SIMDCALL op(Integer const &a) \
{\
return _mm256_##op(a);\
}
#define SIMD_IWRAPPER_2(op) \
- static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
+ static SIMDINLINE Integer SIMDCALL op(Integer const &a, Integer const &b) \
{\
return _mm256_##op(a, b);\
}
#define SIMD_IFWRAPPER_2(op, intrin) \
- static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
+ static SIMDINLINE Integer SIMDCALL op(Integer const &a, Integer const &b) \
{\
return castps_si( intrin(castsi_ps(a), castsi_ps(b)) );\
}
#define SIMD_IFWRAPPER_2I(op, intrin) \
template<int ImmT> \
- static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
+ static SIMDINLINE Integer SIMDCALL op(Integer const &a, Integer const &b) \
{\
return castps_si( intrin(castsi_ps(a), castsi_ps(b), ImmT) );\
}
#define SIMD_IWRAPPER_2I_(op, intrin) \
template<int ImmT>\
- static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
+ static SIMDINLINE Integer SIMDCALL op(Integer const &a, Integer const &b) \
{\
return _mm256_##intrin(a, b, ImmT);\
}
#define SIMD_IWRAPPER_2I(op) SIMD_IWRAPPER_2I_(op, op)
#define SIMD_IWRAPPER_3(op) \
- static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b, Integer c) \
+ static SIMDINLINE Integer SIMDCALL op(Integer const &a, Integer const &b, Integer const &c) \
{\
return _mm256_##op(a, b, c);\
}
// emulated integer simd
#define SIMD_EMU_IWRAPPER_1(op) \
static SIMDINLINE \
- Integer SIMDCALL op(Integer a)\
+ Integer SIMDCALL op(Integer const &a)\
{\
return Integer\
{\
}
#define SIMD_EMU_IWRAPPER_1L(op, shift) \
static SIMDINLINE \
- Integer SIMDCALL op(Integer a)\
+ Integer SIMDCALL op(Integer const &a)\
{\
return Integer \
{\
};\
}\
static SIMDINLINE \
- Integer SIMDCALL op(SIMD128Impl::Integer a)\
+ Integer SIMDCALL op(SIMD128Impl::Integer const &a)\
{\
return Integer \
{\
#define SIMD_EMU_IWRAPPER_1I(op) \
template <int ImmT> static SIMDINLINE \
- Integer SIMDCALL op(Integer a)\
+ Integer SIMDCALL op(Integer const &a)\
{\
return Integer\
{\
#define SIMD_EMU_IWRAPPER_2(op) \
static SIMDINLINE \
- Integer SIMDCALL op(Integer a, Integer b)\
+ Integer SIMDCALL op(Integer const &a, Integer const &b)\
{\
return Integer\
{\
#define SIMD_EMU_IWRAPPER_2I(op) \
template <int ImmT> static SIMDINLINE \
- Integer SIMDCALL op(Integer a, Integer b)\
+ Integer SIMDCALL op(Integer const &a, Integer const &b)\
{\
return Integer\
{\
SIMD_WRAPPER_2(add_ps); // return a + b
SIMD_WRAPPER_2(div_ps); // return a / b
-static SIMDINLINE Float SIMDCALL fmadd_ps(Float a, Float b, Float c) // return (a * b) + c
+static SIMDINLINE Float SIMDCALL fmadd_ps(Float const &a, Float const &b, Float const &c) // return (a * b) + c
{
return add_ps(mul_ps(a, b), c);
}
-static SIMDINLINE Float SIMDCALL fmsub_ps(Float a, Float b, Float c) // return (a * b) - c
+static SIMDINLINE Float SIMDCALL fmsub_ps(Float const &a, Float const &b, Float const &c) // return (a * b) - c
{
return sub_ps(mul_ps(a, b), c);
}
SIMD_WRAPPER_2(sub_ps); // return a - b
template <RoundMode RMT>
-static SIMDINLINE Float SIMDCALL round_ps(Float a)
+static SIMDINLINE Float SIMDCALL round_ps(Float const &a)
{
return _mm256_round_ps(a, static_cast<int>(RMT));
}
-static SIMDINLINE Float SIMDCALL ceil_ps(Float a) { return round_ps<RoundMode::CEIL_NOEXC>(a); }
-static SIMDINLINE Float SIMDCALL floor_ps(Float a) { return round_ps<RoundMode::FLOOR_NOEXC>(a); }
+static SIMDINLINE Float SIMDCALL ceil_ps(Float const &a) { return round_ps<RoundMode::CEIL_NOEXC>(a); }
+static SIMDINLINE Float SIMDCALL floor_ps(Float const &a) { return round_ps<RoundMode::FLOOR_NOEXC>(a); }
//-----------------------------------------------------------------------
// Integer (various width) arithmetic operations
//-----------------------------------------------------------------------
SIMD_EMU_IWRAPPER_1I(slli_epi32); // return a << ImmT
-static SIMDINLINE Integer SIMDCALL sllv_epi32(Integer vA, Integer vCount) // return a << b (uint32)
+static SIMDINLINE Integer SIMDCALL sllv_epi32(Integer const &vA, Integer const &vCount) // return a << b (uint32)
{
int32_t aHi, aLow, countHi, countLow;
__m128i vAHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 1));
SIMD_EMU_IWRAPPER_1I(srli_si); // return a >> (ImmT*8) (uint)
template<int ImmT> // same as srli_si, but with Float cast to int
-static SIMDINLINE Float SIMDCALL srlisi_ps(Float a)
+static SIMDINLINE Float SIMDCALL srlisi_ps(Float const &a)
{
return castsi_ps(srli_si<ImmT>(castps_si(a)));
}
-static SIMDINLINE Integer SIMDCALL srlv_epi32(Integer vA, Integer vCount) // return a >> b (uint32)
+static SIMDINLINE Integer SIMDCALL srlv_epi32(Integer const &vA, Integer const &vCount) // return a >> b (uint32)
{
int32_t aHi, aLow, countHi, countLow;
__m128i vAHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 1));
//-----------------------------------------------------------------------
// Conversion operations
//-----------------------------------------------------------------------
-static SIMDINLINE Float SIMDCALL castpd_ps(Double a) // return *(Float*)(&a)
+static SIMDINLINE Float SIMDCALL castpd_ps(Double const &a) // return *(Float*)(&a)
{
return _mm256_castpd_ps(a);
}
-static SIMDINLINE Integer SIMDCALL castps_si(Float a) // return *(Integer*)(&a)
+static SIMDINLINE Integer SIMDCALL castps_si(Float const &a) // return *(Integer*)(&a)
{
return _mm256_castps_si256(a);
}
-static SIMDINLINE Double SIMDCALL castsi_pd(Integer a) // return *(Double*)(&a)
+static SIMDINLINE Double SIMDCALL castsi_pd(Integer const &a) // return *(Double*)(&a)
{
return _mm256_castsi256_pd(a);
}
-static SIMDINLINE Double SIMDCALL castps_pd(Float a) // return *(Double*)(&a)
+static SIMDINLINE Double SIMDCALL castps_pd(Float const &a) // return *(Double*)(&a)
{
return _mm256_castps_pd(a);
}
-static SIMDINLINE Integer SIMDCALL castpd_si(Double a) // return *(Integer*)(&a)
+static SIMDINLINE Integer SIMDCALL castpd_si(Double const &a) // return *(Integer*)(&a)
{
return _mm256_castpd_si256(a);
}
-static SIMDINLINE Float SIMDCALL castsi_ps(Integer a) // return *(Float*)(&a)
+static SIMDINLINE Float SIMDCALL castsi_ps(Integer const &a) // return *(Float*)(&a)
{
return _mm256_castsi256_ps(a);
}
-static SIMDINLINE Float SIMDCALL cvtepi32_ps(Integer a) // return (float)a (int32 --> float)
+static SIMDINLINE Float SIMDCALL cvtepi32_ps(Integer const &a) // return (float)a (int32 --> float)
{
return _mm256_cvtepi32_ps(a);
}
SIMD_EMU_IWRAPPER_1L(cvtepu16_epi64, 4); // return (int64)a (uint16 --> int64)
SIMD_EMU_IWRAPPER_1L(cvtepu32_epi64, 8); // return (int64)a (uint32 --> int64)
-static SIMDINLINE Integer SIMDCALL cvtps_epi32(Float a) // return (int32)a (float --> int32)
+static SIMDINLINE Integer SIMDCALL cvtps_epi32(Float const &a) // return (int32)a (float --> int32)
{
return _mm256_cvtps_epi32(a);
}
-static SIMDINLINE Integer SIMDCALL cvttps_epi32(Float a) // return (int32)a (rnd_to_zero(float) --> int32)
+static SIMDINLINE Integer SIMDCALL cvttps_epi32(Float const &a) // return (int32)a (rnd_to_zero(float) --> int32)
{
return _mm256_cvttps_epi32(a);
}
// Comparison operations
//-----------------------------------------------------------------------
template<CompareType CmpTypeT>
-static SIMDINLINE Float SIMDCALL cmp_ps(Float a, Float b) // return a (CmpTypeT) b
+static SIMDINLINE Float SIMDCALL cmp_ps(Float const &a, Float const &b) // return a (CmpTypeT) b
{
return _mm256_cmp_ps(a, b, static_cast<const int>(CmpTypeT));
}
-static SIMDINLINE Float SIMDCALL cmplt_ps(Float a, Float b) { return cmp_ps<CompareType::LT_OQ>(a, b); }
-static SIMDINLINE Float SIMDCALL cmpgt_ps(Float a, Float b) { return cmp_ps<CompareType::GT_OQ>(a, b); }
-static SIMDINLINE Float SIMDCALL cmpneq_ps(Float a, Float b) { return cmp_ps<CompareType::NEQ_OQ>(a, b); }
-static SIMDINLINE Float SIMDCALL cmpeq_ps(Float a, Float b) { return cmp_ps<CompareType::EQ_OQ>(a, b); }
-static SIMDINLINE Float SIMDCALL cmpge_ps(Float a, Float b) { return cmp_ps<CompareType::GE_OQ>(a, b); }
-static SIMDINLINE Float SIMDCALL cmple_ps(Float a, Float b) { return cmp_ps<CompareType::LE_OQ>(a, b); }
+static SIMDINLINE Float SIMDCALL cmplt_ps(Float const &a, Float const &b) { return cmp_ps<CompareType::LT_OQ>(a, b); }
+static SIMDINLINE Float SIMDCALL cmpgt_ps(Float const &a, Float const &b) { return cmp_ps<CompareType::GT_OQ>(a, b); }
+static SIMDINLINE Float SIMDCALL cmpneq_ps(Float const &a, Float const &b) { return cmp_ps<CompareType::NEQ_OQ>(a, b); }
+static SIMDINLINE Float SIMDCALL cmpeq_ps(Float const &a, Float const &b) { return cmp_ps<CompareType::EQ_OQ>(a, b); }
+static SIMDINLINE Float SIMDCALL cmpge_ps(Float const &a, Float const &b) { return cmp_ps<CompareType::GE_OQ>(a, b); }
+static SIMDINLINE Float SIMDCALL cmple_ps(Float const &a, Float const &b) { return cmp_ps<CompareType::LE_OQ>(a, b); }
SIMD_EMU_IWRAPPER_2(cmpeq_epi8); // return a == b (int8)
SIMD_EMU_IWRAPPER_2(cmpeq_epi16); // return a == b (int16)
SIMD_EMU_IWRAPPER_2(cmpgt_epi64); // return a > b (int64)
SIMD_EMU_IWRAPPER_2(cmplt_epi32); // return a < b (int32)
-static SIMDINLINE bool SIMDCALL testz_ps(Float a, Float b) // return all_lanes_zero(a & b) ? 1 : 0 (float)
+static SIMDINLINE bool SIMDCALL testz_ps(Float const &a, Float const &b) // return all_lanes_zero(a & b) ? 1 : 0 (float)
{
return 0 != _mm256_testz_ps(a, b);
}
-static SIMDINLINE bool SIMDCALL testz_si(Integer a, Integer b) // return all_lanes_zero(a & b) ? 1 : 0 (int)
+static SIMDINLINE bool SIMDCALL testz_si(Integer const &a, Integer const &b) // return all_lanes_zero(a & b) ? 1 : 0 (int)
{
return 0 != _mm256_testz_si256(a, b);
}
SIMD_IFWRAPPER_2I(blend_epi32, _mm256_blend_ps); // return ImmT ? b : a (int32)
SIMD_WRAPPER_3(blendv_ps); // return mask ? b : a (float)
-static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a, Integer b, Float mask) // return mask ? b : a (int)
+static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer const &a, Integer const &b, Float const &mask) // return mask ? b : a (int)
{
return castps_si(blendv_ps(castsi_ps(a), castsi_ps(b), mask));
}
-static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a, Integer b, Integer mask) // return mask ? b : a (int)
+static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer const &a, Integer const &b, Integer const &mask) // return mask ? b : a (int)
{
return castps_si(blendv_ps(castsi_ps(a), castsi_ps(b), castsi_ps(mask)));
}
SIMD_EMU_IWRAPPER_2(packus_epi16); // See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
SIMD_EMU_IWRAPPER_2(packus_epi32); // See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
-static SIMDINLINE Integer SIMDCALL permute_epi32(Integer a, Integer swiz) // return a[swiz[i]] for each 32-bit lane i (int32)
+static SIMDINLINE Integer SIMDCALL permute_epi32(Integer const &a, Integer const &swiz) // return a[swiz[i]] for each 32-bit lane i (int32)
{
Integer result;
return result;
}
-static SIMDINLINE Float SIMDCALL permute_ps(Float a, Integer swiz) // return a[swiz[i]] for each 32-bit lane i (float)
+static SIMDINLINE Float SIMDCALL permute_ps(Float const &a, Integer const &swiz) // return a[swiz[i]] for each 32-bit lane i (float)
{
Float result;
SIMD_EMU_IWRAPPER_1I(shuffle_epi32);
template<int ImmT>
-static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer a, Integer b)
+static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer const &a, Integer const &b)
{
return castpd_si(shuffle_pd<ImmT>(castsi_pd(a), castsi_pd(b)));
}
// Load / store operations
//-----------------------------------------------------------------------
template<ScaleFactor ScaleT>
-static SIMDINLINE Float SIMDCALL i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
+static SIMDINLINE Float SIMDCALL i32gather_ps(float const* p, Integer const &idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
{
uint32_t *pOffsets = (uint32_t*)&idx;
Float vResult;
// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
template<ScaleFactor ScaleT>
-static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask)
+static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float const &old, float const* p, Integer const &idx, Float const &mask)
{
uint32_t *pOffsets = (uint32_t*)&idx;
Float vResult = old;
return vResult;
}
-static SIMDINLINE void SIMDCALL maskstore_ps(float *p, Integer mask, Float src)
+static SIMDINLINE void SIMDCALL maskstore_ps(float *p, Integer const &mask, Float const &src)
{
_mm256_maskstore_ps(p, mask, src);
}
-static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer a)
+static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer const &a)
{
return SIMD128T::movemask_epi8(a.v4[0]) |
(SIMD128T::movemask_epi8(a.v4[1]) << 16);
}
-static SIMDINLINE uint32_t SIMDCALL movemask_pd(Double a)
+static SIMDINLINE uint32_t SIMDCALL movemask_pd(Double const &a)
{
return static_cast<uint32_t>(_mm256_movemask_pd(a));
}
-static SIMDINLINE uint32_t SIMDCALL movemask_ps(Float a)
+static SIMDINLINE uint32_t SIMDCALL movemask_ps(Float const &a)
{
return static_cast<uint32_t>(_mm256_movemask_ps(a));
}
return _mm256_setzero_si256();
}
-static SIMDINLINE void SIMDCALL store_ps(float *p, Float a) // *p = a (stores all elements contiguously in memory)
+static SIMDINLINE void SIMDCALL store_ps(float *p, Float const &a) // *p = a (stores all elements contiguously in memory)
{
_mm256_store_ps(p, a);
}
-static SIMDINLINE void SIMDCALL store_si(Integer *p, Integer a) // *p = a
+static SIMDINLINE void SIMDCALL store_si(Integer *p, Integer const &a) // *p = a
{
_mm256_store_si256(&p->v, a);
}
-static SIMDINLINE void SIMDCALL stream_ps(float *p, Float a) // *p = a (same as store_ps, but doesn't keep memory in cache)
+static SIMDINLINE void SIMDCALL stream_ps(float *p, Float const &a) // *p = a (same as store_ps, but doesn't keep memory in cache)
{
_mm256_stream_ps(p, a);
}
}
template<int ImmT>
-static SIMDINLINE SIMD128Impl::Double SIMDCALL extractf128_pd(Double a)
+static SIMDINLINE SIMD128Impl::Double SIMDCALL extractf128_pd(Double const &a)
{
return _mm256_extractf128_pd(a, ImmT);
}
template<int ImmT>
-static SIMDINLINE SIMD128Impl::Float SIMDCALL extractf128_ps(Float a)
+static SIMDINLINE SIMD128Impl::Float SIMDCALL extractf128_ps(Float const &a)
{
return _mm256_extractf128_ps(a, ImmT);
}
template<int ImmT>
-static SIMDINLINE SIMD128Impl::Integer SIMDCALL extractf128_si(Integer a)
+static SIMDINLINE SIMD128Impl::Integer SIMDCALL extractf128_si(Integer const &a)
{
return _mm256_extractf128_si256(a, ImmT);
}
template<int ImmT>
-static SIMDINLINE Double SIMDCALL insertf128_pd(Double a, SIMD128Impl::Double b)
+static SIMDINLINE Double SIMDCALL insertf128_pd(Double const &a, SIMD128Impl::Double const &b)
{
return _mm256_insertf128_pd(a, b, ImmT);
}
template<int ImmT>
-static SIMDINLINE Float SIMDCALL insertf128_ps(Float a, SIMD128Impl::Float b)
+static SIMDINLINE Float SIMDCALL insertf128_ps(Float const &a, SIMD128Impl::Float const &b)
{
return _mm256_insertf128_ps(a, b, ImmT);
}
template<int ImmT>
-static SIMDINLINE Integer SIMDCALL insertf128_si(Integer a, SIMD128Impl::Integer b)
+static SIMDINLINE Integer SIMDCALL insertf128_si(Integer const &a, SIMD128Impl::Integer const &b)
{
return _mm256_insertf128_si256(a, b, ImmT);
}
return _mm256_set_ps(i7, i6, i5, i4, i3, i2, i1, i0);
}
-static SIMDINLINE void SIMDCALL storeu2_si(SIMD128Impl::Integer *phi, SIMD128Impl::Integer *plo, Integer src)
+static SIMDINLINE void SIMDCALL storeu2_si(SIMD128Impl::Integer *phi, SIMD128Impl::Integer *plo, Integer const &src)
{
_mm256_storeu2_m128i(&phi->v, &plo->v, src);
}
//============================================================================
#define SIMD_IWRAPPER_1(op) \
- static SIMDINLINE Integer SIMDCALL op(Integer a) \
+ static SIMDINLINE Integer SIMDCALL op(Integer const &a) \
{\
return _mm256_##op(a);\
}
#define SIMD_IWRAPPER_1L(op) \
- static SIMDINLINE Integer SIMDCALL op(Integer a) \
+ static SIMDINLINE Integer SIMDCALL op(Integer const &a) \
{\
return _mm256_##op(_mm256_castsi256_si128(a));\
}\
#define SIMD_IWRAPPER_1I(op) \
template<int ImmT> \
- static SIMDINLINE Integer SIMDCALL op(Integer a) \
+ static SIMDINLINE Integer SIMDCALL op(Integer const &a) \
{\
return _mm256_##op(a, ImmT);\
}
#define SIMD_IWRAPPER_1I_(op, intrin) \
template<int ImmT> \
- static SIMDINLINE Integer SIMDCALL op(Integer a) \
+ static SIMDINLINE Integer SIMDCALL op(Integer const &a) \
{\
return _mm256_##intrin(a, ImmT);\
}
#define SIMD_IWRAPPER_2_(op, intrin) \
- static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
+ static SIMDINLINE Integer SIMDCALL op(Integer const &a, Integer const &b) \
{\
return _mm256_##intrin(a, b);\
}
#define SIMD_IWRAPPER_2(op) \
- static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
+ static SIMDINLINE Integer SIMDCALL op(Integer const &a, Integer const &b) \
{\
return _mm256_##op(a, b);\
}
#define SIMD_IWRAPPER_2I(op) \
template<int ImmT> \
- static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
+ static SIMDINLINE Integer SIMDCALL op(Integer const &a, Integer const &b) \
{\
return _mm256_##op(a, b, ImmT);\
}
#define SIMD_IWRAPPER_2I(op) \
template<int ImmT>\
- static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
+ static SIMDINLINE Integer SIMDCALL op(Integer const &a, Integer const &b) \
{\
return _mm256_##op(a, b, ImmT);\
}
//-----------------------------------------------------------------------
// Floating point arithmetic operations
//-----------------------------------------------------------------------
-static SIMDINLINE Float SIMDCALL fmadd_ps(Float a, Float b, Float c) // return (a * b) + c
+static SIMDINLINE Float SIMDCALL fmadd_ps(Float const &a, Float const &b, Float const &c) // return (a * b) + c
{
return _mm256_fmadd_ps(a, b, c);
}
SIMD_IWRAPPER_1I_(srli_si, srli_si256); // return a >> (ImmT*8) (uint)
template<int ImmT> // same as srli_si, but with Float cast to int
-static SIMDINLINE Float SIMDCALL srlisi_ps(Float a)
+static SIMDINLINE Float SIMDCALL srlisi_ps(Float const &a)
{
return castsi_ps(srli_si<ImmT>(castps_si(a)));
}
SIMD_IWRAPPER_2(cmpgt_epi32); // return a > b (int32)
SIMD_IWRAPPER_2(cmpgt_epi64); // return a > b (int64)
-static SIMDINLINE Integer SIMDCALL cmplt_epi32(Integer a, Integer b) // return a < b (int32)
+static SIMDINLINE Integer SIMDCALL cmplt_epi32(Integer const &a, Integer const &b) // return a < b (int32)
{
return cmpgt_epi32(b, a);
}
SIMD_IWRAPPER_2(packus_epi32); // See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
SIMD_IWRAPPER_2_(permute_epi32, permutevar8x32_epi32);
-static SIMDINLINE Float SIMDCALL permute_ps(Float a, Integer swiz) // return a[swiz[i]] for each 32-bit lane i (float)
+static SIMDINLINE Float SIMDCALL permute_ps(Float const &a, Integer const &swiz) // return a[swiz[i]] for each 32-bit lane i (float)
{
return _mm256_permutevar8x32_ps(a, swiz);
}
SIMD_IWRAPPER_1I(shuffle_epi32);
template<int ImmT>
-static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer a, Integer b)
+static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer const &a, Integer const &b)
{
return castpd_si(shuffle_pd<ImmT>(castsi_pd(a), castsi_pd(b)));
}
// Load / store operations
//-----------------------------------------------------------------------
template<ScaleFactor ScaleT>
-static SIMDINLINE Float SIMDCALL i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
+static SIMDINLINE Float SIMDCALL i32gather_ps(float const* p, Integer const &idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
{
return _mm256_i32gather_ps(p, idx, static_cast<int>(ScaleT));
}
// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
template<ScaleFactor ScaleT>
-static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask)
+static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float const &old, float const* p, Integer const &idx, Float const &mask)
{
// g++ in debug mode needs the explicit .v suffix instead of relying on operator __m256()
// Only for this intrinsic - not sure why. :(
return _mm256_mask_i32gather_ps(old.v, p, idx.v, mask.v, static_cast<int>(ScaleT));
}
-static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer a)
+static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer const &a)
{
return static_cast<uint32_t>(_mm256_movemask_epi8(a));
}
using SIMD128T = SIMD128Impl::AVXImpl;
#define SIMD_WRAPPER_1(op) \
- static SIMDINLINE Float SIMDCALL op(Float a) \
+ static SIMDINLINE Float SIMDCALL op(Float const &a) \
{\
return Float\
{\
}
#define SIMD_WRAPPER_2(op) \
- static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
+ static SIMDINLINE Float SIMDCALL op(Float const &a, Float const &b) \
{\
return Float\
{\
#define SIMD_WRAPPER_2I(op) \
template<int ImmT>\
- static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
+ static SIMDINLINE Float SIMDCALL op(Float const &a, Float const &b) \
{\
return Float\
{\
#define SIMD_WRAPPER_2I_1(op) \
template<int ImmT>\
- static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
+ static SIMDINLINE Float SIMDCALL op(Float const &a, Float const &b) \
{\
return Float\
{\
}
#define SIMD_WRAPPER_3(op) \
- static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c) \
+ static SIMDINLINE Float SIMDCALL op(Float const &a, Float const &b, Float const &c) \
{\
return Float\
{\
}
#define SIMD_IWRAPPER_1(op) \
- static SIMDINLINE Integer SIMDCALL op(Integer a) \
+ static SIMDINLINE Integer SIMDCALL op(Integer const &a) \
{\
return Integer\
{\
}
#define SIMD_IWRAPPER_2(op) \
- static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
+ static SIMDINLINE Integer SIMDCALL op(Integer const &a, Integer const &b) \
{\
return Integer\
{\
#define SIMD_IWRAPPER_2I(op) \
template<int ImmT>\
- static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
+ static SIMDINLINE Integer SIMDCALL op(Integer const &a, Integer const &b) \
{\
return Integer\
{\
#define SIMD_IWRAPPER_2I_1(op) \
template<int ImmT>\
- static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
+ static SIMDINLINE Integer SIMDCALL op(Integer const &a, Integer const &b) \
{\
return Integer\
{\
#define SIMD_IWRAPPER_2I_2(op) \
template<int ImmT>\
- static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
+ static SIMDINLINE Integer SIMDCALL op(Integer const &a, Integer const &b) \
{\
return Integer\
{\
}
#define SIMD_IWRAPPER_3(op) \
- static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b, Integer c) \
+ static SIMDINLINE Integer SIMDCALL op(Integer const &a, Integer const &b, Integer const &c) \
{\
return Integer\
{\
SIMD_WRAPPER_2(sub_ps); // return a - b
template <RoundMode RMT>
-static SIMDINLINE Float SIMDCALL round_ps(Float a)
+static SIMDINLINE Float SIMDCALL round_ps(Float const &a)
{
return Float
{
};
}
-static SIMDINLINE Float SIMDCALL ceil_ps(Float a) { return round_ps<RoundMode::CEIL_NOEXC>(a); }
-static SIMDINLINE Float SIMDCALL floor_ps(Float a) { return round_ps<RoundMode::FLOOR_NOEXC>(a); }
+static SIMDINLINE Float SIMDCALL ceil_ps(Float const &a) { return round_ps<RoundMode::CEIL_NOEXC>(a); }
+static SIMDINLINE Float SIMDCALL floor_ps(Float const &a) { return round_ps<RoundMode::FLOOR_NOEXC>(a); }
//-----------------------------------------------------------------------
// Integer (various width) arithmetic operations
// Shift operations
//-----------------------------------------------------------------------
template<int ImmT>
-static SIMDINLINE Integer SIMDCALL slli_epi32(Integer a) // return a << ImmT
+static SIMDINLINE Integer SIMDCALL slli_epi32(Integer const &a) // return a << ImmT
{
return Integer
{
SIMD_IWRAPPER_2(sllv_epi32); // return a << b (uint32)
template<int ImmT>
-static SIMDINLINE Integer SIMDCALL srai_epi32(Integer a) // return a >> ImmT (int32)
+static SIMDINLINE Integer SIMDCALL srai_epi32(Integer const &a) // return a >> ImmT (int32)
{
return Integer
{
}
template<int ImmT>
-static SIMDINLINE Integer SIMDCALL srli_epi32(Integer a) // return a >> ImmT (uint32)
+static SIMDINLINE Integer SIMDCALL srli_epi32(Integer const &a) // return a >> ImmT (uint32)
{
return Integer
{
}
template<int ImmT> // for each 128-bit lane:
-static SIMDINLINE Integer SIMDCALL srli_si(Integer a) // return a >> (ImmT*8) (uint)
+static SIMDINLINE Integer SIMDCALL srli_si(Integer const &a) // return a >> (ImmT*8) (uint)
{
return Integer
{
};
}
template<int ImmT>
-static SIMDINLINE Float SIMDCALL srlisi_ps(Float a) // same as srli_si, but with Float cast to int
+static SIMDINLINE Float SIMDCALL srlisi_ps(Float const &a) // same as srli_si, but with Float cast to int
{
return Float
{
//-----------------------------------------------------------------------
// Conversion operations
//-----------------------------------------------------------------------
-static SIMDINLINE Float SIMDCALL castpd_ps(Double a) // return *(Float*)(&a)
+static SIMDINLINE Float SIMDCALL castpd_ps(Double const &a) // return *(Float*)(&a)
{
return Float
{
};
}
-static SIMDINLINE Integer SIMDCALL castps_si(Float a) // return *(Integer*)(&a)
+static SIMDINLINE Integer SIMDCALL castps_si(Float const &a) // return *(Integer*)(&a)
{
return Integer
{
};
}
-static SIMDINLINE Double SIMDCALL castsi_pd(Integer a) // return *(Double*)(&a)
+static SIMDINLINE Double SIMDCALL castsi_pd(Integer const &a) // return *(Double*)(&a)
{
return Double
{
};
}
-static SIMDINLINE Double SIMDCALL castps_pd(Float a) // return *(Double*)(&a)
+static SIMDINLINE Double SIMDCALL castps_pd(Float const &a) // return *(Double*)(&a)
{
return Double
{
};
}
-static SIMDINLINE Float SIMDCALL castsi_ps(Integer a) // return *(Float*)(&a)
+static SIMDINLINE Float SIMDCALL castsi_ps(Integer const &a) // return *(Float*)(&a)
{
return Float
{
};
}
-static SIMDINLINE Float SIMDCALL cvtepi32_ps(Integer a) // return (float)a (int32 --> float)
+static SIMDINLINE Float SIMDCALL cvtepi32_ps(Integer const &a) // return (float)a (int32 --> float)
{
return Float
{
};
}
-static SIMDINLINE Integer SIMDCALL cvtepu8_epi16(SIMD256Impl::Integer a) // return (int16)a (uint8 --> int16)
+static SIMDINLINE Integer SIMDCALL cvtepu8_epi16(SIMD256Impl::Integer const &a) // return (int16)a (uint8 --> int16)
{
return Integer
{
};
}
-static SIMDINLINE Integer SIMDCALL cvtepu8_epi32(SIMD256Impl::Integer a) // return (int32)a (uint8 --> int32)
+static SIMDINLINE Integer SIMDCALL cvtepu8_epi32(SIMD256Impl::Integer const &a) // return (int32)a (uint8 --> int32)
{
return Integer
{
};
}
-static SIMDINLINE Integer SIMDCALL cvtepu16_epi32(SIMD256Impl::Integer a) // return (int32)a (uint16 --> int32)
+static SIMDINLINE Integer SIMDCALL cvtepu16_epi32(SIMD256Impl::Integer const &a) // return (int32)a (uint16 --> int32)
{
return Integer
{
};
}
-static SIMDINLINE Integer SIMDCALL cvtepu16_epi64(SIMD256Impl::Integer a) // return (int64)a (uint16 --> int64)
+static SIMDINLINE Integer SIMDCALL cvtepu16_epi64(SIMD256Impl::Integer const &a) // return (int64)a (uint16 --> int64)
{
return Integer
{
};
}
-static SIMDINLINE Integer SIMDCALL cvtepu32_epi64(SIMD256Impl::Integer a) // return (int64)a (uint32 --> int64)
+static SIMDINLINE Integer SIMDCALL cvtepu32_epi64(SIMD256Impl::Integer const &a) // return (int64)a (uint32 --> int64)
{
return Integer
{
};
}
-static SIMDINLINE Integer SIMDCALL cvtps_epi32(Float a) // return (int32)a (float --> int32)
+static SIMDINLINE Integer SIMDCALL cvtps_epi32(Float const &a) // return (int32)a (float --> int32)
{
return Integer
{
};
}
-static SIMDINLINE Integer SIMDCALL cvttps_epi32(Float a) // return (int32)a (rnd_to_zero(float) --> int32)
+static SIMDINLINE Integer SIMDCALL cvttps_epi32(Float const &a) // return (int32)a (rnd_to_zero(float) --> int32)
{
return Integer
{
// Comparison operations
//-----------------------------------------------------------------------
template<CompareType CmpTypeT>
-static SIMDINLINE Float SIMDCALL cmp_ps(Float a, Float b) // return a (CmpTypeT) b
+static SIMDINLINE Float SIMDCALL cmp_ps(Float const &a, Float const &b) // return a (CmpTypeT) b
{
return Float
{
SIMD256T::template cmp_ps<CmpTypeT>(a.v8[1], b.v8[1]),
};
}
-static SIMDINLINE Float SIMDCALL cmplt_ps(Float a, Float b) { return cmp_ps<CompareType::LT_OQ>(a, b); }
-static SIMDINLINE Float SIMDCALL cmpgt_ps(Float a, Float b) { return cmp_ps<CompareType::GT_OQ>(a, b); }
-static SIMDINLINE Float SIMDCALL cmpneq_ps(Float a, Float b) { return cmp_ps<CompareType::NEQ_OQ>(a, b); }
-static SIMDINLINE Float SIMDCALL cmpeq_ps(Float a, Float b) { return cmp_ps<CompareType::EQ_OQ>(a, b); }
-static SIMDINLINE Float SIMDCALL cmpge_ps(Float a, Float b) { return cmp_ps<CompareType::GE_OQ>(a, b); }
-static SIMDINLINE Float SIMDCALL cmple_ps(Float a, Float b) { return cmp_ps<CompareType::LE_OQ>(a, b); }
+static SIMDINLINE Float SIMDCALL cmplt_ps(Float const &a, Float const &b) { return cmp_ps<CompareType::LT_OQ>(a, b); }
+static SIMDINLINE Float SIMDCALL cmpgt_ps(Float const &a, Float const &b) { return cmp_ps<CompareType::GT_OQ>(a, b); }
+static SIMDINLINE Float SIMDCALL cmpneq_ps(Float const &a, Float const &b) { return cmp_ps<CompareType::NEQ_OQ>(a, b); }
+static SIMDINLINE Float SIMDCALL cmpeq_ps(Float const &a, Float const &b) { return cmp_ps<CompareType::EQ_OQ>(a, b); }
+static SIMDINLINE Float SIMDCALL cmpge_ps(Float const &a, Float const &b) { return cmp_ps<CompareType::GE_OQ>(a, b); }
+static SIMDINLINE Float SIMDCALL cmple_ps(Float const &a, Float const &b) { return cmp_ps<CompareType::LE_OQ>(a, b); }
template<CompareType CmpTypeT>
-static SIMDINLINE Mask SIMDCALL cmp_ps_mask(Float a, Float b)
+static SIMDINLINE Mask SIMDCALL cmp_ps_mask(Float const &a, Float const &b)
{
return static_cast<Mask>(movemask_ps(cmp_ps<CmpTypeT>(a, b)));
}
SIMD_IWRAPPER_2(cmpgt_epi64); // return a > b (int64)
SIMD_IWRAPPER_2(cmplt_epi32); // return a < b (int32)
-static SIMDINLINE bool SIMDCALL testz_ps(Float a, Float b) // return all_lanes_zero(a & b) ? 1 : 0 (float)
+static SIMDINLINE bool SIMDCALL testz_ps(Float const &a, Float const &b) // return all_lanes_zero(a & b) ? 1 : 0 (float)
{
return 0 != (SIMD256T::testz_ps(a.v8[0], b.v8[0]) &
SIMD256T::testz_ps(a.v8[1], b.v8[1]));
}
-static SIMDINLINE int SIMDCALL testz_si(Integer a, Integer b) // return all_lanes_zero(a & b) ? 1 : 0 (int)
+static SIMDINLINE int SIMDCALL testz_si(Integer const &a, Integer const &b) // return all_lanes_zero(a & b) ? 1 : 0 (int)
{
return 0 != (SIMD256T::testz_si(a.v8[0], b.v8[0]) &
SIMD256T::testz_si(a.v8[1], b.v8[1]));
SIMD_WRAPPER_2I(blend_ps); // return ImmT ? b : a (float)
SIMD_IWRAPPER_2I(blend_epi32); // return ImmT ? b : a (int32)
SIMD_WRAPPER_3(blendv_ps); // return mask ? b : a (float)
-static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a, Integer b, Float mask) // return mask ? b : a (int)
+static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer const &a, Integer const &b, Float const &mask) // return mask ? b : a (int)
{
return Integer
{
};
}
-static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a, Integer b, Integer mask) // return mask ? b : a (int)
+static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer const &a, Integer const &b, Integer const &mask) // return mask ? b : a (int)
{
return Integer
{
}
template<int imm>
-static SIMDINLINE SIMD256Impl::Float SIMDCALL extract_ps(Float a)
+static SIMDINLINE SIMD256Impl::Float SIMDCALL extract_ps(Float const &a)
{
SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
return a.v8[imm];
}
template<int imm>
-static SIMDINLINE SIMD256Impl::Double SIMDCALL extract_pd(Double a)
+static SIMDINLINE SIMD256Impl::Double SIMDCALL extract_pd(Double const &a)
{
SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
return a.v8[imm];
}
template<int imm>
-static SIMDINLINE SIMD256Impl::Integer SIMDCALL extract_si(Integer a)
+static SIMDINLINE SIMD256Impl::Integer SIMDCALL extract_si(Integer const &a)
{
SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
return a.v8[imm];
}
template<int imm>
-static SIMDINLINE Float SIMDCALL insert_ps(Float a, SIMD256Impl::Float b)
+static SIMDINLINE Float SIMDCALL insert_ps(Float const &a, SIMD256Impl::Float const &b)
{
SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
- a.v8[imm] = b;
- return a;
+ Float r = a;
+ r.v8[imm] = b;
+ return r;
}
template<int imm>
-static SIMDINLINE Double SIMDCALL insert_pd(Double a, SIMD256Impl::Double b)
+static SIMDINLINE Double SIMDCALL insert_pd(Double const &a, SIMD256Impl::Double const &b)
{
SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
- a.v8[imm] = b;
- return a;
+ Double r = a;
+ r.v8[imm] = b;
+ return r;
}
template<int imm>
-static SIMDINLINE Integer SIMDCALL insert_si(Integer a, SIMD256Impl::Integer b)
+static SIMDINLINE Integer SIMDCALL insert_si(Integer const &a, SIMD256Impl::Integer const &b)
{
SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
- a.v8[imm] = b;
- return a;
+ Integer r = a;
+ r.v8[imm] = b;
+ return r;
}
SIMD_IWRAPPER_2(packs_epi16); // See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
SIMD_IWRAPPER_2(packus_epi16); // See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
SIMD_IWRAPPER_2(packus_epi32); // See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
-static SIMDINLINE Integer SIMDCALL permute_epi32(Integer a, Integer swiz) // return a[swiz[i]] for each 32-bit lane i (int32)
+static SIMDINLINE Integer SIMDCALL permute_epi32(Integer const &a, Integer const &swiz) // return a[swiz[i]] for each 32-bit lane i (int32)
{
Integer result;
return result;
}
-static SIMDINLINE Float SIMDCALL permute_ps(Float a, Integer swiz) // return a[swiz[i]] for each 32-bit lane i (float)
+static SIMDINLINE Float SIMDCALL permute_ps(Float const &a, Integer const &swiz) // return a[swiz[i]] for each 32-bit lane i (float)
{
Float result;
// AVX instructions for emulation.
//
template <int shuf>
-static SIMDINLINE Float SIMDCALL permute2f128_ps(Float a, Float b)
+static SIMDINLINE Float SIMDCALL permute2f128_ps(Float const &a, Float const &b)
{
return Float
{
}
template <int shuf>
-static SIMDINLINE Double SIMDCALL permute2f128_pd(Double a, Double b)
+static SIMDINLINE Double SIMDCALL permute2f128_pd(Double const &a, Double const &b)
{
return Double
{
}
template <int shuf>
-static SIMDINLINE Integer SIMDCALL permute2f128_si(Integer a, Integer b)
+static SIMDINLINE Integer SIMDCALL permute2f128_si(Integer const &a, Integer const &b)
{
return Integer
{
// Load / store operations
//-----------------------------------------------------------------------
template<ScaleFactor ScaleT>
-static SIMDINLINE Float SIMDCALL i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
+static SIMDINLINE Float SIMDCALL i32gather_ps(float const* p, Integer const &idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
{
return Float
{
// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
template<ScaleFactor ScaleT>
-static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask)
+static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float const &old, float const* p, Integer const &idx, Float const &mask)
{
return Float
{
};
}
-static SIMDINLINE void SIMDCALL maskstore_ps(float *p, Integer mask, Float src)
+static SIMDINLINE void SIMDCALL maskstore_ps(float *p, Integer const &mask, Float const &src)
{
SIMD256T::maskstore_ps(p, mask.v8[0], src.v8[0]);
SIMD256T::maskstore_ps(p + TARGET_SIMD_WIDTH, mask.v8[1], src.v8[1]);
}
-static SIMDINLINE uint64_t SIMDCALL movemask_epi8(Integer a)
+static SIMDINLINE uint64_t SIMDCALL movemask_epi8(Integer const &a)
{
uint64_t mask = static_cast<uint64_t>(SIMD256T::movemask_epi8(a.v8[0]));
mask |= static_cast<uint64_t>(SIMD256T::movemask_epi8(a.v8[1])) << (TARGET_SIMD_WIDTH * 4);
return mask;
}
-static SIMDINLINE uint32_t SIMDCALL movemask_pd(Double a)
+static SIMDINLINE uint32_t SIMDCALL movemask_pd(Double const &a)
{
uint32_t mask = static_cast<uint32_t>(SIMD256T::movemask_pd(a.v8[0]));
mask |= static_cast<uint32_t>(SIMD256T::movemask_pd(a.v8[1])) << (TARGET_SIMD_WIDTH / 2);
return mask;
}
-static SIMDINLINE uint32_t SIMDCALL movemask_ps(Float a)
+static SIMDINLINE uint32_t SIMDCALL movemask_ps(Float const &a)
{
uint32_t mask = static_cast<uint32_t>(SIMD256T::movemask_ps(a.v8[0]));
mask |= static_cast<uint32_t>(SIMD256T::movemask_ps(a.v8[1])) << TARGET_SIMD_WIDTH;
};
}
-static SIMDINLINE void SIMDCALL store_ps(float *p, Float a) // *p = a (stores all elements contiguously in memory)
+static SIMDINLINE void SIMDCALL store_ps(float *p, Float const &a) // *p = a (stores all elements contiguously in memory)
{
SIMD256T::store_ps(p, a.v8[0]);
SIMD256T::store_ps(p + TARGET_SIMD_WIDTH, a.v8[1]);
}
-static SIMDINLINE void SIMDCALL store_si(Integer *p, Integer a) // *p = a
+static SIMDINLINE void SIMDCALL store_si(Integer *p, Integer const &a) // *p = a
{
SIMD256T::store_si(&p->v8[0], a.v8[0]);
SIMD256T::store_si(&p->v8[1], a.v8[1]);
}
-static SIMDINLINE void SIMDCALL stream_ps(float *p, Float a) // *p = a (same as store_ps, but doesn't keep memory in cache)
+static SIMDINLINE void SIMDCALL stream_ps(float *p, Float const &a) // *p = a (same as store_ps, but doesn't keep memory in cache)
{
SIMD256T::stream_ps(p, a.v8[0]);
SIMD256T::stream_ps(p + TARGET_SIMD_WIDTH, a.v8[1]);
Float z;
Float w;
};
- SIMDINLINE Float& operator[] (const int i) { return v[i]; }
- SIMDINLINE Float const & operator[] (const int i) const { return v[i]; }
- SIMDINLINE Vec4& operator=(Vec4 const & in)
+ SIMDINLINE Float& SIMDCALL operator[] (const int i) { return v[i]; }
+ SIMDINLINE Float const & SIMDCALL operator[] (const int i) const { return v[i]; }
+ SIMDINLINE Vec4& SIMDCALL operator=(Vec4 const & in)
{
v[0] = in.v[0];
v[1] = in.v[1];
{
SIMDINLINE Float() = default;
SIMDINLINE Float(__m128 in) : v(in) {}
- SIMDINLINE Float& operator=(__m128 in) { v = in; return *this; }
- SIMDINLINE Float& operator=(Float const & in) { v = in.v; return *this; }
- SIMDINLINE operator __m128() const { return v; }
+ SIMDINLINE Float& SIMDCALL operator=(__m128 in) { v = in; return *this; }
+ SIMDINLINE Float& SIMDCALL operator=(Float const & in) { v = in.v; return *this; }
+ SIMDINLINE SIMDCALL operator __m128() const { return v; }
SIMDALIGN(__m128, 16) v;
};
{
SIMDINLINE Integer() = default;
SIMDINLINE Integer(__m128i in) : v(in) {}
- SIMDINLINE Integer& operator=(__m128i in) { v = in; return *this; }
- SIMDINLINE Integer& operator=(Integer const & in) { v = in.v; return *this; }
- SIMDINLINE operator __m128i() const { return v; }
+ SIMDINLINE Integer& SIMDCALL operator=(__m128i in) { v = in; return *this; }
+ SIMDINLINE Integer& SIMDCALL operator=(Integer const & in) { v = in.v; return *this; }
+ SIMDINLINE SIMDCALL operator __m128i() const { return v; }
+
SIMDALIGN(__m128i, 16) v;
};
{
SIMDINLINE Double() = default;
SIMDINLINE Double(__m128d in) : v(in) {}
- SIMDINLINE Double& operator=(__m128d in) { v = in; return *this; }
- SIMDINLINE Double& operator=(Double const & in) { v = in.v; return *this; }
- SIMDINLINE operator __m128d() const { return v; }
+ SIMDINLINE Double& SIMDCALL operator=(__m128d in) { v = in; return *this; }
+ SIMDINLINE Double& SIMDCALL operator=(Double const & in) { v = in.v; return *this; }
+ SIMDINLINE SIMDCALL operator __m128d() const { return v; }
+
SIMDALIGN(__m128d, 16) v;
};
{
SIMDINLINE Float() = default;
SIMDINLINE Float(__m256 in) : v(in) {}
- SIMDINLINE Float(SIMD128Impl::Float in_lo, SIMD128Impl::Float in_hi = _mm_setzero_ps())
+ SIMDINLINE Float(SIMD128Impl::Float const &in_lo, SIMD128Impl::Float const &in_hi = _mm_setzero_ps())
{
v = _mm256_insertf128_ps(_mm256_castps128_ps256(in_lo), in_hi, 0x1);
}
- SIMDINLINE Float& operator=(__m256 in) { v = in; return *this; }
- SIMDINLINE Float& operator=(Float const & in) { v = in.v; return *this; }
- SIMDINLINE operator __m256() const { return v; }
+ SIMDINLINE Float& SIMDCALL operator=(__m256 in) { v = in; return *this; }
+ SIMDINLINE Float& SIMDCALL operator=(Float const & in) { v = in.v; return *this; }
+ SIMDINLINE SIMDCALL operator __m256() const { return v; }
SIMDALIGN(__m256, 32) v;
SIMD128Impl::Float v4[2];
{
SIMDINLINE Integer() = default;
SIMDINLINE Integer(__m256i in) : v(in) {}
- SIMDINLINE Integer(SIMD128Impl::Integer in_lo, SIMD128Impl::Integer in_hi = _mm_setzero_si128())
+ SIMDINLINE Integer(SIMD128Impl::Integer const &in_lo, SIMD128Impl::Integer const &in_hi = _mm_setzero_si128())
{
v = _mm256_insertf128_si256(_mm256_castsi128_si256(in_lo), in_hi, 0x1);
}
- SIMDINLINE Integer& operator=(__m256i in) { v = in; return *this; }
- SIMDINLINE Integer& operator=(Integer const & in) { v = in.v; return *this; }
- SIMDINLINE operator __m256i() const { return v; }
+ SIMDINLINE Integer& SIMDCALL operator=(__m256i in) { v = in; return *this; }
+ SIMDINLINE Integer& SIMDCALL operator=(Integer const & in) { v = in.v; return *this; }
+ SIMDINLINE SIMDCALL operator __m256i() const { return v; }
SIMDALIGN(__m256i, 32) v;
SIMD128Impl::Integer v4[2];
union Double
{
SIMDINLINE Double() = default;
- SIMDINLINE Double(__m256d in) : v(in) {}
- SIMDINLINE Double(SIMD128Impl::Double in_lo, SIMD128Impl::Double in_hi = _mm_setzero_pd())
+ SIMDINLINE Double(__m256d const &in) : v(in) {}
+ SIMDINLINE Double(SIMD128Impl::Double const &in_lo, SIMD128Impl::Double const &in_hi = _mm_setzero_pd())
{
v = _mm256_insertf128_pd(_mm256_castpd128_pd256(in_lo), in_hi, 0x1);
}
- SIMDINLINE Double& operator=(__m256d in) { v = in; return *this; }
- SIMDINLINE Double& operator=(Double const & in) { v = in.v; return *this; }
- SIMDINLINE operator __m256d() const { return v; }
+ SIMDINLINE Double& SIMDCALL operator=(__m256d in) { v = in; return *this; }
+ SIMDINLINE Double& SIMDCALL operator=(Double const & in) { v = in.v; return *this; }
+ SIMDINLINE SIMDCALL operator __m256d() const { return v; }
SIMDALIGN(__m256d, 32) v;
SIMD128Impl::Double v4[2];
{
SIMDINLINE Float() = default;
SIMDINLINE Float(__m512 in) : v(in) {}
- SIMDINLINE Float(SIMD256Impl::Float in_lo, SIMD256Impl::Float in_hi = _mm256_setzero_ps()) { v8[0] = in_lo; v8[1] = in_hi; }
- SIMDINLINE Float& operator=(__m512 in) { v = in; return *this; }
- SIMDINLINE Float& operator=(Float const & in)
+ SIMDINLINE Float(SIMD256Impl::Float const &in_lo, SIMD256Impl::Float const &in_hi = _mm256_setzero_ps()) { v8[0] = in_lo; v8[1] = in_hi; }
+ SIMDINLINE Float& SIMDCALL operator=(__m512 in) { v = in; return *this; }
+ SIMDINLINE Float& SIMDCALL operator=(Float const & in)
{
#if SIMD_ARCH >= SIMD_ARCH_AVX512
v = in.v;
#endif
return *this;
}
- SIMDINLINE operator __m512() const { return v; }
+ SIMDINLINE SIMDCALL operator __m512() const { return v; }
SIMDALIGN(__m512, SIMD_ALIGNMENT_BYTES) v;
SIMD256Impl::Float v8[2];
{
SIMDINLINE Integer() = default;
SIMDINLINE Integer(__m512i in) : v(in) {}
- SIMDINLINE Integer(SIMD256Impl::Integer in_lo, SIMD256Impl::Integer in_hi = _mm256_setzero_si256()) { v8[0] = in_lo; v8[1] = in_hi; }
- SIMDINLINE Integer& operator=(__m512i in) { v = in; return *this; }
- SIMDINLINE Integer& operator=(Integer const & in)
+ SIMDINLINE Integer(SIMD256Impl::Integer const &in_lo, SIMD256Impl::Integer const &in_hi = _mm256_setzero_si256()) { v8[0] = in_lo; v8[1] = in_hi; }
+ SIMDINLINE Integer& SIMDCALL operator=(__m512i in) { v = in; return *this; }
+ SIMDINLINE Integer& SIMDCALL operator=(Integer const & in)
{
#if SIMD_ARCH >= SIMD_ARCH_AVX512
v = in.v;
return *this;
}
- SIMDINLINE operator __m512i() const { return v; }
+ SIMDINLINE SIMDCALL operator __m512i() const { return v; }
SIMDALIGN(__m512i, SIMD_ALIGNMENT_BYTES) v;
SIMD256Impl::Integer v8[2];
{
SIMDINLINE Double() = default;
SIMDINLINE Double(__m512d in) : v(in) {}
- SIMDINLINE Double(SIMD256Impl::Double in_lo, SIMD256Impl::Double in_hi = _mm256_setzero_pd()) { v8[0] = in_lo; v8[1] = in_hi; }
- SIMDINLINE Double& operator=(__m512d in) { v = in; return *this; }
- SIMDINLINE Double& operator=(Double const & in)
+ SIMDINLINE Double(SIMD256Impl::Double const &in_lo, SIMD256Impl::Double const &in_hi = _mm256_setzero_pd()) { v8[0] = in_lo; v8[1] = in_hi; }
+ SIMDINLINE Double& SIMDCALL operator=(__m512d in) { v = in; return *this; }
+ SIMDINLINE Double& SIMDCALL operator=(Double const & in)
{
#if SIMD_ARCH >= SIMD_ARCH_AVX512
v = in.v;
return *this;
}
- SIMDINLINE operator __m512d() const { return v; }
+ SIMDINLINE SIMDCALL operator __m512d() const { return v; }
SIMDALIGN(__m512d, SIMD_ALIGNMENT_BYTES) v;
SIMD256Impl::Double v8[2];
#define MASK 0xff
#endif
-static INLINE simdmask ComputeUserClipMask(uint8_t clipMask, float* pUserClipBuffer, simdscalar vI, simdscalar vJ)
+static INLINE simdmask ComputeUserClipMask(uint8_t clipMask, float* pUserClipBuffer, simdscalar const &vI, simdscalar const &vJ)
{
simdscalar vClipMask = _simd_setzero_ps();
uint32_t numClipDistance = _mm_popcnt_u32(clipMask);
template<typename T>
INLINE void CalcCentroidPos(SWR_PS_CONTEXT &psContext, const SWR_MULTISAMPLE_POS& samplePos,
const uint64_t *const coverageMask, const uint32_t sampleMask,
- const simdscalar vXSamplePosUL, const simdscalar vYSamplePosUL)
+ simdscalar const &vXSamplePosUL, simdscalar const &vYSamplePosUL)
{
uint32_t inputMask[KNOB_SIMD_WIDTH];
generateInputCoverage<T, T::InputCoverage>(coverageMask, inputMask, sampleMask);
}
INLINE void CalcCentroidBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext,
- const simdscalar vXSamplePosUL, const simdscalar vYSamplePosUL)
+ const simdscalar &vXSamplePosUL, const simdscalar &vYSamplePosUL)
{
// evaluate I,J
psContext.vI.centroid = vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.centroid, psContext.vY.centroid);
psContext.vOneOverW.centroid = vplaneps(coeffs.vAOneOverW, coeffs.vBOneOverW, coeffs.vCOneOverW, psContext.vI.centroid, psContext.vJ.centroid);
}
-INLINE simdmask CalcDepthBoundsAcceptMask(simdscalar z, float minz, float maxz)
+INLINE simdmask CalcDepthBoundsAcceptMask(simdscalar const &z, float minz, float maxz)
{
const simdscalar minzMask = _simd_cmpge_ps(z, _simd_set1_ps(minz));
const simdscalar maxzMask = _simd_cmple_ps(z, _simd_set1_ps(maxz));
// Merge Output to 4x2 SIMD Tile Format
INLINE void OutputMerger4x2(SWR_PS_CONTEXT &psContext, uint8_t* (&pColorBase)[SWR_NUM_RENDERTARGETS], uint32_t sample, const SWR_BLEND_STATE *pBlendState,
- const PFN_BLEND_JIT_FUNC (&pfnBlendFunc)[SWR_NUM_RENDERTARGETS], simdscalar &coverageMask, simdscalar depthPassMask, uint32_t renderTargetMask)
+ const PFN_BLEND_JIT_FUNC (&pfnBlendFunc)[SWR_NUM_RENDERTARGETS], simdscalar &coverageMask, simdscalar const &depthPassMask, uint32_t renderTargetMask)
{
// type safety guaranteed from template instantiation in BEChooser<>::GetFunc
const uint32_t rasterTileColorOffset = RasterTileColorOffset(sample);
#if USE_8x2_TILE_BACKEND
// Merge Output to 8x2 SIMD16 Tile Format
INLINE void OutputMerger8x2(SWR_PS_CONTEXT &psContext, uint8_t* (&pColorBase)[SWR_NUM_RENDERTARGETS], uint32_t sample, const SWR_BLEND_STATE *pBlendState,
- const PFN_BLEND_JIT_FUNC(&pfnBlendFunc)[SWR_NUM_RENDERTARGETS], simdscalar &coverageMask, simdscalar depthPassMask, uint32_t renderTargetMask, bool useAlternateOffset)
+ const PFN_BLEND_JIT_FUNC(&pfnBlendFunc)[SWR_NUM_RENDERTARGETS], simdscalar &coverageMask, simdscalar const &depthPassMask, uint32_t renderTargetMask, bool useAlternateOffset)
{
// type safety guaranteed from template instantiation in BEChooser<>::GetFunc
uint32_t rasterTileColorOffset = RasterTileColorOffset(sample);
#include "tilemgr.h"
// Function Prototype
-void BinPostSetupLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[3], simdscalar vRecipW[2], uint32_t primMask, simdscalari primID, simdscalari viewportIdx);
-void BinPostSetupPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primID, simdscalari viewportIdx);
+void BinPostSetupLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[3], simdscalar vRecipW[2], uint32_t primMask, simdscalari const &primID, simdscalari const &viewportIdx);
+void BinPostSetupPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari const &primID, simdscalari const &viewportIdx);
#if USE_SIMD16_FRONTEND
-void BinPostSetupLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[3], simd16scalar vRecipW[2], uint32_t primMask, simd16scalari primID, simd16scalari viewportIdx);
-void BinPostSetupPoints_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primID, simd16scalari viewportIdx);
+void BinPostSetupLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[3], simd16scalar vRecipW[2], uint32_t primMask, simd16scalari const &primID, simd16scalari const &viewportIdx);
+void BinPostSetupPoints_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari const &primID, simd16scalari const &viewportIdx);
#endif
//////////////////////////////////////////////////////////////////////////
uint32_t workerId,
simdvector tri[3],
uint32_t triMask,
- simdscalari primID)
+ simdscalari const &primID)
{
SWR_CONTEXT *pContext = pDC->pContext;
uint32_t workerId,
simd16vector tri[3],
uint32_t triMask,
- simd16scalari primID)
+ simd16scalari const &primID)
{
SWR_CONTEXT *pContext = pDC->pContext;
uint32_t workerId,
simdvector prim[],
uint32_t primMask,
- simdscalari primID,
- simdscalari viewportIdx)
+ simdscalari const &primID,
+ simdscalari const &viewportIdx)
{
SWR_CONTEXT *pContext = pDC->pContext;
uint32_t workerId,
simdvector prim[3],
uint32_t primMask,
- simdscalari primID)
+ simdscalari const &primID)
{
simdvector& primVerts = prim[0];
uint32_t workerId,
simd16vector prim[],
uint32_t primMask,
- simd16scalari primID,
- simd16scalari viewportIdx)
+ simd16scalari const &primID,
+ simd16scalari const &viewportIdx)
{
SWR_CONTEXT *pContext = pDC->pContext;
uint32_t workerId,
simd16vector prim[3],
uint32_t primMask,
- simd16scalari primID)
+ simd16scalari const &primID)
{
simd16vector& primVerts = prim[0];
simdvector prim[],
simdscalar recipW[],
uint32_t primMask,
- simdscalari primID,
- simdscalari viewportIdx)
+ simdscalari const &primID,
+ simdscalari const &viewportIdx)
{
SWR_CONTEXT *pContext = pDC->pContext;
simd16vector prim[],
simd16scalar recipW[],
uint32_t primMask,
- simd16scalari primID,
- simd16scalari viewportIdx)
+ simd16scalari const &primID,
+ simd16scalari const &viewportIdx)
{
SWR_CONTEXT *pContext = pDC->pContext;
uint32_t workerId,
simdvector prim[],
uint32_t primMask,
- simdscalari primID)
+ simdscalari const &primID)
{
const API_STATE& state = GetApiState(pDC);
const SWR_RASTSTATE& rastState = state.rastState;
uint32_t workerId,
simd16vector prim[3],
uint32_t primMask,
- simd16scalari primID)
+ simd16scalari const &primID)
{
const API_STATE& state = GetApiState(pDC);
const SWR_RASTSTATE& rastState = state.rastState;
/// @brief Convert the X,Y coords of a triangle to the requested Fixed
/// Point precision from FP32.
template <typename PT = FixedPointTraits<Fixed_16_8>>
-INLINE simdscalari fpToFixedPointVertical(const simdscalar vIn)
+INLINE simdscalari fpToFixedPointVertical(const simdscalar &vIn)
{
simdscalar vFixed = _simd_mul_ps(vIn, _simd_set1_ps(PT::ScaleT::value));
return _simd_cvtps_epi32(vFixed);
#if USE_SIMD16_FRONTEND
template <typename PT = FixedPointTraits<Fixed_16_8>>
-INLINE simd16scalari fpToFixedPointVertical(const simd16scalar vIn)
+INLINE simd16scalari fpToFixedPointVertical(const simd16scalar &vIn)
{
simd16scalar vFixed = _simd16_mul_ps(vIn, _simd16_set1_ps(PT::ScaleT::value));
return _simd16_cvtps_epi32(vFixed);
return i;
}
-void ClipTriangles(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId)
+void ClipTriangles(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari const &primId)
{
SWR_CONTEXT *pContext = pDC->pContext;
AR_BEGIN(FEClipTriangles, pDC->drawId);
AR_END(FEClipTriangles, 1);
}
-void ClipLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId)
+void ClipLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari const &primId)
{
SWR_CONTEXT *pContext = pDC->pContext;
AR_BEGIN(FEClipLines, pDC->drawId);
AR_END(FEClipLines, 1);
}
-void ClipPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId)
+void ClipPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari const &primId)
{
SWR_CONTEXT *pContext = pDC->pContext;
AR_BEGIN(FEClipPoints, pDC->drawId);
}
#if USE_SIMD16_FRONTEND
-void SIMDCALL ClipTriangles_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId)
+void SIMDCALL ClipTriangles_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari const &primId)
{
SWR_CONTEXT *pContext = pDC->pContext;
AR_BEGIN(FEClipTriangles, pDC->drawId);
AR_END(FEClipTriangles, 1);
}
-void SIMDCALL ClipLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId)
+void SIMDCALL ClipLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari const &primId)
{
SWR_CONTEXT *pContext = pDC->pContext;
AR_BEGIN(FEClipLines, pDC->drawId);
AR_END(FEClipLines, 1);
}
-void SIMDCALL ClipPoints_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId)
+void SIMDCALL ClipPoints_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari const &primId)
{
SWR_CONTEXT *pContext = pDC->pContext;
AR_BEGIN(FEClipPoints, pDC->drawId);
#define GUARDBAND_CLIP_MASK (FRUSTUM_NEAR|FRUSTUM_FAR|GUARDBAND_LEFT|GUARDBAND_TOP|GUARDBAND_RIGHT|GUARDBAND_BOTTOM|NEGW)
INLINE
-void ComputeClipCodes(const API_STATE& state, const simdvector& vertex, simdscalar& clipCodes, simdscalari viewportIndexes)
+void ComputeClipCodes(const API_STATE& state, const simdvector& vertex, simdscalar& clipCodes, simdscalari const &viewportIndexes)
{
clipCodes = _simd_setzero_ps();
#if USE_SIMD16_FRONTEND
INLINE
-void ComputeClipCodes(const API_STATE& state, const simd16vector& vertex, simd16scalar& clipCodes, simd16scalari viewportIndexes)
+void ComputeClipCodes(const API_STATE& state, const simd16vector& vertex, simd16scalar& clipCodes, simd16scalari const &viewportIndexes)
{
clipCodes = _simd16_setzero_ps();
class Clipper
{
public:
- Clipper(uint32_t in_workerId, DRAW_CONTEXT* in_pDC) :
+ INLINE Clipper(uint32_t in_workerId, DRAW_CONTEXT* in_pDC) :
workerId(in_workerId), pDC(in_pDC), state(GetApiState(in_pDC))
{
static_assert(NumVertsPerPrim >= 1 && NumVertsPerPrim <= 3, "Invalid NumVertsPerPrim");
}
- void ComputeClipCodes(simdvector vertex[], simdscalari viewportIndexes)
+ INLINE void ComputeClipCodes(simdvector vertex[], simdscalari const &viewportIndexes)
{
for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
{
}
#if USE_SIMD16_FRONTEND
- void ComputeClipCodes(simd16vector vertex[], simd16scalari viewportIndexes)
+ INLINE void ComputeClipCodes(simd16vector vertex[], simd16scalari const &viewportIndexes)
{
for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
{
}
#endif
- simdscalar ComputeClipCodeIntersection()
+ INLINE simdscalar ComputeClipCodeIntersection()
{
simdscalar result = this->clipCodes[0];
for (uint32_t i = 1; i < NumVertsPerPrim; ++i)
}
#if USE_SIMD16_FRONTEND
- simd16scalar ComputeClipCodeIntersection_simd16()
+ INLINE simd16scalar ComputeClipCodeIntersection_simd16()
{
simd16scalar result = this->clipCodes_simd16[0];
for (uint32_t i = 1; i < NumVertsPerPrim; ++i)
}
#endif
- simdscalar ComputeClipCodeUnion()
+ INLINE simdscalar ComputeClipCodeUnion()
{
simdscalar result = this->clipCodes[0];
for (uint32_t i = 1; i < NumVertsPerPrim; ++i)
}
#if USE_SIMD16_FRONTEND
- simd16scalar ComputeClipCodeUnion_simd16()
+ INLINE simd16scalar ComputeClipCodeUnion_simd16()
{
simd16scalar result = this->clipCodes_simd16[0];
for (uint32_t i = 1; i < NumVertsPerPrim; ++i)
}
#endif
- int ComputeNegWMask()
+ INLINE int ComputeNegWMask()
{
simdscalar clipCodeUnion = ComputeClipCodeUnion();
clipCodeUnion = _simd_and_ps(clipCodeUnion, _simd_castsi_ps(_simd_set1_epi32(NEGW)));
return _simd_movemask_ps(_simd_cmpneq_ps(clipCodeUnion, _simd_setzero_ps()));
}
- int ComputeClipMask()
+ INLINE int ComputeClipMask()
{
simdscalar clipUnion = ComputeClipCodeUnion();
clipUnion = _simd_and_ps(clipUnion, _simd_castsi_ps(_simd_set1_epi32(GUARDBAND_CLIP_MASK)));
}
#if USE_SIMD16_FRONTEND
- int ComputeClipMask_simd16()
+ INLINE int ComputeClipMask_simd16()
{
simd16scalar clipUnion = ComputeClipCodeUnion_simd16();
clipUnion = _simd16_and_ps(clipUnion, _simd16_castsi_ps(_simd16_set1_epi32(GUARDBAND_CLIP_MASK)));
#endif
// clipper is responsible for culling any prims with NAN coordinates
- int ComputeNaNMask(simdvector prim[])
+ INLINE int ComputeNaNMask(simdvector prim[])
{
simdscalar vNanMask = _simd_setzero_ps();
for (uint32_t e = 0; e < NumVertsPerPrim; ++e)
}
#if USE_SIMD16_FRONTEND
- int ComputeNaNMask(simd16vector prim[])
+ INLINE int ComputeNaNMask(simd16vector prim[])
{
simd16scalar vNanMask = _simd16_setzero_ps();
for (uint32_t e = 0; e < NumVertsPerPrim; ++e)
}
#endif
- int ComputeUserClipCullMask(PA_STATE& pa, simdvector prim[])
+ INLINE int ComputeUserClipCullMask(PA_STATE& pa, simdvector prim[])
{
uint8_t cullMask = this->state.rastState.cullDistanceMask;
simdscalar vClipCullMask = _simd_setzero_ps();
}
#if USE_SIMD16_FRONTEND
- int ComputeUserClipCullMask(PA_STATE& pa, simd16vector prim[])
+ INLINE int ComputeUserClipCullMask(PA_STATE& pa, simd16vector prim[])
{
uint8_t cullMask = this->state.rastState.cullDistanceMask;
simd16scalar vClipCullMask = _simd16_setzero_ps();
#endif
// clip SIMD primitives
- void ClipSimd(const simdscalar& vPrimMask, const simdscalar& vClipMask, PA_STATE& pa, const simdscalari& vPrimId)
+ INLINE void ClipSimd(const simdscalar& vPrimMask, const simdscalar& vClipMask, PA_STATE& pa, const simdscalari& vPrimId)
{
// input/output vertex store for clipper
simdvertex vertices[7]; // maximum 7 verts generated per triangle
#endif
// execute the clipper stage
- void ExecuteStage(PA_STATE& pa, simdvector prim[], uint32_t primMask, simdscalari primId)
+ void ExecuteStage(PA_STATE& pa, simdvector prim[], uint32_t primMask, simdscalari const &primId)
{
SWR_ASSERT(this->pDC != nullptr);
SWR_CONTEXT* pContext = this->pDC->pContext;
}
#if USE_SIMD16_FRONTEND
- void ExecuteStage(PA_STATE& pa, simd16vector prim[], uint32_t primMask, simd16scalari primId)
+ void ExecuteStage(PA_STATE& pa, simd16vector prim[], uint32_t primMask, simd16scalari const &primId)
{
SWR_ASSERT(pa.pDC != nullptr);
SWR_CONTEXT* pContext = pa.pDC->pContext;
#endif
private:
- inline simdscalar ComputeInterpFactor(simdscalar boundaryCoord0, simdscalar boundaryCoord1)
+ inline simdscalar ComputeInterpFactor(simdscalar const &boundaryCoord0, simdscalar const &boundaryCoord1)
{
return _simd_div_ps(boundaryCoord0, _simd_sub_ps(boundaryCoord0, boundaryCoord1));
}
#if USE_SIMD16_FRONTEND
- inline simd16scalar ComputeInterpFactor(simd16scalar boundaryCoord0, simd16scalar boundaryCoord1)
+ inline simd16scalar ComputeInterpFactor(simd16scalar const &boundaryCoord0, simd16scalar const &boundaryCoord1)
{
return _simd16_div_ps(boundaryCoord0, _simd16_sub_ps(boundaryCoord0, boundaryCoord1));
}
#endif
- inline simdscalari ComputeOffsets(uint32_t attrib, simdscalari vIndices, uint32_t component)
+ inline simdscalari ComputeOffsets(uint32_t attrib, simdscalari const &vIndices, uint32_t component)
{
const uint32_t simdVertexStride = sizeof(simdvertex);
const uint32_t componentStride = sizeof(simdscalar);
}
#if USE_SIMD16_FRONTEND
- inline simd16scalari ComputeOffsets(uint32_t attrib, simd16scalari vIndices, uint32_t component)
+ inline simd16scalari ComputeOffsets(uint32_t attrib, simd16scalari const &vIndices, uint32_t component)
{
const uint32_t simdVertexStride = sizeof(simd16vertex);
const uint32_t componentStride = sizeof(simd16scalar);
#endif
// gathers a single component for a given attribute for each SIMD lane
- inline simdscalar GatherComponent(const float* pBuffer, uint32_t attrib, simdscalar vMask, simdscalari vIndices, uint32_t component)
+ inline simdscalar GatherComponent(const float* pBuffer, uint32_t attrib, simdscalar const &vMask, simdscalari const &vIndices, uint32_t component)
{
simdscalari vOffsets = ComputeOffsets(attrib, vIndices, component);
simdscalar vSrc = _mm256_undefined_ps();
}
#if USE_SIMD16_FRONTEND
- inline simd16scalar GatherComponent(const float* pBuffer, uint32_t attrib, simd16scalar vMask, simd16scalari vIndices, uint32_t component)
+ inline simd16scalar GatherComponent(const float* pBuffer, uint32_t attrib, simd16scalar const &vMask, simd16scalari const &vIndices, uint32_t component)
{
simd16scalari vOffsets = ComputeOffsets(attrib, vIndices, component);
simd16scalar vSrc = _simd16_setzero_ps();
}
#endif
- inline void ScatterComponent(const float* pBuffer, uint32_t attrib, simdscalar vMask, simdscalari vIndices, uint32_t component, simdscalar vSrc)
+ inline void ScatterComponent(const float* pBuffer, uint32_t attrib, simdscalar const &vMask, simdscalari const &vIndices, uint32_t component, simdscalar const &vSrc)
{
simdscalari vOffsets = ComputeOffsets(attrib, vIndices, component);
}
#if USE_SIMD16_FRONTEND
- inline void ScatterComponent(const float* pBuffer, uint32_t attrib, simd16scalar vMask, simd16scalari vIndices, uint32_t component, simd16scalar vSrc)
+ inline void ScatterComponent(const float* pBuffer, uint32_t attrib, simd16scalar const &vMask, simd16scalari const &vIndices, uint32_t component, simd16scalar const &vSrc)
{
simd16scalari vOffsets = ComputeOffsets(attrib, vIndices, component);
// pipeline stage functions
-void ClipTriangles(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId);
-void ClipLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId);
-void ClipPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId);
+void ClipTriangles(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari const &primId);
+void ClipLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari const &primId);
+void ClipPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari const &primId);
#if USE_SIMD16_FRONTEND
-void SIMDCALL ClipTriangles_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId);
-void SIMDCALL ClipLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId);
-void SIMDCALL ClipPoints_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId);
+void SIMDCALL ClipTriangles_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari const &primId);
+void SIMDCALL ClipLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari const &primId);
+void SIMDCALL ClipPoints_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari const &primId);
#endif
// function signature for pipeline stages that execute after primitive assembly
typedef void(*PFN_PROCESS_PRIMS)(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[],
- uint32_t primMask, simdscalari primID);
+ uint32_t primMask, simdscalari const &primID);
#if ENABLE_AVX512_SIMD16
// function signature for pipeline stages that execute after primitive assembly
typedef void(SIMDCALL *PFN_PROCESS_PRIMS_SIMD16)(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[],
- uint32_t primMask, simd16scalari primID);
+ uint32_t primMask, simd16scalari const &primID);
#endif
OSALIGNLINE(struct) API_STATE
// pipeline function pointer types
typedef void(*PFN_BACKEND_FUNC)(DRAW_CONTEXT*, uint32_t, uint32_t, uint32_t, SWR_TRIANGLE_DESC&, RenderOutputBuffers&);
typedef void(*PFN_OUTPUT_MERGER)(SWR_PS_CONTEXT &, uint8_t* (&)[SWR_NUM_RENDERTARGETS], uint32_t, const SWR_BLEND_STATE*,
- const PFN_BLEND_JIT_FUNC (&)[SWR_NUM_RENDERTARGETS], simdscalar&, simdscalar);
+ const PFN_BLEND_JIT_FUNC (&)[SWR_NUM_RENDERTARGETS], simdscalar&, simdscalar const &);
typedef void(*PFN_CALC_PIXEL_BARYCENTRICS)(const BarycentricCoeffs&, SWR_PS_CONTEXT &);
typedef void(*PFN_CALC_SAMPLE_BARYCENTRICS)(const BarycentricCoeffs&, SWR_PS_CONTEXT&);
typedef void(*PFN_CALC_CENTROID_BARYCENTRICS)(const BarycentricCoeffs&, SWR_PS_CONTEXT &, const uint64_t *const, const uint32_t,
- const simdscalar, const simdscalar);
+ simdscalar const &, simdscalar const &);
struct BACKEND_FUNCS
{
#include "format_conversion.h"
INLINE
-void StencilOp(SWR_STENCILOP op, simdscalar mask, simdscalar stencilRefps, simdscalar &stencilps)
+void StencilOp(SWR_STENCILOP op, simdscalar const &mask, simdscalar const &stencilRefps, simdscalar &stencilps)
{
simdscalari stencil = _simd_castps_si(stencilps);
template<SWR_FORMAT depthFormatT>
-simdscalar QuantizeDepth(simdscalar depth)
+simdscalar QuantizeDepth(simdscalar const &depth)
{
SWR_TYPE depthType = FormatTraits<depthFormatT>::GetType(0);
uint32_t depthBpc = FormatTraits<depthFormatT>::GetBPC(0);
INLINE
simdscalar DepthStencilTest(const API_STATE* pState,
- bool frontFacing, uint32_t viewportIndex, simdscalar interpZ, uint8_t* pDepthBase, simdscalar coverageMask,
+ bool frontFacing, uint32_t viewportIndex, simdscalar const &iZ, uint8_t* pDepthBase, simdscalar const &coverageMask,
uint8_t *pStencilBase, simdscalar* pStencilMask)
{
static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format");
// clamp Z to viewport [minZ..maxZ]
simdscalar vMinZ = _simd_broadcast_ss(&pViewport->minZ);
simdscalar vMaxZ = _simd_broadcast_ss(&pViewport->maxZ);
- interpZ = _simd_min_ps(vMaxZ, _simd_max_ps(vMinZ, interpZ));
+ simdscalar interpZ = _simd_min_ps(vMaxZ, _simd_max_ps(vMinZ, iZ));
if (pDSState->depthTestEnable)
{
INLINE
void DepthStencilWrite(const SWR_VIEWPORT* pViewport, const SWR_DEPTH_STENCIL_STATE* pDSState,
- bool frontFacing, simdscalar interpZ, uint8_t* pDepthBase, const simdscalar& depthMask, const simdscalar& coverageMask,
+ bool frontFacing, simdscalar const &iZ, uint8_t* pDepthBase, const simdscalar& depthMask, const simdscalar& coverageMask,
uint8_t *pStencilBase, const simdscalar& stencilMask)
{
if (pDSState->depthWriteEnable)
// clamp Z to viewport [minZ..maxZ]
simdscalar vMinZ = _simd_broadcast_ss(&pViewport->minZ);
simdscalar vMaxZ = _simd_broadcast_ss(&pViewport->maxZ);
- interpZ = _simd_min_ps(vMaxZ, _simd_max_ps(vMinZ, interpZ));
+ simdscalar interpZ = _simd_min_ps(vMaxZ, _simd_max_ps(vMinZ, iZ));
simdscalar vMask = _simd_and_ps(depthMask, coverageMask);
_simd_maskstore_ps((float*)pDepthBase, _simd_castps_si(vMask), interpZ);
/// @param vComp - SIMD vector of floats
/// @param Component - component
template<SWR_FORMAT Format>
-INLINE simdscalar Clamp(simdscalar vComp, uint32_t Component)
+INLINE simdscalar Clamp(simdscalar const &vC, uint32_t Component)
{
+ simdscalar vComp = vC;
if (FormatTraits<Format>::isNormalized(Component))
{
if (FormatTraits<Format>::GetType(Component) == SWR_TYPE_UNORM)
/// @param vComp - SIMD vector of floats
/// @param Component - component
template<SWR_FORMAT Format>
-INLINE simdscalar Normalize(simdscalar vComp, uint32_t Component)
+INLINE simdscalar Normalize(simdscalar const &vC, uint32_t Component)
{
+ simdscalar vComp = vC;
if (FormatTraits<Format>::isNormalized(Component))
{
vComp = _simd_mul_ps(vComp, _simd_set1_ps(FormatTraits<Format>::fromFloat(Component)));
/// @param vComp - SIMD vector of floats
/// @param Component - component
template<SWR_FORMAT Format>
-INLINE simd16scalar SIMDCALL Clamp(simd16scalar vComp, uint32_t Component)
+INLINE simd16scalar SIMDCALL Clamp(simd16scalar const &v, uint32_t Component)
{
+ simd16scalar vComp = v;
if (FormatTraits<Format>::isNormalized(Component))
{
if (FormatTraits<Format>::GetType(Component) == SWR_TYPE_UNORM)
/// @param vComp - SIMD vector of floats
/// @param Component - component
template<SWR_FORMAT Format>
-INLINE simd16scalar SIMDCALL Normalize(simd16scalar vComp, uint32_t Component)
+INLINE simd16scalar SIMDCALL Normalize(simd16scalar const &vComp, uint32_t Component)
{
+ simd16scalar r = vComp;
if (FormatTraits<Format>::isNormalized(Component))
{
- vComp = _simd16_mul_ps(vComp, _simd16_set1_ps(FormatTraits<Format>::fromFloat(Component)));
- vComp = _simd16_castsi_ps(_simd16_cvtps_epi32(vComp));
+ r = _simd16_mul_ps(r, _simd16_set1_ps(FormatTraits<Format>::fromFloat(Component)));
+ r = _simd16_castsi_ps(_simd16_cvtps_epi32(r));
}
- return vComp;
+ return r;
}
//////////////////////////////////////////////////////////////////////////
{
static const uint32_t MyNumBits = NumBits;
static simdscalar loadSOA(const uint8_t *pSrc) = delete;
- static void storeSOA(uint8_t *pDst, simdscalar src) = delete;
+ static void storeSOA(uint8_t *pDst, simdscalar const &src) = delete;
static simdscalar unpack(simdscalar &in) = delete;
static simdscalar pack(simdscalar &in) = delete;
#if ENABLE_AVX512_SIMD16
static simd16scalar loadSOA_16(const uint8_t *pSrc) = delete;
- static void SIMDCALL storeSOA(uint8_t *pDst, simd16scalar src) = delete;
+ static void SIMDCALL storeSOA(uint8_t *pDst, simd16scalar const &src) = delete;
static simd16scalar unpack(simd16scalar &in) = delete;
static simd16scalar pack(simd16scalar &in) = delete;
#endif
static const uint32_t MyNumBits = 0;
static simdscalar loadSOA(const uint8_t *pSrc) { return _simd_setzero_ps(); }
- static void storeSOA(uint8_t *pDst, simdscalar src) { return; }
+ static void storeSOA(uint8_t *pDst, simdscalar const &src) { return; }
static simdscalar unpack(simdscalar &in) { return _simd_setzero_ps(); }
static simdscalar pack(simdscalar &in) { return _simd_setzero_ps(); }
#if ENABLE_AVX512_SIMD16
static simd16scalar loadSOA_16(const uint8_t *pSrc) { return _simd16_setzero_ps(); }
- static void SIMDCALL storeSOA(uint8_t *pDst, simd16scalar src) { return; }
+ static void SIMDCALL storeSOA(uint8_t *pDst, simd16scalar const &src) { return; }
static simd16scalar unpack(simd16scalar &in) { return _simd16_setzero_ps(); }
static simd16scalar pack(simd16scalar &in) { return _simd16_setzero_ps(); }
#endif
#endif
}
- static void storeSOA(uint8_t *pDst, simdscalar src)
+ static void storeSOA(uint8_t *pDst, simdscalar const &src)
{
// store simd bytes
#if KNOB_SIMD_WIDTH == 8
return result;
}
- static void SIMDCALL storeSOA(uint8_t *pDst, simd16scalar src)
+ static void SIMDCALL storeSOA(uint8_t *pDst, simd16scalar const &src)
{
// store simd16 bytes
_mm_store_ps(reinterpret_cast<float *>(pDst), _mm256_castps256_ps128(_simd16_extract_ps(src, 0)));
#endif
}
- static void storeSOA(uint8_t *pDst, simdscalar src)
+ static void storeSOA(uint8_t *pDst, simdscalar const &src)
{
// store simd bytes
#if KNOB_SIMD_WIDTH == 8
return result;
}
- static void SIMDCALL storeSOA(uint8_t *pDst, simd16scalar src)
+ static void SIMDCALL storeSOA(uint8_t *pDst, simd16scalar const &src)
{
// store simd16 bytes
_mm_store_ps(reinterpret_cast<float *>(pDst), _mm256_castps256_ps128(_simd16_extract_ps(src, 0)));
#endif
}
- static void storeSOA(uint8_t *pDst, simdscalar src)
+ static void storeSOA(uint8_t *pDst, simdscalar const &src)
{
#if KNOB_SIMD_WIDTH == 8
// store 16B (2B * 8)
return result;
}
- static void SIMDCALL storeSOA(uint8_t *pDst, simd16scalar src)
+ static void SIMDCALL storeSOA(uint8_t *pDst, simd16scalar const &src)
{
_simd_store_ps(reinterpret_cast<float *>(pDst), _simd16_extract_ps(src, 0));
}
#endif
}
- static void storeSOA(uint8_t *pDst, simdscalar src)
+ static void storeSOA(uint8_t *pDst, simdscalar const &src)
{
#if KNOB_SIMD_WIDTH == 8
// store 16B (2B * 8)
return result;
}
- static void SIMDCALL storeSOA(uint8_t *pDst, simd16scalar src)
+ static void SIMDCALL storeSOA(uint8_t *pDst, simd16scalar const &src)
{
_simd_store_ps(reinterpret_cast<float *>(pDst), _simd16_extract_ps(src, 0));
}
static const uint32_t MyNumBits = 32;
static simdscalar loadSOA(const uint8_t *pSrc) { return _simd_load_ps((const float*)pSrc); }
- static void storeSOA(uint8_t *pDst, simdscalar src) { _simd_store_ps((float*)pDst, src); }
+ static void storeSOA(uint8_t *pDst, simdscalar const &src) { _simd_store_ps((float*)pDst, src); }
static simdscalar unpack(simdscalar &in) { return in; }
static simdscalar pack(simdscalar &in) { return in; }
#if ENABLE_AVX512_SIMD16
return _simd16_load_ps(reinterpret_cast<const float *>(pSrc));
}
- static void SIMDCALL storeSOA(uint8_t *pDst, simd16scalar src)
+ static void SIMDCALL storeSOA(uint8_t *pDst, simd16scalar const &src)
{
_simd16_store_ps(reinterpret_cast<float *>(pDst), src);
}
#if ENABLE_AVX512_SIMD16
template< unsigned expnum, unsigned expden, unsigned coeffnum, unsigned coeffden >
-inline static simd16scalar SIMDCALL fastpow(simd16scalar value)
+inline static simd16scalar SIMDCALL fastpow(simd16scalar const &value)
{
static const float factor1 = exp2(127.0f * expden / expnum - 127.0f)
* powf(1.0f * coeffnum / coeffden, 1.0f * expden / expnum);
return result;
}
-inline static simd16scalar SIMDCALL pow512_4(simd16scalar arg)
+inline static simd16scalar SIMDCALL pow512_4(simd16scalar const &arg)
{
// 5/12 is too small, so compute the 4th root of 20/12 instead.
// 20/12 = 5/3 = 1 + 2/3 = 2 - 1/3. 2/3 is a suitable argument for fastpow.
return xavg;
}
-inline static simd16scalar SIMDCALL powf_wrapper(const simd16scalar base, float exp)
+inline static simd16scalar SIMDCALL powf_wrapper(const simd16scalar &base, float exp)
{
const float *f = reinterpret_cast<const float *>(&base);
return TypeTraits<X, NumBitsX>::loadSOA(pSrc);
}
- INLINE static void storeSOA(uint32_t comp, uint8_t *pDst, simdscalar src)
+ INLINE static void storeSOA(uint32_t comp, uint8_t *pDst, simdscalar const &src)
{
switch (comp)
{
return;
}
SWR_INVALID("Invalid component: %d", comp);
- TypeTraits<X, NumBitsX>::storeSOA(pDst, src);
}
INLINE static simdscalar unpack(uint32_t comp, simdscalar &in)
{
+ simdscalar out;
switch (comp)
{
case 0:
- return TypeTraits<X, NumBitsX>::unpack(in);
+ out = TypeTraits<X, NumBitsX>::unpack(in); break;
case 1:
- return TypeTraits<Y, NumBitsY>::unpack(in);
+ out = TypeTraits<Y, NumBitsY>::unpack(in); break;
case 2:
- return TypeTraits<Z, NumBitsZ>::unpack(in);
+ out = TypeTraits<Z, NumBitsZ>::unpack(in); break;
case 3:
- return TypeTraits<W, NumBitsW>::unpack(in);
+ out = TypeTraits<W, NumBitsW>::unpack(in); break;
+ default:
+ SWR_INVALID("Invalid component: %d", comp);
+ out = in;
+ break;
}
- SWR_INVALID("Invalid component: %d", comp);
- return TypeTraits<X, NumBitsX>::unpack(in);
+ return out;
}
INLINE static simdscalar pack(uint32_t comp, simdscalar &in)
{
+ simdscalar out;
switch (comp)
{
case 0:
- return TypeTraits<X, NumBitsX>::pack(in);
+ out = TypeTraits<X, NumBitsX>::pack(in); break;
case 1:
- return TypeTraits<Y, NumBitsY>::pack(in);
+ out = TypeTraits<Y, NumBitsY>::pack(in); break;
case 2:
- return TypeTraits<Z, NumBitsZ>::pack(in);
+ out = TypeTraits<Z, NumBitsZ>::pack(in); break;
case 3:
- return TypeTraits<W, NumBitsW>::pack(in);
+ out = TypeTraits<W, NumBitsW>::pack(in); break;
+ default:
+ SWR_INVALID("Invalid component: %d", comp);
+ out = in;
+ break;
}
- SWR_INVALID("Invalid component: %d", comp);
- return TypeTraits<X, NumBitsX>::pack(in);
+ return out;
}
INLINE static simdscalar convertSrgb(uint32_t comp, simdscalar &in)
return TypeTraits<X, NumBitsX>::loadSOA_16(pSrc);
}
- INLINE static void SIMDCALL storeSOA(uint32_t comp, uint8_t *pDst, simd16scalar src)
+ INLINE static void SIMDCALL storeSOA(uint32_t comp, uint8_t *pDst, simd16scalar const &src)
{
switch (comp)
{
#if USE_SIMD16_FRONTEND
uint32_t numPrims_simd8,
#endif
- simdscalari primID)
+ simdscalari const &primID)
{
SWR_CONTEXT *pContext = pDC->pContext;
#if USE_SIMD16_FRONTEND
uint32_t numPrims_simd8,
#endif
- simdscalari primID)
+ simdscalari const &primID)
{
SWR_CONTEXT *pContext = pDC->pContext;
const API_STATE& state = GetApiState(pDC);
#endif
template<uint32_t NumVerts>
INLINE
-void viewportTransform(simdvector *v, const SWR_VIEWPORT_MATRICES & vpMatrices, simdscalari vViewportIdx)
+void viewportTransform(simdvector *v, const SWR_VIEWPORT_MATRICES & vpMatrices, simdscalari const &vViewportIdx)
{
// perform a gather of each matrix element based on the viewport array indexes
simdscalar m00 = _simd_i32gather_ps(&vpMatrices.m00[0], vViewportIdx, 4);
#if USE_SIMD16_FRONTEND
template<uint32_t NumVerts>
INLINE
-void viewportTransform(simd16vector *v, const SWR_VIEWPORT_MATRICES & vpMatrices, simd16scalari vViewportIdx)
+void viewportTransform(simd16vector *v, const SWR_VIEWPORT_MATRICES & vpMatrices, simd16scalari const &vViewportIdx)
{
// perform a gather of each matrix element based on the viewport array indexes
const simd16scalar m00 = _simd16_i32gather_ps(&vpMatrices.m00[0], vViewportIdx, 4);
#endif
struct PA_STATE_BASE; // forward decl
-void BinPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[3], uint32_t primMask, simdscalari primID);
-void BinLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[3], uint32_t primMask, simdscalari primID);
+void BinPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[3], uint32_t primMask, simdscalari const &primID);
+void BinLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[3], uint32_t primMask, simdscalari const &primID);
#if USE_SIMD16_FRONTEND
-void SIMDCALL BinPoints_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[3], uint32_t primMask, simd16scalari primID);
-void SIMDCALL BinLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[3], uint32_t primMask, simd16scalari primID);
+void SIMDCALL BinPoints_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[3], uint32_t primMask, simd16scalari const &primID);
+void SIMDCALL BinLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[3], uint32_t primMask, simd16scalari const &primID);
#endif
typedef void(__cdecl *PFN_BLEND_JIT_FUNC)(const SWR_BLEND_STATE*,
simdvector& vSrc, simdvector& vSrc1, simdscalar& vSrc0Alpha, uint32_t sample,
uint8_t* pDst, simdvector& vResult, simdscalari* vOMask, simdscalari* vCoverageMask);
-typedef simdscalar(*PFN_QUANTIZE_DEPTH)(simdscalar);
+typedef simdscalar(*PFN_QUANTIZE_DEPTH)(simdscalar const &);