From 005d937e1533521e87f0119c400298c02f365bf1 Mon Sep 17 00:00:00 2001 From: Tim Rowley Date: Wed, 8 Nov 2017 19:17:24 -0600 Subject: [PATCH] swr/rast: Implement AVX-512 GATHERPS in SIMD16 fetch shader Disabled for now. Reviewed-by: Bruce Cherniak --- .../swr/rasterizer/codegen/gen_llvm_ir_macros.py | 1 + .../drivers/swr/rasterizer/jitter/builder_misc.cpp | 126 +++++++++++++++++++-- .../drivers/swr/rasterizer/jitter/builder_misc.h | 31 ++++- .../drivers/swr/rasterizer/jitter/fetch_jit.cpp | 91 ++++++++++++--- 4 files changed, 220 insertions(+), 29 deletions(-) diff --git a/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py b/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py index ce892a9..44fc857 100644 --- a/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py +++ b/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py @@ -44,6 +44,7 @@ inst_aliases = { intrinsics = [ ['VGATHERPD', 'x86_avx2_gather_d_pd_256', ['src', 'pBase', 'indices', 'mask', 'scale']], ['VGATHERPS', 'x86_avx2_gather_d_ps_256', ['src', 'pBase', 'indices', 'mask', 'scale']], + ['VGATHERPS2', 'x86_avx512_gather_dps_512', ['src', 'pBase', 'indices', 'mask', 'scale']], ['VGATHERDD', 'x86_avx2_gather_d_d_256', ['src', 'pBase', 'indices', 'mask', 'scale']], ['VPSRLI', 'x86_avx2_psrli_d', ['src', 'imm']], ['VSQRTPS', 'x86_avx_sqrt_ps_256', ['a']], diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp index bd3a525..8ffe05b 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp @@ -211,6 +211,28 @@ namespace SwrJit return ConstantVector::getSplat(mVWidth, cast(C(i))); } +#if USE_SIMD16_BUILDER + Value *Builder::VIMMED2_1(int i) + { + return ConstantVector::getSplat(mVWidth2, cast(C(i))); + } + + Value *Builder::VIMMED2_1(uint32_t i) + { + return ConstantVector::getSplat(mVWidth2, cast(C(i))); + 
} + + Value *Builder::VIMMED2_1(float i) + { + return ConstantVector::getSplat(mVWidth2, cast(C(i))); + } + + Value *Builder::VIMMED2_1(bool i) + { + return ConstantVector::getSplat(mVWidth2, cast(C(i))); + } + +#endif Value *Builder::VUNDEF_IPTR() { return UndefValue::get(VectorType::get(mInt32PtrTy,mVWidth)); @@ -237,6 +259,11 @@ namespace SwrJit return UndefValue::get(VectorType::get(mFP32Ty, mVWidth2)); } + Value *Builder::VUNDEF2_I() + { + return UndefValue::get(VectorType::get(mInt32Ty, mVWidth2)); + } + #endif Value *Builder::VUNDEF(Type* t) { @@ -254,6 +281,19 @@ namespace SwrJit return VECTOR_SPLAT(mVWidth, src); } +#if USE_SIMD16_BUILDER + Value *Builder::VBROADCAST2(Value *src) + { + // check if src is already a vector + if (src->getType()->isVectorTy()) + { + return src; + } + + return VECTOR_SPLAT(mVWidth2, src); + } + +#endif uint32_t Builder::IMMED(Value* v) { SWR_ASSERT(isa(v)); @@ -554,16 +594,17 @@ namespace SwrJit /// @param vIndices - SIMD wide value of VB byte offsets /// @param vMask - SIMD wide mask that controls whether to access memory or the src values /// @param scale - value to scale indices by - Value *Builder::GATHERPS(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, uint8_t scale) + Value *Builder::GATHERPS(Value *vSrc, Value *pBase, Value *vIndices, Value *vMask, uint8_t scale) { - Value* vGather; + Value *vGather; // use avx2 gather instruction if available if(JM()->mArch.AVX2()) { // force mask to a SIMD float vector (mSimdFP32Ty), required by vgather - vMask = BITCAST(vMask, mSimdFP32Ty); - vGather = VGATHERPS(vSrc,pBase,vIndices,vMask,C(scale)); + Value *mask = BITCAST(vMask, mSimdFP32Ty); + + vGather = VGATHERPS(vSrc, pBase, vIndices, mask, C(scale)); } else { @@ -598,6 +639,41 @@ namespace SwrJit return vGather; } +#if USE_SIMD16_BUILDER + Value *Builder::GATHERPS2(Value *vSrc, Value *pBase, Value *vIndices, Value *vMask, uint8_t scale) + { + Value *vGather = VUNDEF2_F(); + + // use avx512 gather instruction if available + if (JM()->mArch.AVX512F()) + 
{ + // force mask to a 16-bit integer (mInt16Ty), required by vgather2 + Value *mask = BITCAST(MASK2(vMask), mInt16Ty); + + vGather = VGATHERPS2(vSrc, pBase, vIndices, mask, C((uint32_t)scale)); + } + else + { + Value *src0 = EXTRACT2_F(vSrc, 0); + Value *src1 = EXTRACT2_F(vSrc, 1); + + Value *indices0 = EXTRACT2_I(vIndices, 0); + Value *indices1 = EXTRACT2_I(vIndices, 1); + + Value *mask0 = EXTRACT2_I(vMask, 0); + Value *mask1 = EXTRACT2_I(vMask, 1); + + Value *gather0 = GATHERPS(src0, pBase, indices0, mask0, scale); + Value *gather1 = GATHERPS(src1, pBase, indices1, mask1, scale); + + vGather = INSERT2_F(vGather, gather0, 0); + vGather = INSERT2_F(vGather, gather1, 1); + } + + return vGather; + } + +#endif ////////////////////////////////////////////////////////////////////////// /// @brief Generate a masked gather operation in LLVM IR. If not /// supported on the underlying platform, emulate it with loads @@ -700,7 +776,7 @@ namespace SwrJit #if USE_SIMD16_BUILDER ////////////////////////////////////////////////////////////////////////// /// @brief - Value *Builder::EXTRACT(Value *a2, uint32_t imm) + Value *Builder::EXTRACT2_F(Value *a2, uint32_t imm) { const uint32_t i0 = (imm > 0) ? mVWidth : 0; @@ -708,6 +784,13 @@ namespace SwrJit for (uint32_t i = 0; i < mVWidth; i += 1) { +#if 1 + if (!a2->getType()->getScalarType()->isFloatTy()) + { + a2 = BITCAST(a2, mSimd2FP32Ty); + } + +#endif Value *temp = VEXTRACT(a2, C(i0 + i)); result = VINSERT(result, temp, C(i)); @@ -716,9 +799,14 @@ namespace SwrJit return result; } + Value *Builder::EXTRACT2_I(Value *a2, uint32_t imm) + { + return BITCAST(EXTRACT2_F(a2, imm), mSimdInt32Ty); + } + ////////////////////////////////////////////////////////////////////////// /// @brief - Value *Builder::INSERT(Value *a2, Value * b, uint32_t imm) + Value *Builder::INSERT2_F(Value *a2, Value *b, uint32_t imm) { const uint32_t i0 = (imm > 0) ? 
mVWidth : 0; @@ -741,22 +829,42 @@ namespace SwrJit return result; } + Value *Builder::INSERT2_I(Value *a2, Value *b, uint32_t imm) + { + return BITCAST(INSERT2_F(a2, b, imm), mSimd2Int32Ty); + } + #endif ////////////////////////////////////////////////////////////////////////// /// @brief convert x86 mask to llvm mask - Value* Builder::MASK(Value* vmask) + Value *Builder::MASK(Value *vmask) { - Value* src = BITCAST(vmask, mSimdInt32Ty); + Value *src = BITCAST(vmask, mSimdInt32Ty); return ICMP_SLT(src, VIMMED1(0)); } +#if USE_SIMD16_BUILDER + Value *Builder::MASK2(Value *vmask) + { + Value *src = BITCAST(vmask, mSimd2Int32Ty); + return ICMP_SLT(src, VIMMED2_1(0)); + } + +#endif ////////////////////////////////////////////////////////////////////////// /// @brief convert llvm mask to x86 mask - Value* Builder::VMASK(Value* mask) + Value *Builder::VMASK(Value *mask) { return S_EXT(mask, mSimdInt32Ty); } +#if USE_SIMD16_BUILDER + Value *Builder::VMASK2(Value *mask) + { + return S_EXT(mask, mSimd2Int32Ty); + } + +#endif ////////////////////////////////////////////////////////////////////////// /// @brief Generate a VPSHUFB operation in LLVM IR. 
If not /// supported on the underlying platform, emulate it diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h index 9aa2414..d858a82 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h +++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h @@ -54,15 +54,25 @@ Value *VIMMED1(int i); Value *VIMMED1(uint32_t i); Value *VIMMED1(float i); Value *VIMMED1(bool i); +#if USE_SIMD16_BUILDER +Value *VIMMED2_1(int i); +Value *VIMMED2_1(uint32_t i); +Value *VIMMED2_1(float i); +Value *VIMMED2_1(bool i); +#endif Value *VUNDEF(Type* t); Value *VUNDEF_F(); +Value *VUNDEF_I(); #if USE_SIMD16_BUILDER Value *VUNDEF2_F(); +Value *VUNDEF2_I(); #endif -Value *VUNDEF_I(); Value *VUNDEF(Type* ty, uint32_t size); Value *VUNDEF_IPTR(); Value *VBROADCAST(Value *src); +#if USE_SIMD16_BUILDER +Value *VBROADCAST2(Value *src); +#endif Value *VRCP(Value *va); Value *VPLANEPS(Value* vA, Value* vB, Value* vC, Value* &vX, Value* &vY); @@ -94,8 +104,12 @@ Value *VCMPPS_GE(Value* a, Value* b) { return VCMPPS(a, b, C((uint8_t)_CMP_GE Value *VCMPPS_GT(Value* a, Value* b) { return VCMPPS(a, b, C((uint8_t)_CMP_GT_OQ)); } Value *VCMPPS_NOTNAN(Value* a, Value* b){ return VCMPPS(a, b, C((uint8_t)_CMP_ORD_Q)); } -Value *MASK(Value* vmask); -Value *VMASK(Value* mask); +Value *MASK(Value *vmask); +Value *VMASK(Value *mask); +#if USE_SIMD16_BUILDER +Value *MASK2(Value *vmask); +Value *VMASK2(Value *mask); +#endif ////////////////////////////////////////////////////////////////////////// /// @brief functions that build IR to call x86 intrinsics directly, or @@ -103,8 +117,10 @@ Value *VMASK(Value* mask); ////////////////////////////////////////////////////////////////////////// #if USE_SIMD16_BUILDER -Value *EXTRACT(Value *a, uint32_t imm); -Value *INSERT(Value *a, Value *b, uint32_t imm); +Value *EXTRACT2_F(Value *a2, uint32_t imm); +Value *EXTRACT2_I(Value *a2, uint32_t imm); +Value *INSERT2_F(Value *a2, 
Value *b, uint32_t imm); +Value *INSERT2_I(Value *a2, Value *b, uint32_t imm); #endif Value *MASKLOADD(Value* src, Value* mask); @@ -112,7 +128,10 @@ Value *MASKLOADD(Value* src, Value* mask); void Gather4(const SWR_FORMAT format, Value* pSrcBase, Value* byteOffsets, Value* mask, Value* vGatherComponents[], bool bPackedOutput); -Value *GATHERPS(Value* src, Value* pBase, Value* indices, Value* mask, uint8_t scale = 1); +Value *GATHERPS(Value *src, Value *pBase, Value *indices, Value *mask, uint8_t scale = 1); +#if USE_SIMD16_BUILDER +Value *GATHERPS2(Value *src, Value *pBase, Value *indices, Value *mask, uint8_t scale = 1); +#endif void GATHER4PS(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets, Value* mask, Value* vGatherComponents[], bool bPackedOutput); diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp index aa3fca4..d409792 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp @@ -89,6 +89,9 @@ struct FetchJit : public Builder #else Value* GenerateCompCtrlVector(const ComponentControl ctrl); #endif +#if USE_SIMD16_BUILDER + Value* GenerateCompCtrlVector2(const ComponentControl ctrl); +#endif void JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut); #if USE_SIMD16_SHADERS @@ -1219,6 +1222,12 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, break; case 32: { +#if USE_SIMD16_GATHERS +#if USE_SIMD16_BUILDER + Value *pVtxSrc2[4]; + +#endif +#endif for (uint32_t i = 0; i < 4; i += 1) { #if USE_SIMD16_GATHERS @@ -1228,7 +1237,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, if (compCtrl[i] == StoreSrc) { // save mask as it is zero'd out after each gather - Value *vMask = vGatherMask; + Value *vMask = vGatherMask; Value *vMask2 = vGatherMask2; // Gather a SIMD of vertices @@ -1236,37 +1245,66 @@ 
void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, // However, GATHERPS uses signed 32-bit offsets, so only a 2GB range :( // But, we know that elements must be aligned for FETCH. :) // Right shift the offset by a bit and then scale by 2 to remove the sign extension. - Value *vShiftedOffsets = VPSRLI(vOffsets, C(1)); + Value *vShiftedOffsets = VPSRLI(vOffsets, C(1)); Value *vShiftedOffsets2 = VPSRLI(vOffsets2, C(1)); - vVertexElements[currentVertexElement] = GATHERPS(gatherSrc, pStreamBase, vShiftedOffsets, vMask, 2); +#if USE_SIMD16_BUILDER + Value *src = VUNDEF2_F(); + src = INSERT2_F(src, gatherSrc, 0); + src = INSERT2_F(src, gatherSrc2, 1); + + Value *indices = VUNDEF2_I(); + indices = INSERT2_I(indices, vShiftedOffsets, 0); + indices = INSERT2_I(indices, vShiftedOffsets2, 1); + + Value *mask = VUNDEF2_I(); + mask = INSERT2_I(mask, vMask, 0); + mask = INSERT2_I(mask, vMask2, 1); + + pVtxSrc2[currentVertexElement] = GATHERPS2(src, pStreamBase, indices, mask, 2); +#if 1 + + vVertexElements[currentVertexElement] = EXTRACT2_F(pVtxSrc2[currentVertexElement], 0); + vVertexElements2[currentVertexElement] = EXTRACT2_F(pVtxSrc2[currentVertexElement], 1); +#endif +#else + vVertexElements[currentVertexElement] = GATHERPS(gatherSrc, pStreamBase, vShiftedOffsets, vMask, 2); vVertexElements2[currentVertexElement] = GATHERPS(gatherSrc2, pStreamBase, vShiftedOffsets2, vMask2, 2); +#if USE_SIMD16_BUILDER + // pack adjacent pairs of SIMD8s into SIMD16s + pVtxSrc2[currentVertexElement] = VUNDEF2_F(); + pVtxSrc2[currentVertexElement] = INSERT2_F(pVtxSrc2[currentVertexElement], vVertexElements[currentVertexElement], 0); + pVtxSrc2[currentVertexElement] = INSERT2_F(pVtxSrc2[currentVertexElement], vVertexElements2[currentVertexElement], 1); + +#endif +#endif currentVertexElement += 1; } else { +#if USE_SIMD16_BUILDER + pVtxSrc2[currentVertexElement] = GenerateCompCtrlVector2(compCtrl[i]); +#else vVertexElements[currentVertexElement] = 
GenerateCompCtrlVector(compCtrl[i], false); vVertexElements2[currentVertexElement] = GenerateCompCtrlVector(compCtrl[i], true); +#if USE_SIMD16_BUILDER + // pack adjacent pairs of SIMD8s into SIMD16s + pVtxSrc2[currentVertexElement] = VUNDEF2_F(); + pVtxSrc2[currentVertexElement] = INSERT2_F(pVtxSrc2[currentVertexElement], vVertexElements[currentVertexElement], 0); + pVtxSrc2[currentVertexElement] = INSERT2_F(pVtxSrc2[currentVertexElement], vVertexElements2[currentVertexElement], 1); + +#endif +#endif currentVertexElement += 1; } if (currentVertexElement > 3) { #if USE_SIMD16_BUILDER - Value *pVtxSrc2[4]; - - // pack adjacent pairs of SIMD8s into SIMD16s - for (uint32_t i = 0; i < 4; i += 1) - { - pVtxSrc2[i] = VUNDEF2_F(); - - pVtxSrc2[i] = INSERT(pVtxSrc2[i], vVertexElements[i], 0); - pVtxSrc2[i] = INSERT(pVtxSrc2[i], vVertexElements2[i], 1); - } - // store SIMD16s Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0)); + StoreVertexElements2(pVtxOut2, outputElt, 4, pVtxSrc2); #else @@ -2429,6 +2467,31 @@ Value* FetchJit::GenerateCompCtrlVector(const ComponentControl ctrl) } } +#if USE_SIMD16_BUILDER +Value* FetchJit::GenerateCompCtrlVector2(const ComponentControl ctrl) +{ + switch (ctrl) + { + case NoStore: return VUNDEF2_I(); + case Store0: return VIMMED2_1(0); + case Store1Fp: return VIMMED2_1(1.0f); + case Store1Int: return VIMMED2_1(1); + case StoreVertexId: + { + Value* pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID })), mSimd2FP32Ty); + return VBROADCAST2(pId); + } + case StoreInstanceId: + { + Value* pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CurInstance })), mFP32Ty); + return VBROADCAST2(pId); + } + case StoreSrc: + default: SWR_INVALID("Invalid component control"); return VUNDEF2_I(); + } +} + +#endif ////////////////////////////////////////////////////////////////////////// /// @brief Returns the enable mask for the specified component. 
/// @param enableMask - enable bits -- 2.7.4