From 04d0bfde3962ee76ee7310b3dee5e0f72d2b4c17 Mon Sep 17 00:00:00 2001 From: Tim Rowley Date: Fri, 22 Dec 2017 13:58:08 -0600 Subject: [PATCH] swr/rast: SIMD16 fetch shader jitter cleanup Bake in USE_SIMD16_BUILDER code paths (for USE_SIMD16_SHADER defined), remove USE_SIMD16_BUILDER define, remove deprecated psuedo-SIMD16 code paths. Reviewed-by: Bruce Cherniak --- .../drivers/swr/rasterizer/jitter/fetch_jit.cpp | 1118 +++++++------------- 1 file changed, 383 insertions(+), 735 deletions(-) diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp index ac09a82..99a936d 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp @@ -50,7 +50,6 @@ enum ConversionType #if USE_SIMD16_SHADERS #define USE_SIMD16_GATHERS 0 -#define USE_SIMD16_BUILDER 0 #endif ////////////////////////////////////////////////////////////////////////// @@ -61,6 +60,7 @@ struct FetchJit : public Builder FetchJit(JitManager* pJitMgr) : Builder(pJitMgr){}; Function* Create(const FETCH_COMPILE_STATE& fetchState); + Value* GetSimdValid32bitIndices(Value* vIndices, Value* pLastIndex); Value* GetSimdValid16bitIndices(Value* vIndices, Value* pLastIndex); Value* GetSimdValid8bitIndices(Value* vIndices, Value* pLastIndex); @@ -69,43 +69,49 @@ struct FetchJit : public Builder typedef std::tuple Shuffle8bpcArgs; + #if USE_SIMD16_SHADERS +#if USE_SIMD16_GATHERS + void Shuffle8bpcGatherd16(Shuffle8bpcArgs &args); +#else void Shuffle8bpcGatherd(Shuffle8bpcArgs &args, bool useVertexID2); +#endif #else void Shuffle8bpcGatherd(Shuffle8bpcArgs &args); #endif -#if USE_SIMD16_BUILDER - void Shuffle8bpcGatherd2(Shuffle8bpcArgs &args); -#endif typedef std::tuple Shuffle16bpcArgs; + #if USE_SIMD16_SHADERS +#if USE_SIMD16_GATHERS + void Shuffle16bpcGather16(Shuffle16bpcArgs &args); +#else void Shuffle16bpcGather(Shuffle16bpcArgs &args, bool useVertexID2); +#endif #else void Shuffle16bpcGather(Shuffle16bpcArgs &args); #endif -#if USE_SIMD16_BUILDER - void Shuffle16bpcGather2(Shuffle16bpcArgs &args); -#endif +#if USE_SIMD16_GATHERS + void StoreVertexElements16(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4]); +#else void StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4]); -#if USE_SIMD16_BUILDER - void StoreVertexElements2(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4]); #endif #if USE_SIMD16_SHADERS - Value* GenerateCompCtrlVector(const ComponentControl ctrl, bool useVertexID2); +#if USE_SIMD16_GATHERS + Value *GenerateCompCtrlVector16(const ComponentControl ctrl); #else - Value* GenerateCompCtrlVector(const ComponentControl ctrl); + Value *GenerateCompCtrlVector(const ComponentControl ctrl, bool useVertexID2); #endif -#if USE_SIMD16_BUILDER - Value* GenerateCompCtrlVector2(const ComponentControl ctrl); +#else + Value *GenerateCompCtrlVector(const ComponentControl ctrl); #endif void JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut); -#if USE_SIMD16_SHADERS +#if USE_SIMD16_SHADERS #if USE_SIMD16_GATHERS void JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value *streams, Value *vIndices, Value *vIndices2, Value *pVtxOut, bool useVertexID2); #else @@ -833,24 +839,17 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, uint32_t outputElt = 0; Value* vVertexElements[4]; #if USE_SIMD16_GATHERS - Value* vVertexElements2[4]; -#if USE_SIMD16_BUILDER Value *pVtxSrc2[4]; #endif -#endif Value* startVertex = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex}); Value* startInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartInstance}); Value* curInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance}); #if USE_SIMD16_GATHERS -#if USE_SIMD16_BUILDER Value* vBaseVertex16 = VBROADCAST_16(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_BaseVertex })); #else Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_BaseVertex })); #endif -#else - Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_BaseVertex })); -#endif curInstance->setName("curInstance"); for (uint32_t nInputElt = 0; nInputElt < fetchState.numAttribs; nInputElt += 1) @@ -874,14 +873,10 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value *stride = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pitch}); #if USE_SIMD16_GATHERS -#if USE_SIMD16_BUILDER Value *vStride16 = VBROADCAST_16(stride); #else Value *vStride = VBROADCAST(stride); #endif -#else - Value *vStride = VBROADCAST(stride); -#endif // max vertex index that is fully in bounds Value *maxVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_maxVertex)}); @@ -901,23 +896,17 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, curInstance = ADD(curInstance, startInstance); } - Value *vCurIndices; #if USE_SIMD16_GATHERS - Value *vCurIndices2; -#if USE_SIMD16_BUILDER Value *vCurIndices16; -#endif +#else + Value *vCurIndices; #endif Value *startOffset; #if USE_SIMD16_GATHERS -#if USE_SIMD16_BUILDER Value *vInstanceStride16 = VIMMED1_16(0); #else Value *vInstanceStride = VIMMED1(0); #endif -#else - Value *vInstanceStride = VIMMED1(0); -#endif if (ied.InstanceEnable) { @@ -934,14 +923,9 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, calcInstance = SELECT(isNonZeroStep, calcInstance, C(0)); #if USE_SIMD16_GATHERS -#if USE_SIMD16_BUILDER vCurIndices16 = VBROADCAST_16(calcInstance); #else vCurIndices = VBROADCAST(calcInstance); - vCurIndices2 = VBROADCAST(calcInstance); -#endif -#else - vCurIndices = VBROADCAST(calcInstance); #endif startOffset = startInstance; @@ -951,27 +935,18 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, // grab the instance advancement state, determines stride in bytes from one instance to the next Value* stepRate = C(ied.InstanceAdvancementState); #if USE_SIMD16_GATHERS -#if USE_SIMD16_BUILDER vInstanceStride16 = VBROADCAST_16(MUL(curInstance, stepRate)); #else vInstanceStride = VBROADCAST(MUL(curInstance, stepRate)); #endif -#else - vInstanceStride = VBROADCAST(MUL(curInstance, stepRate)); -#endif // offset indices by baseVertex #if USE_SIMD16_GATHERS -#if USE_SIMD16_BUILDER Value *vIndices16 = JOIN_16(vIndices, vIndices2); vCurIndices16 = ADD(vIndices16, vBaseVertex16); #else vCurIndices = ADD(vIndices, vBaseVertex); - vCurIndices2 = ADD(vIndices2, vBaseVertex); -#endif -#else - vCurIndices = ADD(vIndices, vBaseVertex); #endif startOffset = startVertex; @@ -981,16 +956,11 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, { // offset indices by baseVertex #if USE_SIMD16_GATHERS -#if USE_SIMD16_BUILDER Value *vIndices16 = JOIN_16(vIndices, vIndices2); vCurIndices16 = ADD(vIndices16, vBaseVertex16); #else vCurIndices = ADD(vIndices, vBaseVertex); - vCurIndices2 = ADD(vIndices2, vBaseVertex); -#endif -#else - vCurIndices = ADD(vIndices, vBaseVertex); #endif startOffset = startVertex; @@ -1021,7 +991,6 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value *partialInboundsSize = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_partialInboundsSize)}); partialInboundsSize = LOAD(partialInboundsSize); #if USE_SIMD16_GATHERS -#if USE_SIMD16_BUILDER Value *vPartialVertexSize = VBROADCAST_16(partialInboundsSize); Value *vBpp = VBROADCAST_16(C(info.Bpp)); Value *vAlignmentOffsets = VBROADCAST_16(C(ied.AlignedByteOffset)); @@ -1030,17 +999,11 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value *vBpp = VBROADCAST(C(info.Bpp)); Value *vAlignmentOffsets = VBROADCAST(C(ied.AlignedByteOffset)); #endif -#else - Value *vPartialVertexSize = VBROADCAST(partialInboundsSize); - Value *vBpp = VBROADCAST(C(info.Bpp)); - Value *vAlignmentOffsets = VBROADCAST(C(ied.AlignedByteOffset)); -#endif // is the element is <= the partially valid size Value *vElementInBoundsMask = ICMP_SLE(vBpp, SUB(vPartialVertexSize, vAlignmentOffsets)); #if USE_SIMD16_GATHERS -#if USE_SIMD16_BUILDER // override cur indices with 0 if pitch is 0 Value *pZeroPitchMask16 = ICMP_EQ(vStride16, VIMMED1_16(0)); vCurIndices16 = SELECT(pZeroPitchMask16, VIMMED1_16(0), vCurIndices16); @@ -1091,58 +1054,6 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, // override cur indices with 0 if pitch is 0 Value* pZeroPitchMask = ICMP_EQ(vStride, VIMMED1(0)); vCurIndices = SELECT(pZeroPitchMask, VIMMED1(0), vCurIndices); - vCurIndices2 = SELECT(pZeroPitchMask, VIMMED1(0), vCurIndices2); - - // are vertices partially OOB? - Value* vMaxVertex = VBROADCAST(maxVertex); - Value* vPartialOOBMask = ICMP_EQ(vCurIndices, vMaxVertex); - Value* vPartialOOBMask2 = ICMP_EQ(vCurIndices2, vMaxVertex); - - // are vertices fully in bounds? - Value* vMaxGatherMask = ICMP_ULT(vCurIndices, vMaxVertex); - Value* vMaxGatherMask2 = ICMP_ULT(vCurIndices2, vMaxVertex); - - Value *vGatherMask; - Value *vGatherMask2; - if (fetchState.bPartialVertexBuffer) - { - // are vertices below minVertex limit? - Value *vMinVertex = VBROADCAST(minVertex); - Value *vMinGatherMask = ICMP_UGE(vCurIndices, vMinVertex); - Value *vMinGatherMask2 = ICMP_UGE(vCurIndices2, vMinVertex); - - // only fetch lanes that pass both tests - vGatherMask = AND(vMaxGatherMask, vMinGatherMask); - vGatherMask2 = AND(vMaxGatherMask2, vMinGatherMask2); - } - else - { - vGatherMask = vMaxGatherMask; - vGatherMask2 = vMaxGatherMask2; - } - - // blend in any partially OOB indices that have valid elements - vGatherMask = SELECT(vPartialOOBMask, vElementInBoundsMask, vGatherMask); - vGatherMask2 = SELECT(vPartialOOBMask2, vElementInBoundsMask, vGatherMask2); - - // calculate the actual offsets into the VB - Value* vOffsets = MUL(vCurIndices, vStride); - vOffsets = ADD(vOffsets, vAlignmentOffsets); - - Value* vOffsets2 = MUL(vCurIndices2, vStride); - vOffsets2 = ADD(vOffsets2, vAlignmentOffsets); - - // if instance stride enable is: - // true - add product of the instanceID and advancement state to the offst into the VB - // false - value of vInstanceStride has been initialialized to zero - vOffsets = ADD(vOffsets, vInstanceStride); - vOffsets2 = ADD(vOffsets2, vInstanceStride); - -#endif -#else - // override cur indices with 0 if pitch is 0 - Value* pZeroPitchMask = ICMP_EQ(vStride, VIMMED1(0)); - vCurIndices = SELECT(pZeroPitchMask, VIMMED1(0), vCurIndices); // are vertices partially OOB? Value* vMaxVertex = VBROADCAST(maxVertex); @@ -1190,7 +1101,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, #if USE_SIMD16_GATHERS Value *pResults[4]; Value *pResults2[4]; - CreateGatherOddFormats((SWR_FORMAT)ied.Format, vGatherMask, pStreamBase, vOffsets, pResults); + CreateGatherOddFormats((SWR_FORMAT)ied.Format, vGatherMask, pStreamBase, vOffsets, pResults); CreateGatherOddFormats((SWR_FORMAT)ied.Format, vGatherMask2, pStreamBase, vOffsets2, pResults2); ConvertFormat((SWR_FORMAT)ied.Format, pResults); ConvertFormat((SWR_FORMAT)ied.Format, pResults2); @@ -1199,43 +1110,26 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, { if (isComponentEnabled(compMask, c)) { -#if USE_SIMD16_BUILDER // pack adjacent pairs of SIMD8s into SIMD16s - pVtxSrc2[currentVertexElement] = JOIN_16(pResults[c], pResults2[c]); - -#else - vVertexElements[currentVertexElement] = pResults[c]; - vVertexElements2[currentVertexElement] = pResults2[c]; - -#endif - currentVertexElement += 1; + pVtxSrc2[currentVertexElement++] = JOIN_16(pResults[c], pResults2[c]); if (currentVertexElement > 3) { -#if USE_SIMD16_BUILDER // store SIMD16s Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth16), 0)); - StoreVertexElements2(pVtxOut2, outputElt, 4, pVtxSrc2); - -#else - StoreVertexElements(pVtxOut, outputElt, 4, vVertexElements); - StoreVertexElements(GEP(pVtxOut, C(1)), outputElt, 4, vVertexElements2); - -#endif - outputElt += 1; - + StoreVertexElements16(pVtxOut2, outputElt++, 4, pVtxSrc2); // reset to the next vVertexElement to output currentVertexElement = 0; } } } #else - Value* pResults[4]; + Value *pResults[4]; CreateGatherOddFormats((SWR_FORMAT)ied.Format, vGatherMask, pStreamBase, vOffsets, pResults); ConvertFormat((SWR_FORMAT)ied.Format, pResults); - for (uint32_t c = 0; c < 4; ++c) + for (uint32_t c = 0; c < 4; c += 1) { if (isComponentEnabled(compMask, c)) { @@ -1255,11 +1149,8 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, ///@todo: support 64 bit vb accesses Value *gatherSrc = VIMMED1(0.0f); #if USE_SIMD16_GATHERS - Value *gatherSrc2 = VIMMED1(0.0f); -#if USE_SIMD16_BUILDER Value *gatherSrc16 = VIMMED1_16(0.0f); #endif -#endif SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format), "Unsupported format for standard gather fetch."); @@ -1270,7 +1161,6 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, case 16: { #if USE_SIMD16_GATHERS -#if USE_SIMD16_BUILDER Value *gatherResult[2]; // if we have at least one component out of x or y to fetch @@ -1306,73 +1196,23 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, gatherResult[1] = VUNDEF_I_16(); } -#else - Value *vGatherResult[2]; - Value *vGatherResult2[2]; - - // if we have at least one component out of x or y to fetch - if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)) - { - vGatherResult[0] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vGatherMask); - vGatherResult2[0] = GATHERPS(gatherSrc2, pStreamBase, vOffsets2, vGatherMask2); - // e.g. result of first 8x32bit integer gather for 16bit components - // 256i - 0 1 2 3 4 5 6 7 - // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy - // - } - else - { - vGatherResult[0] = VUNDEF_I(); - vGatherResult2[0] = VUNDEF_I(); - } - - // if we have at least one component out of z or w to fetch - if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)) - { - // offset base to the next components(zw) in the vertex to gather - pStreamBase = GEP(pStreamBase, C((char)4)); - - vGatherResult[1] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vGatherMask); - vGatherResult2[1] = GATHERPS(gatherSrc2, pStreamBase, vOffsets2, vGatherMask2); - // e.g. result of second 8x32bit integer gather for 16bit components - // 256i - 0 1 2 3 4 5 6 7 - // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw - // - } - else - { - vGatherResult[1] = VUNDEF_I(); - vGatherResult2[1] = VUNDEF_I(); - } - -#endif // if we have at least one component to shuffle into place if (compMask) { -#if USE_SIMD16_BUILDER Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth16), 0)); Shuffle16bpcArgs args = std::forward_as_tuple(gatherResult, pVtxOut2, Instruction::CastOps::FPExt, CONVERT_NONE, currentVertexElement, outputElt, compMask, compCtrl, pVtxSrc2); // Shuffle gathered components into place in simdvertex struct - Shuffle16bpcGather2(args); // outputs to vVertexElements ref -#else - Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, Instruction::CastOps::FPExt, CONVERT_NONE, - currentVertexElement, outputElt, compMask, compCtrl, vVertexElements); - Shuffle16bpcArgs args2 = std::forward_as_tuple(vGatherResult2, GEP(pVtxOut, C(1)), Instruction::CastOps::FPExt, CONVERT_NONE, - currentVertexElement, outputElt, compMask, compCtrl, vVertexElements2); - - // Shuffle gathered components into place in simdvertex struct - Shuffle16bpcGather(args, false); // outputs to vVertexElements ref - Shuffle16bpcGather(args2, true); // outputs to vVertexElements ref -#endif + Shuffle16bpcGather16(args); // outputs to vVertexElements ref } #else - Value* vGatherResult[2]; + Value *vGatherResult[2]; // if we have at least one component out of x or y to fetch - if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){ + if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)) + { vGatherResult[0] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vGatherMask); // e.g. result of first 8x32bit integer gather for 16bit components // 256i - 0 1 2 3 4 5 6 7 @@ -1381,7 +1221,8 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, } // if we have at least one component out of z or w to fetch - if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){ + if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)) + { // offset base to the next components(zw) in the vertex to gather pStreamBase = GEP(pStreamBase, C((char)4)); @@ -1393,7 +1234,8 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, } // if we have at least one component to shuffle into place - if(compMask){ + if (compMask) + { Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, Instruction::CastOps::FPExt, CONVERT_NONE, currentVertexElement, outputElt, compMask, compCtrl, vVertexElements); @@ -1422,59 +1264,20 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, // However, GATHERPS uses signed 32-bit offsets, so only a 2GB range :( // But, we know that elements must be aligned for FETCH. :) // Right shift the offset by a bit and then scale by 2 to remove the sign extension. -#if USE_SIMD16_BUILDER - Value *shiftedOffsets = LSHR(vOffsets16, 1); - pVtxSrc2[currentVertexElement] = GATHERPS_16(gatherSrc16, pStreamBase, shiftedOffsets, vGatherMask16, 2); - -#else - Value *vShiftedOffsets = LSHR(vOffsets, 1); - Value *vShiftedOffsets2 = LSHR(vOffsets2, 1); - - vVertexElements[currentVertexElement] = GATHERPS(gatherSrc, pStreamBase, vShiftedOffsets, vGatherMask, 2); - vVertexElements2[currentVertexElement] = GATHERPS(gatherSrc2, pStreamBase, vShiftedOffsets2, vGatherMask2, 2); - -#if USE_SIMD16_BUILDER - // pack adjacent pairs of SIMD8s into SIMD16s - pVtxSrc2[currentVertexElement] = JOIN_16(vVertexElements[currentVertexElement], - vVertexElements2[currentVertexElement]); - -#endif -#endif - currentVertexElement += 1; + Value *shiftedOffsets16 = LSHR(vOffsets16, 1); + pVtxSrc2[currentVertexElement++] = GATHERPS_16(gatherSrc16, pStreamBase, shiftedOffsets16, vGatherMask16, 2); } else { -#if USE_SIMD16_BUILDER - pVtxSrc2[currentVertexElement] = GenerateCompCtrlVector2(compCtrl[i]); -#else - vVertexElements[currentVertexElement] = GenerateCompCtrlVector(compCtrl[i], false); - vVertexElements2[currentVertexElement] = GenerateCompCtrlVector(compCtrl[i], true); - -#if USE_SIMD16_BUILDER - // pack adjacent pairs of SIMD8s into SIMD16s - pVtxSrc2[currentVertexElement] = JOIN_16(vVertexElements[currentVertexElement], - vVertexElements2[currentVertexElement]); - -#endif -#endif - currentVertexElement += 1; + pVtxSrc2[currentVertexElement++] = GenerateCompCtrlVector16(compCtrl[i]); } if (currentVertexElement > 3) { -#if USE_SIMD16_BUILDER // store SIMD16s Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth16), 0)); - StoreVertexElements2(pVtxOut2, outputElt, 4, pVtxSrc2); - -#else - StoreVertexElements(pVtxOut, outputElt, 4, vVertexElements); - StoreVertexElements(GEP(pVtxOut, C(1)), outputElt, 4, vVertexElements2); - -#endif - outputElt += 1; - + StoreVertexElements16(pVtxOut2, outputElt++, 4, pVtxSrc2); // reset to the next vVertexElement to output currentVertexElement = 0; } @@ -1493,7 +1296,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, // However, GATHERPS uses signed 32-bit offsets, so only a 2GB range :( // But, we know that elements must be aligned for FETCH. :) // Right shift the offset by a bit and then scale by 2 to remove the sign extension. - Value* vShiftedOffsets = LSHR(vOffsets, 1); + Value *vShiftedOffsets = LSHR(vOffsets, 1); vVertexElements[currentVertexElement++] = GATHERPS(gatherSrc, pStreamBase, vShiftedOffsets, vGatherMask, 2); } else @@ -1554,45 +1357,20 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value *pGather = VSHUFFLE(pGatherLo, pGatherHi, C({ 0, 1, 2, 3, 4, 5, 6, 7 })); Value *pGather2 = VSHUFFLE(pGatherLo2, pGatherHi2, C({ 0, 1, 2, 3, 4, 5, 6, 7 })); -#if USE_SIMD16_BUILDER // pack adjacent pairs of SIMD8s into SIMD16s - pVtxSrc2[currentVertexElement] = JOIN_16(pGather, pGather2); - -#else - vVertexElements[currentVertexElement] = pGather; - vVertexElements2[currentVertexElement] = pGather2; - -#endif - currentVertexElement += 1; + pVtxSrc2[currentVertexElement++] = JOIN_16(pGather, pGather2); } else { -#if USE_SIMD16_BUILDER - pVtxSrc2[currentVertexElement] = GenerateCompCtrlVector2(compCtrl[i]); - -#else - vVertexElements[currentVertexElement] = GenerateCompCtrlVector(compCtrl[i], false); - vVertexElements2[currentVertexElement] = GenerateCompCtrlVector(compCtrl[i], true); - -#endif - currentVertexElement += 1; + pVtxSrc2[currentVertexElement++] = GenerateCompCtrlVector16(compCtrl[i]); } if (currentVertexElement > 3) { -#if USE_SIMD16_BUILDER // store SIMD16s Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth16), 0)); - StoreVertexElements2(pVtxOut2, outputElt, 4, pVtxSrc2); - -#else - StoreVertexElements(pVtxOut, outputElt, 4, vVertexElements); - StoreVertexElements(GEP(pVtxOut, C(1)), outputElt, 4, vVertexElements2); - -#endif - outputElt += 1; - + StoreVertexElements16(pVtxOut2, outputElt++, 4, pVtxSrc2); // reset to the next vVertexElement to output currentVertexElement = 0; } @@ -1614,10 +1392,8 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value *vZeroDouble = VECTOR_SPLAT(4, ConstantFP::get(IRB()->getDoubleTy(), 0.0f)); - Value* pGatherLo = GATHERPD(vZeroDouble, - pStreamBase, vOffsetsLo, vMaskLo); - Value* pGatherHi = GATHERPD(vZeroDouble, - pStreamBase, vOffsetsHi, vMaskHi); + Value* pGatherLo = GATHERPD(vZeroDouble, pStreamBase, vOffsetsLo, vMaskLo); + Value* pGatherHi = GATHERPD(vZeroDouble, pStreamBase, vOffsetsHi, vMaskHi); pGatherLo = VCVTPD2PS(pGatherLo); pGatherHi = VCVTPD2PS(pGatherHi); @@ -1693,11 +1469,8 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, // value substituted when component of gather is masked Value* gatherSrc = VIMMED1(0); #if USE_SIMD16_GATHERS - Value* gatherSrc2 = VIMMED1(0); -#if USE_SIMD16_BUILDER Value *gatherSrc16 = VIMMED1_16(0); #endif -#endif // Gather components from memory to store in a simdvertex structure switch (bpc) @@ -1708,42 +1481,21 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, if (compMask) { #if USE_SIMD16_GATHERS -#if USE_SIMD16_BUILDER Value *gatherResult = GATHERDD_16(gatherSrc16, pStreamBase, vOffsets16, vGatherMask16); // e.g. result of an 8x32bit integer gather for 8bit components // 256i - 0 1 2 3 4 5 6 7 // xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw -#else - Value* vGatherResult = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask); - Value* vGatherResult2 = GATHERDD(gatherSrc2, pStreamBase, vOffsets2, vGatherMask2); - - // e.g. result of an 8x32bit integer gather for 8bit components - // 256i - 0 1 2 3 4 5 6 7 - // xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw - -#endif -#if USE_SIMD16_BUILDER Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth16), 0)); Shuffle8bpcArgs args = std::forward_as_tuple(gatherResult, pVtxOut2, extendCastType, conversionType, currentVertexElement, outputElt, compMask, compCtrl, pVtxSrc2, info.swizzle); // Shuffle gathered components into place in simdvertex struct - Shuffle8bpcGatherd2(args); // outputs to vVertexElements ref -#else - Shuffle8bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType, - currentVertexElement, outputElt, compMask, compCtrl, vVertexElements, info.swizzle); - Shuffle8bpcArgs args2 = std::forward_as_tuple(vGatherResult2, GEP(pVtxOut, C(1)), extendCastType, conversionType, - currentVertexElement, outputElt, compMask, compCtrl, vVertexElements2, info.swizzle); - - // Shuffle gathered components into place in simdvertex struct - Shuffle8bpcGatherd(args, false); // outputs to vVertexElements ref - Shuffle8bpcGatherd(args2, true); // outputs to vVertexElements ref -#endif + Shuffle8bpcGatherd16(args); // outputs to vVertexElements ref #else - Value* vGatherResult = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask); + Value *vGatherResult = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask); // e.g. result of an 8x32bit integer gather for 8bit components // 256i - 0 1 2 3 4 5 6 7 // xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw @@ -1764,8 +1516,7 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, case 16: { #if USE_SIMD16_GATHERS -#if USE_SIMD16_BUILDER - Value* gatherResult[2]; + Value *gatherResult[2]; // if we have at least one component out of x or y to fetch if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)) @@ -1800,73 +1551,23 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, gatherResult[1] = VUNDEF_I_16(); } -#else - Value* vGatherResult[2]; - Value* vGatherResult2[2]; - - // if we have at least one component out of x or y to fetch - if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)) - { - vGatherResult[0] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask); - vGatherResult2[0] = GATHERDD(gatherSrc2, pStreamBase, vOffsets2, vGatherMask2); - // e.g. result of first 8x32bit integer gather for 16bit components - // 256i - 0 1 2 3 4 5 6 7 - // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy - // - } - else - { - vGatherResult[0] = VUNDEF_I(); - vGatherResult2[0] = VUNDEF_I(); - } - - // if we have at least one component out of z or w to fetch - if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)) - { - // offset base to the next components(zw) in the vertex to gather - pStreamBase = GEP(pStreamBase, C((char)4)); - - vGatherResult[1] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask); - vGatherResult2[1] = GATHERDD(gatherSrc2, pStreamBase, vOffsets2, vGatherMask2); - // e.g. result of second 8x32bit integer gather for 16bit components - // 256i - 0 1 2 3 4 5 6 7 - // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw - // - } - else - { - vGatherResult[1] = VUNDEF_I(); - vGatherResult2[1] = VUNDEF_I(); - } - -#endif // if we have at least one component to shuffle into place if (compMask) { -#if USE_SIMD16_BUILDER Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth16), 0)); Shuffle16bpcArgs args = std::forward_as_tuple(gatherResult, pVtxOut2, extendCastType, conversionType, currentVertexElement, outputElt, compMask, compCtrl, pVtxSrc2); // Shuffle gathered components into place in simdvertex struct - Shuffle16bpcGather2(args); // outputs to vVertexElements ref -#else - Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType, - currentVertexElement, outputElt, compMask, compCtrl, vVertexElements); - Shuffle16bpcArgs args2 = std::forward_as_tuple(vGatherResult2, GEP(pVtxOut, C(1)), extendCastType, conversionType, - currentVertexElement, outputElt, compMask, compCtrl, vVertexElements2); - - // Shuffle gathered components into place in simdvertex struct - Shuffle16bpcGather(args, false); // outputs to vVertexElements ref - Shuffle16bpcGather(args2, true); // outputs to vVertexElements ref -#endif + Shuffle16bpcGather16(args); // outputs to vVertexElements ref } #else - Value* vGatherResult[2]; + Value *vGatherResult[2]; // if we have at least one component out of x or y to fetch - if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){ + if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)) + { vGatherResult[0] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask); // e.g. result of first 8x32bit integer gather for 16bit components // 256i - 0 1 2 3 4 5 6 7 @@ -1875,7 +1576,8 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, } // if we have at least one component out of z or w to fetch - if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){ + if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)) + { // offset base to the next components(zw) in the vertex to gather pStreamBase = GEP(pStreamBase, C((char)4)); @@ -1887,7 +1589,8 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, } // if we have at least one component to shuffle into place - if(compMask){ + if (compMask) + { Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType, currentVertexElement, outputElt, compMask, compCtrl, vVertexElements); @@ -1912,7 +1615,6 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, if (compCtrl[i] == StoreSrc) { #if USE_SIMD16_GATHERS -#if USE_SIMD16_BUILDER Value *pGather = GATHERDD_16(gatherSrc16, pStreamBase, vOffsets16, vGatherMask16); if (conversionType == CONVERT_USCALED) @@ -1928,99 +1630,54 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, pGather = FMUL(SI_TO_FP(pGather, mSimd16FP32Ty), VBROADCAST_16(C(1 / 65536.0f))); } + pVtxSrc2[currentVertexElement++] = pGather; + + // e.g. result of a single 8x32bit integer gather for 32bit components + // 256i - 0 1 2 3 4 5 6 7 + // xxxx xxxx xxxx xxxx xxxx xxxx xxxx xxxx #else - Value *pGather = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask); - Value *pGather2 = GATHERDD(gatherSrc2, pStreamBase, vOffsets2, vGatherMask2); + Value* pGather = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask); if (conversionType == CONVERT_USCALED) { pGather = UI_TO_FP(pGather, mSimdFP32Ty); - pGather2 = UI_TO_FP(pGather2, mSimdFP32Ty); } else if (conversionType == CONVERT_SSCALED) { pGather = SI_TO_FP(pGather, mSimdFP32Ty); - pGather2 = SI_TO_FP(pGather2, mSimdFP32Ty); } else if (conversionType == CONVERT_SFIXED) { - pGather = FMUL(SI_TO_FP(pGather, mSimdFP32Ty), VBROADCAST(C(1 / 65536.0f))); - pGather2 = FMUL(SI_TO_FP(pGather2, mSimdFP32Ty), VBROADCAST(C(1 / 65536.0f))); + pGather = FMUL(SI_TO_FP(pGather, mSimdFP32Ty), VBROADCAST(C(1/65536.0f))); } -#endif -#if USE_SIMD16_BUILDER - pVtxSrc2[currentVertexElement] = pGather; - -#else - vVertexElements[currentVertexElement] = pGather; - vVertexElements2[currentVertexElement] = pGather2; - -#endif + vVertexElements[currentVertexElement++] = pGather; // e.g. result of a single 8x32bit integer gather for 32bit components // 256i - 0 1 2 3 4 5 6 7 // xxxx xxxx xxxx xxxx xxxx xxxx xxxx xxxx - - currentVertexElement += 1; -#else - Value* pGather = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask); - - if (conversionType == CONVERT_USCALED) - { - pGather = UI_TO_FP(pGather, mSimdFP32Ty); - } - else if (conversionType == CONVERT_SSCALED) - { - pGather = SI_TO_FP(pGather, mSimdFP32Ty); - } - else if (conversionType == CONVERT_SFIXED) - { - pGather = FMUL(SI_TO_FP(pGather, mSimdFP32Ty), VBROADCAST(C(1/65536.0f))); - } - - vVertexElements[currentVertexElement++] = pGather; - // e.g. result of a single 8x32bit integer gather for 32bit components - // 256i - 0 1 2 3 4 5 6 7 - // xxxx xxxx xxxx xxxx xxxx xxxx xxxx xxxx #endif } else { -#if USE_SIMD16_SHADERS #if USE_SIMD16_GATHERS -#if USE_SIMD16_BUILDER - pVtxSrc2[currentVertexElement] = GenerateCompCtrlVector2(compCtrl[i]); - -#else - vVertexElements[currentVertexElement] = GenerateCompCtrlVector(compCtrl[i], false); - vVertexElements2[currentVertexElement] = GenerateCompCtrlVector(compCtrl[i], true); - -#endif - currentVertexElement += 1; + pVtxSrc2[currentVertexElement++] = GenerateCompCtrlVector16(compCtrl[i]); #else +#if USE_SIMD16_SHADERS vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2); -#endif #else vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]); #endif +#endif } if (currentVertexElement > 3) { #if USE_SIMD16_GATHERS -#if USE_SIMD16_BUILDER // store SIMD16s Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth16), 0)); - StoreVertexElements2(pVtxOut2, outputElt, 4, pVtxSrc2); - -#else - StoreVertexElements(pVtxOut, outputElt, 4, vVertexElements); - StoreVertexElements(GEP(pVtxOut, C(1)), outputElt, 4, vVertexElements2); - -#endif - outputElt += 1; + StoreVertexElements16(pVtxOut2, outputElt++, 4, pVtxSrc2); #else StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements); #endif @@ -2044,18 +1701,10 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, if (currentVertexElement > 0) { #if USE_SIMD16_GATHERS -#if USE_SIMD16_BUILDER // store SIMD16s Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth16), 0)); - StoreVertexElements2(pVtxOut2, outputElt, currentVertexElement, pVtxSrc2); - -#else - StoreVertexElements(pVtxOut, outputElt, currentVertexElement, vVertexElements); - StoreVertexElements(GEP(pVtxOut, C(1)), outputElt, currentVertexElement, vVertexElements2); - -#endif - outputElt += 1; + StoreVertexElements16(pVtxOut2, outputElt++, currentVertexElement, pVtxSrc2); #else StoreVertexElements(pVtxOut, outputElt++, currentVertexElement, vVertexElements); #endif @@ -2183,109 +1832,8 @@ Value* FetchJit::GetSimdValid32bitIndices(Value* pIndices, Value* pLastIndex) /// @param compCtrl - component control val /// @param vVertexElements[4] - vertex components to output /// @param swizzle[4] - component swizzle location -#if USE_SIMD16_SHADERS -void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs &args, bool useVertexID2) -#else -void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs &args) -#endif -{ - // Unpack tuple args - Value*& vGatherResult = std::get<0>(args); - Value* pVtxOut = std::get<1>(args); - const Instruction::CastOps extendType = std::get<2>(args); - const ConversionType conversionType = std::get<3>(args); - uint32_t ¤tVertexElement = std::get<4>(args); - uint32_t &outputElt = std::get<5>(args); - const ComponentEnable compMask = std::get<6>(args); - const ComponentControl (&compCtrl)[4] = std::get<7>(args); - Value* (&vVertexElements)[4] = std::get<8>(args); - const uint32_t (&swizzle)[4] = std::get<9>(args); - - // cast types - Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4 ); // vwidth is units of 32 bits - - for (uint32_t i = 0; i < 4; i++) - { - if (!isComponentEnabled(compMask, i)) - continue; - - if (compCtrl[i] == ComponentControl::StoreSrc) - { - std::vector vShuffleMasks[4] = { - { 0, 4, 8, 12, 16, 20, 24, 28 }, // x - { 1, 5, 9, 13, 17, 21, 25, 29 }, // y - { 2, 6, 10, 14, 18, 22, 26, 30 }, // z - { 3, 7, 11, 15, 19, 23, 27, 31 }, // w - }; - - Value *val = VSHUFFLE(BITCAST(vGatherResult, v32x8Ty), - UndefValue::get(v32x8Ty), - vShuffleMasks[swizzle[i]]); - - if ((extendType == Instruction::CastOps::SExt) || - (extendType == Instruction::CastOps::SIToFP)) { - switch (conversionType) - { - case CONVERT_NORMALIZED: - val = FMUL(SI_TO_FP(val, mSimdFP32Ty), VIMMED1((float)(1.0 / 127.0))); - break; - case CONVERT_SSCALED: - val = SI_TO_FP(val, mSimdFP32Ty); - break; - case CONVERT_USCALED: - SWR_INVALID("Type should not be sign extended!"); - break; - default: - SWR_ASSERT(conversionType == CONVERT_NONE); - val = S_EXT(val, mSimdInt32Ty); - break; - } - } else if ((extendType == Instruction::CastOps::ZExt) || - (extendType == Instruction::CastOps::UIToFP)) { - switch (conversionType) - { - case CONVERT_NORMALIZED: - val = FMUL(UI_TO_FP(val, mSimdFP32Ty), VIMMED1((float)(1.0 / 255.0))); - break; - case CONVERT_SSCALED: - SWR_INVALID("Type should not be zero extended!"); - break; - case CONVERT_USCALED: - val = UI_TO_FP(val, mSimdFP32Ty); - break; - default: - SWR_ASSERT(conversionType == CONVERT_NONE); - val = Z_EXT(val, mSimdInt32Ty); - break; - } - } - else - { - SWR_INVALID("Unsupported conversion type"); - } - - vVertexElements[currentVertexElement++] = val; - } - else - { -#if USE_SIMD16_SHADERS - vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2); -#else - vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]); -#endif - } - - if (currentVertexElement > 3) - { - StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements); - // reset to the next vVertexElement to output - currentVertexElement = 0; - } - } -} - -#if USE_SIMD16_BUILDER -void FetchJit::Shuffle8bpcGatherd2(Shuffle8bpcArgs &args) +#if USE_SIMD16_GATHERS +void FetchJit::Shuffle8bpcGatherd16(Shuffle8bpcArgs &args) { // Unpack tuple args Value*& vGatherResult = std::get<0>(args); @@ -2408,12 +1956,12 @@ void FetchJit::Shuffle8bpcGatherd2(Shuffle8bpcArgs &args) } else { - vVertexElements[currentVertexElement++] = GenerateCompCtrlVector2(compCtrl[i]); + vVertexElements[currentVertexElement++] = GenerateCompCtrlVector16(compCtrl[i]); } if (currentVertexElement > 3) { - StoreVertexElements2(pVtxOut, outputElt++, 4, vVertexElements); + StoreVertexElements16(pVtxOut, outputElt++, 4, vVertexElements); // reset to the next vVertexElement to output currentVertexElement = 0; } @@ -2506,12 +2054,12 @@ void FetchJit::Shuffle8bpcGatherd2(Shuffle8bpcArgs &args) } else { - vVertexElements[currentVertexElement++] = GenerateCompCtrlVector2(compCtrl[i]); + vVertexElements[currentVertexElement++] = GenerateCompCtrlVector16(compCtrl[i]); } if (currentVertexElement > 3) { - StoreVertexElements2(pVtxOut, outputElt++, 4, vVertexElements); + StoreVertexElements16(pVtxOut, outputElt++, 4, vVertexElements); // reset to the next vVertexElement to output currentVertexElement = 0; } @@ -2524,6 +2072,109 @@ void FetchJit::Shuffle8bpcGatherd2(Shuffle8bpcArgs &args) } } +#else +#if USE_SIMD16_SHADERS +void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs &args, bool useVertexID2) +#else +void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs &args) +#endif +{ + // Unpack tuple args + Value*& vGatherResult = std::get<0>(args); + Value* pVtxOut = std::get<1>(args); + const Instruction::CastOps extendType = std::get<2>(args); + const ConversionType conversionType = std::get<3>(args); + uint32_t ¤tVertexElement = std::get<4>(args); + uint32_t &outputElt = std::get<5>(args); + const ComponentEnable compMask = std::get<6>(args); + const ComponentControl(&compCtrl)[4] = std::get<7>(args); + Value* (&vVertexElements)[4] = std::get<8>(args); + const uint32_t(&swizzle)[4] = std::get<9>(args); + + // cast types + Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits + + for (uint32_t i = 0; i < 4; i++) + { + if (!isComponentEnabled(compMask, i)) + continue; + + if (compCtrl[i] == ComponentControl::StoreSrc) + { + std::vector vShuffleMasks[4] = { + { 0, 4, 8, 12, 16, 20, 24, 28 }, // x + { 1, 5, 9, 13, 17, 21, 25, 29 }, // y + { 2, 6, 10, 14, 18, 22, 26, 30 }, // z + { 3, 7, 11, 15, 19, 23, 27, 31 }, // w + }; + + Value *val = VSHUFFLE(BITCAST(vGatherResult, v32x8Ty), + UndefValue::get(v32x8Ty), + vShuffleMasks[swizzle[i]]); + + if ((extendType == Instruction::CastOps::SExt) || + (extendType == Instruction::CastOps::SIToFP)) { + switch (conversionType) + { + case CONVERT_NORMALIZED: + val = FMUL(SI_TO_FP(val, mSimdFP32Ty), VIMMED1((float)(1.0 / 127.0))); + break; + case CONVERT_SSCALED: + val = SI_TO_FP(val, mSimdFP32Ty); + break; + case CONVERT_USCALED: + SWR_INVALID("Type should not be sign extended!"); + break; + default: + SWR_ASSERT(conversionType == CONVERT_NONE); + val = S_EXT(val, mSimdInt32Ty); + break; + } + } + else if ((extendType == Instruction::CastOps::ZExt) || + (extendType == Instruction::CastOps::UIToFP)) { + switch (conversionType) + { + case CONVERT_NORMALIZED: + val = FMUL(UI_TO_FP(val, mSimdFP32Ty), VIMMED1((float)(1.0 / 255.0))); + break; + case CONVERT_SSCALED: + SWR_INVALID("Type should not be zero extended!"); + break; + case CONVERT_USCALED: + val = UI_TO_FP(val, mSimdFP32Ty); + break; + default: + SWR_ASSERT(conversionType == CONVERT_NONE); + val = Z_EXT(val, mSimdInt32Ty); + break; + } + } + else + { + SWR_INVALID("Unsupported conversion type"); + } + + vVertexElements[currentVertexElement++] = val; + } + else + { +#if USE_SIMD16_SHADERS + vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2); +#else + vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]); +#endif + } + + if (currentVertexElement > 3) + { + StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements); + // reset to the next vVertexElement to output + currentVertexElement = 0; + } + } +} + #endif ////////////////////////////////////////////////////////////////////////// /// @brief Takes a SIMD of gathered 16bpc verts, zero or sign extends, @@ -2539,11 +2190,8 @@ void FetchJit::Shuffle8bpcGatherd2(Shuffle8bpcArgs &args) /// @param compMask - component packing mask /// @param compCtrl - component control val /// @param vVertexElements[4] - vertex components to output -#if USE_SIMD16_SHADERS -void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args, bool useVertexID2) -#else -void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args) -#endif +#if USE_SIMD16_GATHERS +void FetchJit::Shuffle16bpcGather16(Shuffle16bpcArgs &args) { // Unpack tuple args Value* (&vGatherResult)[2] = std::get<0>(args); @@ -2557,45 +2205,63 @@ void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args) Value* (&vVertexElements)[4] = std::get<8>(args); // cast types - Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth); - Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits + Type *vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth); + Type *v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits // have to do extra work for sign extending - if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP)|| - (extendType == Instruction::CastOps::FPExt)) + if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP) || (extendType == Instruction::CastOps::FPExt)) { // is this PP float? bool bFP = (extendType == Instruction::CastOps::FPExt) ? true : false; - Type* v8x16Ty = VectorType::get(mInt16Ty, 8); // 8x16bit in a 128bit lane - Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits + Type *v8x16Ty = VectorType::get(mInt16Ty, 8); // 8x16bit in a 128bit lane + Type *v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits // shuffle mask - Value* vConstMask = C({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, - 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15}); - Value* vi128XY = nullptr; - if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){ - Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult[0], v32x8Ty), vConstMask), vGatherTy); - // after pshufb: group components together in each 128bit lane - // 256i - 0 1 2 3 4 5 6 7 - // xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy - - vi128XY = BITCAST(PERMD(vShufResult, C({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy); - // after PERMD: move and pack xy components into each 128bit lane - // 256i - 0 1 2 3 4 5 6 7 - // xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy - } - + Value *vConstMask = C({ 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, + 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 }); + Value *vi128XY_lo = nullptr; + Value *vi128XY_hi = nullptr; + if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)) + { + // SIMD16 PSHUFB isnt part of AVX-512F, so split into SIMD8 for the sake of KNL, for now.. + + Value *vGatherResult_lo = EXTRACT_16(vGatherResult[0], 0); + Value *vGatherResult_hi = EXTRACT_16(vGatherResult[0], 1); + + Value *vShufResult_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy); + Value *vShufResult_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy); + + // after pshufb: group components together in each 128bit lane + // 256i - 0 1 2 3 4 5 6 7 + // xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy + + vi128XY_lo = BITCAST(PERMD(vShufResult_lo, C({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy); + vi128XY_hi = BITCAST(PERMD(vShufResult_hi, C({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy); + + // after PERMD: move and pack xy components into each 128bit lane + // 256i - 0 1 2 3 4 5 6 7 + // xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy + } + // do the same for zw components - Value* vi128ZW = nullptr; - if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){ - Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult[1], v32x8Ty), vConstMask), vGatherTy); - vi128ZW = BITCAST(PERMD(vShufResult, C({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy); + Value *vi128ZW_lo = nullptr; + Value *vi128ZW_hi = nullptr; + if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)) + { + Value *vGatherResult_lo = EXTRACT_16(vGatherResult[1], 0); + Value *vGatherResult_hi = EXTRACT_16(vGatherResult[1], 1); + + Value *vShufResult_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy); + Value *vShufResult_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy); + + vi128ZW_lo = BITCAST(PERMD(vShufResult_lo, C({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy); + vi128ZW_hi = BITCAST(PERMD(vShufResult_hi, C({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy); } // init denormalize variables if needed Instruction::CastOps IntToFpCast; - Value* conversionFactor; + Value *conversionFactor; switch (conversionType) { @@ -2627,35 +2293,43 @@ void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args) // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1; // if x or y, use vi128XY permute result, else use vi128ZW - Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW; + Value *selectedPermute_lo = (i < 2) ? vi128XY_lo : vi128ZW_lo; + Value *selectedPermute_hi = (i < 2) ? vi128XY_hi : vi128ZW_hi; - if (bFP) { + if (bFP) + { // extract 128 bit lanes to sign extend each component - vVertexElements[currentVertexElement] = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty)); + Value *temp_lo = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v8x16Ty)); + Value *temp_hi = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v8x16Ty)); + + vVertexElements[currentVertexElement] = JOIN_16(temp_lo, temp_hi); } - else { + else + { // extract 128 bit lanes to sign extend each component - vVertexElements[currentVertexElement] = PMOVSXWD(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty)); + Value *temp_lo = PMOVSXWD(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v8x16Ty)); + Value *temp_hi = PMOVSXWD(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v8x16Ty)); // denormalize if needed - if (conversionType != CONVERT_NONE) { - vVertexElements[currentVertexElement] = FMUL(CAST(IntToFpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor); + if (conversionType != CONVERT_NONE) + { + temp_lo = FMUL(CAST(IntToFpCast, temp_lo, mSimdFP32Ty), conversionFactor); + temp_hi = FMUL(CAST(IntToFpCast, temp_hi, mSimdFP32Ty), conversionFactor); } + + vVertexElements[currentVertexElement] = JOIN_16(temp_lo, temp_hi); } - currentVertexElement++; + + currentVertexElement += 1; } else { -#if USE_SIMD16_SHADERS - vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2); -#else - vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]); -#endif + vVertexElements[currentVertexElement++] = GenerateCompCtrlVector16(compCtrl[i]); } if (currentVertexElement > 3) { - StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements); + StoreVertexElements16(pVtxOut, outputElt++, 4, vVertexElements); // reset to the next vVertexElement to output currentVertexElement = 0; } @@ -2666,17 +2340,20 @@ void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args) else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP)) { // pshufb masks for each component - Value* vConstMask[2]; - if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2)){ + Value *vConstMask[2]; + + if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2)) + { // x/z shuffle mask - vConstMask[0] = C({0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, - 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, }); + vConstMask[0] = C({ 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, + 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, }); } - - if(isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3)){ + + if (isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3)) + { // y/w shuffle mask - vConstMask[1] = C({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1, - 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1}); + vConstMask[1] = C({ 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1, + 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1 }); } // init denormalize variables if needed @@ -2715,7 +2392,14 @@ void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args) // if x or y, use vi128XY permute result, else use vi128ZW uint32_t selectedGather = (i < 2) ? 0 : 1; - vVertexElements[currentVertexElement] = BITCAST(PSHUFB(BITCAST(vGatherResult[selectedGather], v32x8Ty), vConstMask[selectedMask]), vGatherTy); + // SIMD16 PSHUFB isnt part of AVX-512F, so split into SIMD8 for the sake of KNL, for now.. + + Value *vGatherResult_lo = EXTRACT_16(vGatherResult[selectedGather], 0); + Value *vGatherResult_hi = EXTRACT_16(vGatherResult[selectedGather], 1); + + Value *temp_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask[selectedMask]), vGatherTy); + Value *temp_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask[selectedMask]), vGatherTy); + // after pshufb mask for x channel; z uses the same shuffle from the second gather // 256i - 0 1 2 3 4 5 6 7 // xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00 @@ -2723,22 +2407,22 @@ void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args) // denormalize if needed if (conversionType != CONVERT_NONE) { - vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor); + temp_lo = FMUL(CAST(fpCast, temp_lo, mSimdFP32Ty), conversionFactor); + temp_hi = FMUL(CAST(fpCast, temp_hi, mSimdFP32Ty), conversionFactor); } - currentVertexElement++; + + vVertexElements[currentVertexElement] = JOIN_16(temp_lo, temp_hi); + + currentVertexElement += 1; } else { -#if USE_SIMD16_SHADERS - vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2); -#else - vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]); -#endif + vVertexElements[currentVertexElement++] = GenerateCompCtrlVector16(compCtrl[i]); } if (currentVertexElement > 3) { - StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements); + StoreVertexElements16(pVtxOut, outputElt++, 4, vVertexElements); // reset to the next vVertexElement to output currentVertexElement = 0; } @@ -2751,8 +2435,12 @@ void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args) } } -#if USE_SIMD16_BUILDER -void FetchJit::Shuffle16bpcGather2(Shuffle16bpcArgs &args) +#else +#if USE_SIMD16_SHADERS +void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args, bool useVertexID2) +#else +void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args) +#endif { // Unpack tuple args Value* (&vGatherResult)[2] = std::get<0>(args); @@ -2766,71 +2454,45 @@ void FetchJit::Shuffle16bpcGather2(Shuffle16bpcArgs &args) Value* (&vVertexElements)[4] = std::get<8>(args); // cast types - Type *vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth); - Type *v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits + Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth); + Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits - // have to do extra work for sign extending - if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP) || (extendType == Instruction::CastOps::FPExt)) + // have to do extra work for sign extending + if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP) || + (extendType == Instruction::CastOps::FPExt)) { // is this PP float? bool bFP = (extendType == Instruction::CastOps::FPExt) ? true : false; - Type *v8x16Ty = VectorType::get(mInt16Ty, 8); // 8x16bit in a 128bit lane - Type *v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits - - // shuffle mask - Value *vConstMask = C({ 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, - 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 }); - Value *vi128XY = nullptr; - Value *vi128XY_lo = nullptr; - Value *vi128XY_hi = nullptr; - if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)) - { - // SIMD16 PSHUFB isnt part of AVX-512F, so split into SIMD8 for the sake of KNL, for now.. - - Value *vGatherResult_lo = EXTRACT_16(vGatherResult[0], 0); - Value *vGatherResult_hi = EXTRACT_16(vGatherResult[0], 1); - - Value *vShufResult_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy); - Value *vShufResult_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy); + Type* v8x16Ty = VectorType::get(mInt16Ty, 8); // 8x16bit in a 128bit lane + Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits + // shuffle mask + Value* vConstMask = C({ 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, + 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 }); + Value* vi128XY = nullptr; + if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)) { + Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult[0], v32x8Ty), vConstMask), vGatherTy); // after pshufb: group components together in each 128bit lane // 256i - 0 1 2 3 4 5 6 7 // xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy - vi128XY_lo = BITCAST(PERMD(vShufResult_lo, C({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy); - vi128XY_hi = BITCAST(PERMD(vShufResult_hi, C({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy); - + vi128XY = BITCAST(PERMD(vShufResult, C({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy); // after PERMD: move and pack xy components into each 128bit lane // 256i - 0 1 2 3 4 5 6 7 // xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy -#if 0 - vi128XY = JOIN_16(vi128XY_lo, vi128XY_hi); -#endif } // do the same for zw components - Value *vi128ZW = nullptr; - Value *vi128ZW_lo = nullptr; - Value *vi128ZW_hi = nullptr; - if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)) - { - Value *vGatherResult_lo = EXTRACT_16(vGatherResult[1], 0); - Value *vGatherResult_hi = EXTRACT_16(vGatherResult[1], 1); - - Value *vShufResult_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy); - Value *vShufResult_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy); - - vi128ZW_lo = BITCAST(PERMD(vShufResult_lo, C({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy); - vi128ZW_hi = BITCAST(PERMD(vShufResult_hi, C({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy); -#if 0 - vi128ZW = JOIN_16(vi128ZW_lo, vi128ZW_hi); -#endif + Value* vi128ZW = nullptr; + if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)) { + Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult[1], v32x8Ty), vConstMask), vGatherTy); + vi128ZW = BITCAST(PERMD(vShufResult, C({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy); } // init denormalize variables if needed Instruction::CastOps IntToFpCast; - Value *conversionFactor; + Value* conversionFactor; switch (conversionType) { @@ -2862,43 +2524,35 @@ void FetchJit::Shuffle16bpcGather2(Shuffle16bpcArgs &args) // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1 uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1; // if x or y, use vi128XY permute result, else use vi128ZW - Value *selectedPermute_lo = (i < 2) ? vi128XY_lo : vi128ZW_lo; - Value *selectedPermute_hi = (i < 2) ? vi128XY_hi : vi128ZW_hi; + Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW; - if (bFP) - { + if (bFP) { // extract 128 bit lanes to sign extend each component - Value *temp_lo = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v8x16Ty)); - Value *temp_hi = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v8x16Ty)); - - vVertexElements[currentVertexElement] = JOIN_16(temp_lo, temp_hi); + vVertexElements[currentVertexElement] = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty)); } - else - { + else { // extract 128 bit lanes to sign extend each component - Value *temp_lo = PMOVSXWD(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v8x16Ty)); - Value *temp_hi = PMOVSXWD(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v8x16Ty)); + vVertexElements[currentVertexElement] = PMOVSXWD(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty)); // denormalize if needed - if (conversionType != CONVERT_NONE) - { - temp_lo = FMUL(CAST(IntToFpCast, temp_lo, mSimdFP32Ty), conversionFactor); - temp_hi = FMUL(CAST(IntToFpCast, temp_hi, mSimdFP32Ty), conversionFactor); + if (conversionType != CONVERT_NONE) { + vVertexElements[currentVertexElement] = FMUL(CAST(IntToFpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor); } - - vVertexElements[currentVertexElement] = JOIN_16(temp_lo, temp_hi); } - - currentVertexElement += 1; + currentVertexElement++; } else { - vVertexElements[currentVertexElement++] = GenerateCompCtrlVector2(compCtrl[i]); +#if USE_SIMD16_SHADERS + vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2); +#else + vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]); +#endif } if (currentVertexElement > 3) { - StoreVertexElements2(pVtxOut, outputElt++, 4, vVertexElements); + StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements); // reset to the next vVertexElement to output currentVertexElement = 0; } @@ -2909,17 +2563,14 @@ void FetchJit::Shuffle16bpcGather2(Shuffle16bpcArgs &args) else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP)) { // pshufb masks for each component - Value *vConstMask[2]; - - if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2)) - { + Value* vConstMask[2]; + if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2)) { // x/z shuffle mask vConstMask[0] = C({ 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, }); } - if (isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3)) - { + if (isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3)) { // y/w shuffle mask vConstMask[1] = C({ 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1, 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1 }); @@ -2961,14 +2612,7 @@ void FetchJit::Shuffle16bpcGather2(Shuffle16bpcArgs &args) // if x or y, use vi128XY permute result, else use vi128ZW uint32_t selectedGather = (i < 2) ? 0 : 1; - // SIMD16 PSHUFB isnt part of AVX-512F, so split into SIMD8 for the sake of KNL, for now.. - - Value *vGatherResult_lo = EXTRACT_16(vGatherResult[selectedGather], 0); - Value *vGatherResult_hi = EXTRACT_16(vGatherResult[selectedGather], 1); - - Value *temp_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask[selectedMask]), vGatherTy); - Value *temp_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask[selectedMask]), vGatherTy); - + vVertexElements[currentVertexElement] = BITCAST(PSHUFB(BITCAST(vGatherResult[selectedGather], v32x8Ty), vConstMask[selectedMask]), vGatherTy); // after pshufb mask for x channel; z uses the same shuffle from the second gather // 256i - 0 1 2 3 4 5 6 7 // xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00 @@ -2976,22 +2620,22 @@ void FetchJit::Shuffle16bpcGather2(Shuffle16bpcArgs &args) // denormalize if needed if (conversionType != CONVERT_NONE) { - temp_lo = FMUL(CAST(fpCast, temp_lo, mSimdFP32Ty), conversionFactor); - temp_hi = FMUL(CAST(fpCast, temp_hi, mSimdFP32Ty), conversionFactor); + vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor); } - - vVertexElements[currentVertexElement] = JOIN_16(temp_lo, temp_hi); - - currentVertexElement += 1; + currentVertexElement++; } else { - vVertexElements[currentVertexElement++] = GenerateCompCtrlVector2(compCtrl[i]); +#if USE_SIMD16_SHADERS + vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i], useVertexID2); +#else + vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]); +#endif } if (currentVertexElement > 3) { - StoreVertexElements2(pVtxOut, outputElt++, 4, vVertexElements); + StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements); // reset to the next vVertexElement to output currentVertexElement = 0; } @@ -3011,39 +2655,36 @@ void FetchJit::Shuffle16bpcGather2(Shuffle16bpcArgs &args) /// @param outputElt - simdvertex offset in VIN to write to /// @param numEltsToStore - number of simdvertex rows to write out /// @param vVertexElements - LLVM Value*[] simdvertex to write out -void FetchJit::StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4]) +#if USE_SIMD16_GATHERS +void FetchJit::StoreVertexElements16(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4]) { SWR_ASSERT(numEltsToStore <= 4, "Invalid element count."); - for(uint32_t c = 0; c < numEltsToStore; ++c) + for (uint32_t c = 0; c < numEltsToStore; ++c) { // STORE expects FP32 x vWidth type, just bitcast if needed - if(!vVertexElements[c]->getType()->getScalarType()->isFloatTy()) + if (!vVertexElements[c]->getType()->getScalarType()->isFloatTy()) { #if FETCH_DUMP_VERTEX - PRINT("vVertexElements[%d]: 0x%x\n", {C(c), vVertexElements[c]}); + PRINT("vVertexElements[%d]: 0x%x\n", { C(c), vVertexElements[c] }); #endif - vVertexElements[c] = BITCAST(vVertexElements[c], mSimdFP32Ty); + vVertexElements[c] = BITCAST(vVertexElements[c], mSimd16FP32Ty); } #if FETCH_DUMP_VERTEX else { - PRINT("vVertexElements[%d]: %f\n", {C(c), vVertexElements[c]}); + PRINT("vVertexElements[%d]: %f\n", { C(c), vVertexElements[c] }); } #endif // outputElt * 4 = offsetting by the size of a simdvertex // + c offsets to a 32bit x vWidth row within the current vertex -#if USE_SIMD16_SHADERS - Value* dest = GEP(pVtxOut, C(outputElt * 8 + c * 2), "destGEP"); -#else Value* dest = GEP(pVtxOut, C(outputElt * 4 + c), "destGEP"); -#endif STORE(vVertexElements[c], dest); } } -#if USE_SIMD16_BUILDER -void FetchJit::StoreVertexElements2(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4]) +#else +void FetchJit::StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4]) { SWR_ASSERT(numEltsToStore <= 4, "Invalid element count."); @@ -3055,7 +2696,7 @@ void FetchJit::StoreVertexElements2(Value* pVtxOut, const uint32_t outputElt, co #if FETCH_DUMP_VERTEX PRINT("vVertexElements[%d]: 0x%x\n", { C(c), vVertexElements[c] }); #endif - vVertexElements[c] = BITCAST(vVertexElements[c], mSimd16FP32Ty); + vVertexElements[c] = BITCAST(vVertexElements[c], mSimdFP32Ty); } #if FETCH_DUMP_VERTEX else @@ -3065,7 +2706,11 @@ void FetchJit::StoreVertexElements2(Value* pVtxOut, const uint32_t outputElt, co #endif // outputElt * 4 = offsetting by the size of a simdvertex // + c offsets to a 32bit x vWidth row within the current vertex +#if USE_SIMD16_SHADERS + Value* dest = GEP(pVtxOut, C(outputElt * 8 + c * 2), "destGEP"); +#else Value* dest = GEP(pVtxOut, C(outputElt * 4 + c), "destGEP"); +#endif STORE(vVertexElements[c], dest); } } @@ -3075,21 +2720,56 @@ void FetchJit::StoreVertexElements2(Value* pVtxOut, const uint32_t outputElt, co /// @brief Generates a constant vector of values based on the /// ComponentControl value /// @param ctrl - ComponentControl value +#if USE_SIMD16_GATHERS +Value *FetchJit::GenerateCompCtrlVector16(const ComponentControl ctrl) +{ + switch (ctrl) + { + case NoStore: + return VUNDEF_I_16(); + case Store0: + return VIMMED1_16(0); + case Store1Fp: + return VIMMED1_16(1.0f); + case Store1Int: + return VIMMED1_16(1); + case StoreVertexId: + { + Value *pId_lo = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID })), mSimdFP32Ty); + Value *pId_hi = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID2 })), mSimdFP32Ty); + + Value *pId = JOIN_16(pId_lo, pId_hi); + + return pId; + } + case StoreInstanceId: + { + Value *pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CurInstance })), mFP32Ty); + return VBROADCAST_16(pId); + } + case StoreSrc: + default: + SWR_INVALID("Invalid component control"); + return VUNDEF_I_16(); + } +} + +#else #if USE_SIMD16_SHADERS -Value* FetchJit::GenerateCompCtrlVector(const ComponentControl ctrl, bool useVertexID2) +Value *FetchJit::GenerateCompCtrlVector(const ComponentControl ctrl, bool useVertexID2) #else -Value* FetchJit::GenerateCompCtrlVector(const ComponentControl ctrl) +Value *FetchJit::GenerateCompCtrlVector(const ComponentControl ctrl) #endif { - switch(ctrl) + switch (ctrl) { - case NoStore: + case NoStore: return VUNDEF_I(); - case Store0: + case Store0: return VIMMED1(0); - case Store1Fp: + case Store1Fp: return VIMMED1(1.0f); - case Store1Int: + case Store1Int: return VIMMED1(1); case StoreVertexId: { @@ -3114,41 +2794,9 @@ Value* FetchJit::GenerateCompCtrlVector(const ComponentControl ctrl) return VBROADCAST(pId); } case StoreSrc: - default: - SWR_INVALID("Invalid component control"); return VUNDEF_I(); - } -} - -#if USE_SIMD16_BUILDER -Value* FetchJit::GenerateCompCtrlVector2(const ComponentControl ctrl) -{ - switch (ctrl) - { - case NoStore: - return VUNDEF_I_16(); - case Store0: - return VIMMED1_16(0); - case Store1Fp: - return VIMMED1_16(1.0f); - case Store1Int: - return VIMMED1_16(1); - case StoreVertexId: - { - Value *pId_lo = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID })), mSimdFP32Ty); - Value *pId_hi = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID2 })), mSimdFP32Ty); - - Value *pId = JOIN_16(pId_lo, pId_hi); - - return pId; - } - case StoreInstanceId: - { - Value *pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CurInstance })), mFP32Ty); - return VBROADCAST_16(pId); - } - case StoreSrc: - default: - SWR_INVALID("Invalid component control"); return VUNDEF_I_16(); + default: + SWR_INVALID("Invalid component control"); + return VUNDEF_I(); } } -- 2.7.4