From 549b9d2e9f1547af3fb061a7956b04fb30870a6d Mon Sep 17 00:00:00 2001 From: Tim Rowley Date: Mon, 20 Mar 2017 12:17:07 -0500 Subject: [PATCH] swr: [rasterizer core] SIMD16 Frontend WIP Fix GS and streamout. Reviewed-by: George Kyriazis --- src/gallium/drivers/swr/rasterizer/core/clip.h | 61 ++++++++++++++ .../drivers/swr/rasterizer/core/frontend.cpp | 97 +++++++++++++++++----- 2 files changed, 136 insertions(+), 22 deletions(-) diff --git a/src/gallium/drivers/swr/rasterizer/core/clip.h b/src/gallium/drivers/swr/rasterizer/core/clip.h index eec6570..3a79d6a 100644 --- a/src/gallium/drivers/swr/rasterizer/core/clip.h +++ b/src/gallium/drivers/swr/rasterizer/core/clip.h @@ -376,7 +376,16 @@ public: const simdscalar vMask = _mm256_set_ps(0, -1, -1, -1, -1, -1, -1, -1); uint32_t numClippedPrims = 0; +#if USE_SIMD16_FRONTEND + const uint32_t numPrims = pa.NumPrims(); + const uint32_t numPrims_lo = std::min(numPrims, KNOB_SIMD_WIDTH); + + SWR_ASSERT(numPrims <= numPrims_lo); + + for (uint32_t inputPrim = 0; inputPrim < numPrims_lo; ++inputPrim) +#else for (uint32_t inputPrim = 0; inputPrim < pa.NumPrims(); ++inputPrim) +#endif { uint32_t numEmittedVerts = pVertexCount[inputPrim]; if (numEmittedVerts < NumVertsPerPrim) @@ -391,13 +400,28 @@ public: // tranpose clipper output so that each lane's vertices are in SIMD order // set aside space for 2 vertices, as the PA will try to read up to 16 verts // for triangle fan +#if USE_SIMD16_FRONTEND + simd16vertex transposedPrims[2]; +#else simdvertex transposedPrims[2]; +#endif // transpose pos uint8_t* pBase = (uint8_t*)(&vertices[0].attrib[VERTEX_POSITION_SLOT]) + sizeof(float) * inputPrim; + +#if USE_SIMD16_FRONTEND + // TEMPORARY WORKAROUND for bizarre VS2015 code-gen bug - use dx11_clipping_03-09 failures to check for existence of bug + static const float *dummy = reinterpret_cast(pBase); +#endif + for (uint32_t c = 0; c < 4; ++c) { +#if USE_SIMD16_FRONTEND + simdscalar temp = _simd_mask_i32gather_ps(_simd_setzero_ps(), (const float *)pBase, vOffsets, vMask, 1); + transposedPrims[0].attrib[VERTEX_POSITION_SLOT][c] = _simd16_insert_ps(_simd16_setzero_ps(), temp, 0); +#else transposedPrims[0].attrib[VERTEX_POSITION_SLOT][c] = _simd_mask_i32gather_ps(_mm256_undefined_ps(), (const float*)pBase, vOffsets, vMask, 1); +#endif pBase += sizeof(simdscalar); } @@ -408,7 +432,12 @@ public: uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + attrib; for (uint32_t c = 0; c < 4; ++c) { +#if USE_SIMD16_FRONTEND + simdscalar temp = _simd_mask_i32gather_ps(_simd_setzero_ps(), (const float *)pBase, vOffsets, vMask, 1); + transposedPrims[0].attrib[attribSlot][c] = _simd16_insert_ps(_simd16_setzero_ps(), temp, 0); +#else transposedPrims[0].attrib[attribSlot][c] = _simd_mask_i32gather_ps(_mm256_undefined_ps(), (const float*)pBase, vOffsets, vMask, 1); +#endif pBase += sizeof(simdscalar); } } @@ -419,7 +448,12 @@ public: pBase = (uint8_t*)(&vertices[0].attrib[VERTEX_CLIPCULL_DIST_LO_SLOT]) + sizeof(float) * inputPrim; for (uint32_t c = 0; c < 4; ++c) { +#if USE_SIMD16_FRONTEND + simdscalar temp = _simd_mask_i32gather_ps(_simd_setzero_ps(), (const float *)pBase, vOffsets, vMask, 1); + transposedPrims[0].attrib[VERTEX_CLIPCULL_DIST_LO_SLOT][c] = _simd16_insert_ps(_simd16_setzero_ps(), temp, 0); +#else transposedPrims[0].attrib[VERTEX_CLIPCULL_DIST_LO_SLOT][c] = _simd_mask_i32gather_ps(_mm256_undefined_ps(), (const float*)pBase, vOffsets, vMask, 1); +#endif pBase += sizeof(simdscalar); } } @@ -429,7 +463,12 @@ public: pBase = (uint8_t*)(&vertices[0].attrib[VERTEX_CLIPCULL_DIST_HI_SLOT]) + sizeof(float) * inputPrim; for (uint32_t c = 0; c < 4; ++c) { +#if USE_SIMD16_FRONTEND + simdscalar temp = _simd_mask_i32gather_ps(_simd_setzero_ps(), (const float *)pBase, vOffsets, vMask, 1); + transposedPrims[0].attrib[VERTEX_CLIPCULL_DIST_HI_SLOT][c] = _simd16_insert_ps(_simd16_setzero_ps(), temp, 0); +#else transposedPrims[0].attrib[VERTEX_CLIPCULL_DIST_HI_SLOT][c] = _simd_mask_i32gather_ps(_mm256_undefined_ps(), (const float*)pBase, vOffsets, vMask, 1); +#endif pBase += sizeof(simdscalar); } } @@ -440,6 +479,27 @@ public: { do { +#if USE_SIMD16_FRONTEND + simd16vector attrib_simd16[NumVertsPerPrim]; + bool assemble = clipPa.Assemble_simd16(VERTEX_POSITION_SLOT, attrib_simd16); + + if (assemble) + { + static const uint32_t primMaskMap[] = { 0x0, 0x1, 0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f, 0xff }; + + simdvector attrib[NumVertsPerPrim]; + for (uint32_t i = 0; i < NumVertsPerPrim; i += 1) + { + for (uint32_t j = 0; j < 4; j += 1) + { + attrib[i][j] = _simd16_extract_ps(attrib_simd16[i][j], 0); + } + } + + clipPa.useAlternateOffset = false; + pfnBinFunc(this->pDC, clipPa, this->workerId, attrib, primMaskMap[numEmittedPrims], _simd_set1_epi32(pPrimitiveId[inputPrim]), _simd_set1_epi32(pViewportIdx[inputPrim])); + } +#else simdvector attrib[NumVertsPerPrim]; bool assemble = clipPa.Assemble(VERTEX_POSITION_SLOT, attrib); if (assemble) @@ -447,6 +507,7 @@ public: static const uint32_t primMaskMap[] = { 0x0, 0x1, 0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f, 0xff }; pfnBinFunc(this->pDC, clipPa, this->workerId, attrib, primMaskMap[numEmittedPrims], _simd_set1_epi32(pPrimitiveId[inputPrim]), _simd_set1_epi32(pViewportIdx[inputPrim])); } +#endif } while (clipPa.NextPrim()); } } diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp index acbd779..eb52594 100644 --- a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp @@ -495,6 +495,9 @@ static void StreamOut( PA_STATE& pa, uint32_t workerId, uint32_t* pPrimData, +#if USE_SIMD16_FRONTEND + uint32_t numPrims_simd8, +#endif uint32_t streamIndex) { SWR_CONTEXT *pContext = pDC->pContext; @@ -517,7 +520,12 @@ static void StreamOut( soContext.pBuffer[i] = &state.soBuffer[i]; } +#if USE_SIMD16_FRONTEND + uint32_t numPrims = numPrims_simd8; +#else uint32_t numPrims = pa.NumPrims(); +#endif + for (uint32_t primIndex = 0; primIndex < numPrims; ++primIndex) { DWORD slot = 0; @@ -604,7 +612,7 @@ INLINE static T RoundDownEven(T value) } ////////////////////////////////////////////////////////////////////////// -/// Pack pairs of simdvertexes into simd16vertexes, in-place +/// Pack pairs of simdvertexes into simd16vertexes, assume non-overlapping /// /// vertexCount is in terms of the source simdvertexes and must be even /// @@ -612,10 +620,10 @@ INLINE static T RoundDownEven(T value) /// /// note: the stride between vertexes is determinded by KNOB_NUM_ATTRIBUTES /// -void PackPairsOfSimdVertexIntoSimd16VertexInPlace(simdvertex *vertex, uint32_t vertexCount, uint32_t attribCount) +void PackPairsOfSimdVertexIntoSimd16Vertex(simd16vertex *vertex_simd16, const simdvertex *vertex, uint32_t vertexCount, uint32_t attribCount) { SWR_ASSERT(vertex); - SWR_ASSERT(IsEven(vertexCount)); + SWR_ASSERT(vertex_simd16); SWR_ASSERT(attribCount <= KNOB_NUM_ATTRIBUTES); simd16vertex temp; @@ -626,14 +634,18 @@ void PackPairsOfSimdVertexIntoSimd16VertexInPlace(simdvertex *vertex, uint32_t v { for (uint32_t k = 0; k < 4; k += 1) { - temp.attrib[j][k] = _simd16_insert_ps(_simd16_setzero_ps(), vertex[i].attrib[j][k], 0); - temp.attrib[j][k] = _simd16_insert_ps(temp.attrib[j][k], vertex[i + 1].attrib[j][k], 1); + temp.attrib[j][k] = _simd16_insert_ps(_simd16_setzero_ps(), vertex[i].attrib[j][k], 0); + + if ((i + 1) < vertexCount) + { + temp.attrib[j][k] = _simd16_insert_ps(temp.attrib[j][k], vertex[i + 1].attrib[j][k], 1); + } } } for (uint32_t j = 0; j < attribCount; j += 1) { - reinterpret_cast(vertex)[i >> 1].attrib[j] = temp.attrib[j]; + vertex_simd16[i >> 1].attrib[j] = temp.attrib[j]; } } } @@ -704,17 +716,16 @@ void ProcessStreamIdBuffer(uint32_t stream, uint8_t* pStreamIdBase, uint32_t num THREAD SWR_GS_CONTEXT tlsGsContext; +#if USE_SIMD16_FRONTEND +THREAD simd16vertex tempVertex_simd16[128]; + +#endif template struct GsBufferInfo { GsBufferInfo(const SWR_GS_STATE &gsState) { -#if USE_SIMD16_FRONTEND - // TEMPORARY: pad up to multiple of two, to support in-place conversion from simdvertex to simd16vertex - const uint32_t vertexCount = RoundUpEven(gsState.maxNumVerts); -#else const uint32_t vertexCount = gsState.maxNumVerts; -#endif const uint32_t vertexStride = sizeof(SIMDVERTEX); const uint32_t numSimdBatches = (vertexCount + SIMD_WIDTH - 1) / SIMD_WIDTH; @@ -896,18 +907,19 @@ static void GeometryShaderStage( } #if USE_SIMD16_FRONTEND - // TEMPORARY: GS outputs simdvertex, PA inputs simd16vertex, so convert simdvertex to simd16vertex, in-place + // TEMPORARY: GS outputs simdvertex, PA inputs simd16vertex, so convert simdvertex to simd16vertex - const uint32_t attribCount = VERTEX_ATTRIB_START_SLOT + pState->numInputAttribs; + SWR_ASSERT(numEmittedVerts <= 256); - PackPairsOfSimdVertexIntoSimd16VertexInPlace( - reinterpret_cast(pBase), - RoundUpEven(numEmittedVerts), // simd8 -> simd16 - attribCount); + PackPairsOfSimdVertexIntoSimd16Vertex( + tempVertex_simd16, + reinterpret_cast(pBase), + numEmittedVerts, + KNOB_NUM_ATTRIBUTES); #endif #if USE_SIMD16_FRONTEND - PA_STATE_CUT gsPa(pDC, pBase, numEmittedVerts, reinterpret_cast(pCutBuffer), numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts); + PA_STATE_CUT gsPa(pDC, reinterpret_cast(tempVertex_simd16), numEmittedVerts, reinterpret_cast(pCutBuffer), numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts); #else PA_STATE_CUT gsPa(pDC, pBase, numEmittedVerts, pCutBuffer, numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts); @@ -932,7 +944,22 @@ static void GeometryShaderStage( if (HasStreamOutT::value) { +#if USE_SIMD16_FRONTEND + const uint32_t numPrims = gsPa.NumPrims(); + const uint32_t numPrims_lo = std::min(numPrims, KNOB_SIMD_WIDTH); + const uint32_t numPrims_hi = std::max(numPrims, KNOB_SIMD_WIDTH) - KNOB_SIMD_WIDTH; + + gsPa.useAlternateOffset = false; + StreamOut(pDC, gsPa, workerId, pSoPrimData, numPrims_lo, stream); + + if (numPrims_hi) + { + gsPa.useAlternateOffset = true; + StreamOut(pDC, gsPa, workerId, pSoPrimData, numPrims_hi, stream); + } +#else StreamOut(pDC, gsPa, workerId, pSoPrimData, stream); +#endif } if (HasRastT::value && state.soState.streamToRasterizer == stream) @@ -1349,7 +1376,18 @@ static void TessellationStages( { if (HasStreamOutT::value) { +#if USE_SIMD16_FRONTEND + tessPa.useAlternateOffset = false; + StreamOut(pDC, tessPa, workerId, pSoPrimData, numPrims_lo, 0); + + if (numPrims_hi) + { + tessPa.useAlternateOffset = true; + StreamOut(pDC, tessPa, workerId, pSoPrimData, numPrims_hi, 0); + } +#else StreamOut(pDC, tessPa, workerId, pSoPrimData, 0); +#endif } if (HasRastT::value) @@ -1487,7 +1525,11 @@ void ProcessDraw( void* pStreamCutBuffer = nullptr; if (HasGeometryShaderT::value) { +#if USE_SIMD16_FRONTEND + AllocateGsBuffers(pDC, state, &pGsOut, &pCutBuffer, &pStreamCutBuffer); +#else AllocateGsBuffers(pDC, state, &pGsOut, &pCutBuffer, &pStreamCutBuffer); +#endif } if (HasTessellationT::value) @@ -1638,9 +1680,9 @@ void ProcessDraw( // copy SIMD vout_lo to lo part of SIMD16 vout { - const uint32_t voutNumSlots = VERTEX_ATTRIB_START_SLOT + state.feNumAttributes; + const uint32_t attribCount = sizeof(vout.attrib) / sizeof(vout.attrib[0]); - for (uint32_t i = 0; i < voutNumSlots; i += 1) + for (uint32_t i = 0; i < attribCount; i += 1) { for (uint32_t j = 0; j < 4; j += 1) { @@ -1655,9 +1697,9 @@ void ProcessDraw( // copy SIMD vout_hi to hi part of SIMD16 vout { - const uint32_t voutNumSlots = VERTEX_ATTRIB_START_SLOT + state.feNumAttributes; + const uint32_t attribCount = sizeof(vout.attrib) / sizeof(vout.attrib[0]); - for (uint32_t i = 0; i < voutNumSlots; i += 1) + for (uint32_t i = 0; i < attribCount; i += 1) { for (uint32_t j = 0; j < 4; j += 1) { @@ -1732,8 +1774,19 @@ void ProcessDraw( // If streamout is enabled then stream vertices out to memory. if (HasStreamOutT::value) { +#if 1 + pa.useAlternateOffset = false; + StreamOut(pDC, pa, workerId, pSoPrimData, numPrims_lo, 0); + + if (numPrims_hi) + { + pa.useAlternateOffset = true; + StreamOut(pDC, pa, workerId, pSoPrimData, numPrims_hi, 0); + } +#else pa.useAlternateOffset = false; // StreamOut() is SIMD16-compatible.. StreamOut(pDC, pa, workerId, pSoPrimData, 0); +#endif } if (HasRastT::value) -- 2.7.4