From a6237e4b7fa4c14766b15fb3c638dce1e4b12ad9 Mon Sep 17 00:00:00 2001 From: Tim Rowley Date: Fri, 9 Jun 2017 18:37:27 -0500 Subject: [PATCH] swr/rast: Fix read-back of viewport array index Binner/clipper read viewport array index from the vertex header as needed. Move viewport state to BACKEND_STATE. Reviewed-by: Bruce Cherniak --- src/gallium/drivers/swr/rasterizer/core/api.cpp | 4 +- src/gallium/drivers/swr/rasterizer/core/binner.cpp | 129 +++++++++++++++++---- src/gallium/drivers/swr/rasterizer/core/clip.cpp | 24 ++-- src/gallium/drivers/swr/rasterizer/core/clip.h | 63 +++++++--- src/gallium/drivers/swr/rasterizer/core/context.h | 4 +- .../drivers/swr/rasterizer/core/frontend.cpp | 49 +------- src/gallium/drivers/swr/rasterizer/core/frontend.h | 8 +- src/gallium/drivers/swr/rasterizer/core/state.h | 4 +- src/gallium/drivers/swr/swr_shader.cpp | 2 - src/gallium/drivers/swr/swr_state.cpp | 12 +- 10 files changed, 182 insertions(+), 117 deletions(-) diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp b/src/gallium/drivers/swr/rasterizer/core/api.cpp index eacce1c..ae9ced2 100644 --- a/src/gallium/drivers/swr/rasterizer/core/api.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp @@ -680,7 +680,7 @@ void SwrSetBlendFunc( // update guardband multipliers for the viewport void updateGuardbands(API_STATE *pState) { - uint32_t numGbs = pState->backendState.readRenderTargetArrayIndex ? KNOB_NUM_VIEWPORTS_SCISSORS : 1; + uint32_t numGbs = pState->backendState.readViewportArrayIndex ? KNOB_NUM_VIEWPORTS_SCISSORS : 1; for(uint32_t i = 0; i < numGbs; ++i) { @@ -736,7 +736,7 @@ void SwrSetScissorRects( void SetupMacroTileScissors(DRAW_CONTEXT *pDC) { API_STATE *pState = &pDC->pState->state; - uint32_t numScissors = pState->gsState.emitsViewportArrayIndex ? KNOB_NUM_VIEWPORTS_SCISSORS : 1; + uint32_t numScissors = pState->backendState.readViewportArrayIndex ? KNOB_NUM_VIEWPORTS_SCISSORS : 1; pState->scissorsTileAligned = true; for (uint32_t index = 0; index < numScissors; ++index) diff --git a/src/gallium/drivers/swr/rasterizer/core/binner.cpp b/src/gallium/drivers/swr/rasterizer/core/binner.cpp index a73816b..036d8b1 100644 --- a/src/gallium/drivers/swr/rasterizer/core/binner.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/binner.cpp @@ -434,8 +434,7 @@ void BinTriangles( uint32_t workerId, simdvector tri[3], uint32_t triMask, - simdscalari primID, - simdscalari viewportIdx) + simdscalari primID) { SWR_CONTEXT *pContext = pDC->pContext; @@ -451,6 +450,21 @@ void BinTriangles( simdscalar vRecipW1 = _simd_set1_ps(1.0f); simdscalar vRecipW2 = _simd_set1_ps(1.0f); + // Read viewport array index if needed + simdscalari viewportIdx = _simd_set1_epi32(0); + if (state.backendState.readViewportArrayIndex) + { + simdvector vpiAttrib[3]; + pa.Assemble(VERTEX_SGV_SLOT, vpiAttrib); + + // OOB indices => forced to zero. + simdscalari vpai = _simd_castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]); + vpai = _simd_max_epi32(_simd_setzero_si(), vpai); + simdscalari vNumViewports = _simd_set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS); + simdscalari vClearMask = _simd_cmplt_epi32(vpai, vNumViewports); + viewportIdx = _simd_and_si(vClearMask, vpai); + } + if (feState.vpTransformDisable) { // RHW is passed in directly when VP transform is disabled @@ -478,7 +492,7 @@ void BinTriangles( tri[2].v[2] = _simd_mul_ps(tri[2].v[2], vRecipW2); // Viewport transform to screen space coords - if (state.gsState.emitsViewportArrayIndex) + if (state.backendState.readViewportArrayIndex) { viewportTransform<3>(tri, state.vpMatrices, viewportIdx); } @@ -661,7 +675,7 @@ void BinTriangles( // Gather the AOS effective scissor rects based on the per-prim VP index. /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer. simdscalari scisXmin, scisYmin, scisXmax, scisYmax; - if (state.gsState.emitsViewportArrayIndex) + if (state.backendState.readViewportArrayIndex) { GatherScissors::Gather(&state.scissorsInFixedPoint[0], pViewportIndex, scisXmin, scisYmin, scisXmax, scisYmax); @@ -863,8 +877,7 @@ void SIMDAPI BinTriangles_simd16( uint32_t workerId, simd16vector tri[3], uint32_t triMask, - simd16scalari primID, - simd16scalari viewportIdx) + simd16scalari primID) { SWR_CONTEXT *pContext = pDC->pContext; @@ -880,6 +893,20 @@ void SIMDAPI BinTriangles_simd16( simd16scalar vRecipW0 = _simd16_set1_ps(1.0f); simd16scalar vRecipW1 = _simd16_set1_ps(1.0f); simd16scalar vRecipW2 = _simd16_set1_ps(1.0f); + + simd16scalari viewportIdx = _simd16_set1_epi32(0); + if (state.backendState.readViewportArrayIndex) + { + simd16vector vpiAttrib[3]; + pa.Assemble_simd16(VERTEX_SGV_SLOT, vpiAttrib); + + // OOB indices => forced to zero. + simd16scalari vpai = _simd16_castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]); + vpai = _simd16_max_epi32(_simd16_setzero_si(), vpai); + simd16scalari vNumViewports = _simd16_set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS); + simd16scalari vClearMask = _simd16_cmplt_epi32(vpai, vNumViewports); + viewportIdx = _simd16_and_si(vClearMask, vpai); + } if (feState.vpTransformDisable) { @@ -908,7 +935,7 @@ void SIMDAPI BinTriangles_simd16( tri[2].v[2] = _simd16_mul_ps(tri[2].v[2], vRecipW2); // Viewport transform to screen space coords - if (state.gsState.emitsViewportArrayIndex) + if (state.backendState.readViewportArrayIndex) { viewportTransform<3>(tri, state.vpMatrices, viewportIdx); } @@ -1101,7 +1128,7 @@ void SIMDAPI BinTriangles_simd16( /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer. simd16scalari scisXmin, scisYmin, scisXmax, scisYmax; - if (state.gsState.emitsViewportArrayIndex) + if (state.backendState.readViewportArrayIndex) { GatherScissors_simd16::Gather(&state.scissorsInFixedPoint[0], pViewportIndex, scisXmin, scisYmin, scisXmax, scisYmax); @@ -1524,7 +1551,7 @@ void BinPostSetupPoints( // Gather the AOS effective scissor rects based on the per-prim VP index. /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer. simdscalari scisXmin, scisYmin, scisXmax, scisYmax; - if (state.gsState.emitsViewportArrayIndex) + if (state.backendState.readViewportArrayIndex) { GatherScissors::Gather(&state.scissorsInFixedPoint[0], pViewportIndex, scisXmin, scisYmin, scisXmax, scisYmax); @@ -1672,8 +1699,7 @@ void BinPoints( uint32_t workerId, simdvector prim[3], uint32_t primMask, - simdscalari primID, - simdscalari viewportIdx) + simdscalari primID) { simdvector& primVerts = prim[0]; @@ -1681,6 +1707,21 @@ void BinPoints( const SWR_FRONTEND_STATE& feState = state.frontendState; const SWR_RASTSTATE& rastState = state.rastState; + // Read back viewport index if required + simdscalari viewportIdx = _simd_set1_epi32(0); + if (state.backendState.readViewportArrayIndex) + { + simdvector vpiAttrib[1]; + pa.Assemble(VERTEX_SGV_SLOT, vpiAttrib); + simdscalari vpai = _simd_castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]); + + // OOB indices => forced to zero. + vpai = _simd_max_epi32(_simd_setzero_si(), vpai); + simdscalari vNumViewports = _simd_set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS); + simdscalari vClearMask = _simd_cmplt_epi32(vpai, vNumViewports); + viewportIdx = _simd_and_si(vClearMask, vpai); + } + if (!feState.vpTransformDisable) { // perspective divide @@ -1690,7 +1731,7 @@ void BinPoints( primVerts.z = _simd_mul_ps(primVerts.z, vRecipW0); // viewport transform to screen coords - if (state.gsState.emitsViewportArrayIndex) + if (state.backendState.readViewportArrayIndex) { viewportTransform<1>(&primVerts, state.vpMatrices, viewportIdx); } @@ -1898,7 +1939,7 @@ void BinPostSetupPoints_simd16( // Gather the AOS effective scissor rects based on the per-prim VP index. /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer. simd16scalari scisXmin, scisYmin, scisXmax, scisYmax; - if (state.gsState.emitsViewportArrayIndex) + if (state.backendState.readViewportArrayIndex) { GatherScissors_simd16::Gather(&state.scissorsInFixedPoint[0], pViewportIndex, scisXmin, scisYmin, scisXmax, scisYmax); @@ -2040,8 +2081,7 @@ void SIMDAPI BinPoints_simd16( uint32_t workerId, simd16vector prim[3], uint32_t primMask, - simd16scalari primID, - simd16scalari viewportIdx) + simd16scalari primID) { simd16vector& primVerts = prim[0]; @@ -2049,6 +2089,21 @@ void SIMDAPI BinPoints_simd16( const SWR_FRONTEND_STATE& feState = state.frontendState; const SWR_RASTSTATE& rastState = state.rastState; + // Read back viewport index if required + simd16scalari viewportIdx = _simd16_set1_epi32(0); + if (state.backendState.readViewportArrayIndex) + { + simd16vector vpiAttrib[1]; + pa.Assemble_simd16(VERTEX_SGV_SLOT, vpiAttrib); + + // OOB indices => forced to zero. + simd16scalari vpai = _simd16_castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]); + vpai = _simd16_max_epi32(_simd16_setzero_si(), vpai) + simd16scalari vNumViewports = _simd16_set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS); + simd16scalari vClearMask = _simd16_cmplt_epi32(vpai, vNumViewports); + viewportIdx = _simd16_and_si(vClearMask, vpai); + } + if (!feState.vpTransformDisable) { // perspective divide @@ -2059,7 +2114,7 @@ void SIMDAPI BinPoints_simd16( primVerts.z = _simd16_mul_ps(primVerts.z, vRecipW0); // viewport transform to screen coords - if (state.gsState.emitsViewportArrayIndex) + if (state.backendState.readViewportArrayIndex) { viewportTransform<1>(&primVerts, state.vpMatrices, viewportIdx); } @@ -2165,7 +2220,7 @@ void BinPostSetupLines( // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive. simdscalari scisXmin, scisYmin, scisXmax, scisYmax; - if (state.gsState.emitsViewportArrayIndex) + if (state.backendState.readViewportArrayIndex) { GatherScissors::Gather(&state.scissorsInFixedPoint[0], pViewportIndex, scisXmin, scisYmin, scisXmax, scisYmax); @@ -2370,7 +2425,7 @@ void BinPostSetupLines_simd16( // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive. simd16scalari scisXmin, scisYmin, scisXmax, scisYmax; - if (state.gsState.emitsViewportArrayIndex) + if (state.backendState.readViewportArrayIndex) { GatherScissors_simd16::Gather(&state.scissorsInFixedPoint[0], pViewportIndex, scisXmin, scisYmin, scisXmax, scisYmax); @@ -2533,8 +2588,7 @@ void BinLines( uint32_t workerId, simdvector prim[], uint32_t primMask, - simdscalari primID, - simdscalari viewportIdx) + simdscalari primID) { const API_STATE& state = GetApiState(pDC); const SWR_RASTSTATE& rastState = state.rastState; @@ -2542,6 +2596,20 @@ void BinLines( simdscalar vRecipW[2] = { _simd_set1_ps(1.0f), _simd_set1_ps(1.0f) }; + simdscalari viewportIdx = _simd_set1_epi32(0); + if (state.backendState.readViewportArrayIndex) + { + simdvector vpiAttrib[2]; + pa.Assemble(VERTEX_SGV_SLOT, vpiAttrib); + simdscalari vpai = _simd_castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]); + vpai = _simd_max_epi32(_simd_setzero_si(), vpai); + + // OOB indices => forced to zero. + simdscalari vNumViewports = _simd_set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS); + simdscalari vClearMask = _simd_cmplt_epi32(vpai, vNumViewports); + viewportIdx = _simd_and_si(vClearMask, vpai); + } + if (!feState.vpTransformDisable) { // perspective divide @@ -2558,7 +2626,7 @@ void BinLines( prim[1].v[2] = _simd_mul_ps(prim[1].v[2], vRecipW[1]); // viewport transform to screen coords - if (state.gsState.emitsViewportArrayIndex) + if (state.backendState.readViewportArrayIndex) { viewportTransform<2>(prim, state.vpMatrices, viewportIdx); } @@ -2594,8 +2662,7 @@ void SIMDAPI BinLines_simd16( uint32_t workerId, simd16vector prim[3], uint32_t primMask, - simd16scalari primID, - simd16scalari viewportIdx) + simd16scalari primID) { const API_STATE& state = GetApiState(pDC); const SWR_RASTSTATE& rastState = state.rastState; @@ -2603,6 +2670,20 @@ void SIMDAPI BinLines_simd16( simd16scalar vRecipW[2] = { _simd16_set1_ps(1.0f), _simd16_set1_ps(1.0f) }; + simd16scalari viewportIdx = _simd16_set1_epi32(0); + if (state.backendState.readViewportArrayIndex) + { + simd16vector vpiAttrib[2]; + pa.Assemble_simd16(VERTEX_SGV_SLOT, vpiAttrib); + + // OOB indices => forced to zero. + simd16scalari vpai = _simd16_castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]); + vpai = _simd16_max_epi32(_simd16_setzero_si(), vpai); + simd16scalari vNumViewports = _simd16_set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS); + simd16scalari vClearMask = _simd16_cmplt_epi32(vpai, vNumViewports); + viewportIdx = _simd16_and_si(vClearMask, vpai); + } + if (!feState.vpTransformDisable) { // perspective divide @@ -2619,7 +2700,7 @@ void SIMDAPI BinLines_simd16( prim[1].v[2] = _simd16_mul_ps(prim[1].v[2], vRecipW[1]); // viewport transform to screen coords - if (state.gsState.emitsViewportArrayIndex) + if (state.backendState.readViewportArrayIndex) { viewportTransform<2>(prim, state.vpMatrices, viewportIdx); } diff --git a/src/gallium/drivers/swr/rasterizer/core/clip.cpp b/src/gallium/drivers/swr/rasterizer/core/clip.cpp index c93e0fb..bd62b58 100644 --- a/src/gallium/drivers/swr/rasterizer/core/clip.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/clip.cpp @@ -160,35 +160,35 @@ int ClipTriToPlane( const float *pInPts, int numInPts, return i; } -void ClipTriangles(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId, simdscalari viewportIdx) +void ClipTriangles(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId) { SWR_CONTEXT *pContext = pDC->pContext; AR_BEGIN(FEClipTriangles, pDC->drawId); Clipper<3> clipper(workerId, pDC); - clipper.ExecuteStage(pa, prims, primMask, primId, viewportIdx); + clipper.ExecuteStage(pa, prims, primMask, primId); AR_END(FEClipTriangles, 1); } -void ClipLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId, simdscalari viewportIdx) +void ClipLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId) { SWR_CONTEXT *pContext = pDC->pContext; AR_BEGIN(FEClipLines, pDC->drawId); Clipper<2> clipper(workerId, pDC); - clipper.ExecuteStage(pa, prims, primMask, primId, viewportIdx); + clipper.ExecuteStage(pa, prims, primMask, primId); AR_END(FEClipLines, 1); } -void ClipPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId, simdscalari viewportIdx) +void ClipPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId) { SWR_CONTEXT *pContext = pDC->pContext; AR_BEGIN(FEClipPoints, pDC->drawId); Clipper<1> clipper(workerId, pDC); - clipper.ExecuteStage(pa, prims, primMask, primId, viewportIdx); + clipper.ExecuteStage(pa, prims, primMask, primId); AR_END(FEClipPoints, 1); } #if USE_SIMD16_FRONTEND -void SIMDAPI ClipTriangles_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId, simd16scalari viewportIdx) +void SIMDAPI ClipTriangles_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId) { SWR_CONTEXT *pContext = pDC->pContext; AR_BEGIN(FEClipTriangles, pDC->drawId); @@ -198,12 +198,12 @@ void SIMDAPI ClipTriangles_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t work Clipper clipper(workerId, pDC); pa.useAlternateOffset = false; - clipper.ExecuteStage(pa, prims, primMask, primId, viewportIdx); + clipper.ExecuteStage(pa, prims, primMask, primId); AR_END(FEClipTriangles, 1); } -void SIMDAPI ClipLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId, simd16scalari viewportIdx) +void SIMDAPI ClipLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId) { SWR_CONTEXT *pContext = pDC->pContext; AR_BEGIN(FEClipLines, pDC->drawId); @@ -213,12 +213,12 @@ void SIMDAPI ClipLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId Clipper clipper(workerId, pDC); pa.useAlternateOffset = false; - clipper.ExecuteStage(pa, prims, primMask, primId, viewportIdx); + clipper.ExecuteStage(pa, prims, primMask, primId); AR_END(FEClipLines, 1); } -void SIMDAPI ClipPoints_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId, simd16scalari viewportIdx) +void SIMDAPI ClipPoints_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId) { SWR_CONTEXT *pContext = pDC->pContext; AR_BEGIN(FEClipPoints, pDC->drawId); @@ -228,7 +228,7 @@ void SIMDAPI ClipPoints_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerI Clipper clipper(workerId, pDC); pa.useAlternateOffset = false; - clipper.ExecuteStage(pa, prims, primMask, primId, viewportIdx); + clipper.ExecuteStage(pa, prims, primMask, primId); AR_END(FEClipPoints, 1); } diff --git a/src/gallium/drivers/swr/rasterizer/core/clip.h b/src/gallium/drivers/swr/rasterizer/core/clip.h index 9235618..12b52c5 100644 --- a/src/gallium/drivers/swr/rasterizer/core/clip.h +++ b/src/gallium/drivers/swr/rasterizer/core/clip.h @@ -459,7 +459,7 @@ public: #endif // clip SIMD primitives - void ClipSimd(const simdscalar& vPrimMask, const simdscalar& vClipMask, PA_STATE& pa, const simdscalari& vPrimId, const simdscalari& vViewportIdx) + void ClipSimd(const simdscalar& vPrimMask, const simdscalar& vClipMask, PA_STATE& pa, const simdscalari& vPrimId) { // input/output vertex store for clipper simdvertex vertices[7]; // maximum 7 verts generated per triangle @@ -559,7 +559,6 @@ public: uint32_t* pVertexCount = (uint32_t*)&vNumClippedVerts; uint32_t* pPrimitiveId = (uint32_t*)&vPrimId; - uint32_t* pViewportIdx = (uint32_t*)&vViewportIdx; const simdscalari vOffsets = _mm256_set_epi32( 0 * sizeof(simdvertex), // unused lane @@ -697,7 +696,7 @@ public: } clipPa.useAlternateOffset = false; - pfnBinFunc(this->pDC, clipPa, this->workerId, attrib, primMaskMap[numEmittedPrims], _simd_set1_epi32(pPrimitiveId[inputPrim]), _simd_set1_epi32(pViewportIdx[inputPrim])); + pfnBinFunc(this->pDC, clipPa, this->workerId, attrib, primMaskMap[numEmittedPrims], _simd_set1_epi32(pPrimitiveId[inputPrim])); } #else simdvector attrib[NumVertsPerPrim]; @@ -705,7 +704,7 @@ public: if (assemble) { static const uint32_t primMaskMap[] = { 0x0, 0x1, 0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f, 0xff }; - pfnBinFunc(this->pDC, clipPa, this->workerId, attrib, primMaskMap[numEmittedPrims], _simd_set1_epi32(pPrimitiveId[inputPrim]), _simd_set1_epi32(pViewportIdx[inputPrim])); + pfnBinFunc(this->pDC, clipPa, this->workerId, attrib, primMaskMap[numEmittedPrims], _simd_set1_epi32(pPrimitiveId[inputPrim])); } #endif } while (clipPa.NextPrim()); @@ -717,7 +716,7 @@ public: } #if USE_SIMD16_FRONTEND - void ClipSimd(const simd16scalar& vPrimMask, const simd16scalar& vClipMask, PA_STATE& pa, const simd16scalari& vPrimId, const simd16scalari& vViewportIdx) + void ClipSimd(const simd16scalar& vPrimMask, const simd16scalar& vClipMask, PA_STATE& pa, const simd16scalari& vPrimId) { // input/output vertex store for clipper simd16vertex vertices[7]; // maximum 7 verts generated per triangle @@ -817,7 +816,6 @@ public: uint32_t* pVertexCount = (uint32_t*)&vNumClippedVerts; uint32_t* pPrimitiveId = (uint32_t*)&vPrimId; - uint32_t* pViewportIdx = (uint32_t*)&vViewportIdx; const simdscalari vOffsets = _simd_set_epi32( 0 * sizeof(simd16vertex), // unused lane @@ -928,7 +926,7 @@ public: static const uint32_t primMaskMap[] = { 0x0, 0x1, 0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f, 0xff, 0x1ff, 0x3ff, 0x7ff, 0xfff, 0x1fff, 0x3fff, 0x7fff, 0xffff }; clipPa.useAlternateOffset = false; - pfnBinFunc(this->pDC, clipPa, this->workerId, attrib, primMaskMap[numEmittedPrims], _simd16_set1_epi32(pPrimitiveId[inputPrim]), _simd16_set1_epi32(pViewportIdx[inputPrim])); + pfnBinFunc(this->pDC, clipPa, this->workerId, attrib, primMaskMap[numEmittedPrims], _simd16_set1_epi32(pPrimitiveId[inputPrim])); } } while (clipPa.NextPrim()); @@ -945,7 +943,7 @@ public: #endif // execute the clipper stage - void ExecuteStage(PA_STATE& pa, simdvector prim[], uint32_t primMask, simdscalari primId, simdscalari viewportIdx) + void ExecuteStage(PA_STATE& pa, simdvector prim[], uint32_t primMask, simdscalari primId) { SWR_ASSERT(this->pDC != nullptr); SWR_CONTEXT* pContext = this->pDC->pContext; @@ -973,6 +971,20 @@ public: // update clipper invocations pipeline stat uint32_t numInvoc = _mm_popcnt_u32(primMask); UPDATE_STAT_FE(CInvocations, numInvoc); + + // Read back viewport index if required + simdscalari viewportIdx = _simd_set1_epi32(0); + if (state.backendState.readViewportArrayIndex) + { + simdvector vpiAttrib[NumVertsPerPrim]; + pa.Assemble(VERTEX_SGV_SLOT, vpiAttrib); + simdscalari vpai = _simd_castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]); + + // OOB indices => forced to zero. + simdscalari vNumViewports = _simd_set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS); + simdscalari vClearMask = _simd_cmplt_epi32(vpai, vNumViewports); + viewportIdx = _simd_and_si(vClearMask, vpai); + } ComputeClipCodes(prim, viewportIdx); @@ -1001,7 +1013,7 @@ public: AR_BEGIN(FEGuardbandClip, pa.pDC->drawId); // we have to clip tris, execute the clipper, which will also // call the binner - ClipSimd(vMask(primMask), vMask(clipMask), pa, primId, viewportIdx); + ClipSimd(vMask(primMask), vMask(clipMask), pa, primId); AR_END(FEGuardbandClip, 1); } else if (validMask) @@ -1010,12 +1022,12 @@ public: UPDATE_STAT_FE(CPrimitives, _mm_popcnt_u32(validMask)); // forward valid prims directly to binner - pfnBinner(this->pDC, pa, this->workerId, prim, validMask, primId, viewportIdx); + pfnBinner(this->pDC, pa, this->workerId, prim, validMask, primId); } } #if USE_SIMD16_FRONTEND - void ExecuteStage(PA_STATE& pa, simd16vector prim[], uint32_t primMask, simd16scalari primId, simd16scalari viewportIdx) + void ExecuteStage(PA_STATE& pa, simd16vector prim[], uint32_t primMask, simd16scalari primId) { SWR_ASSERT(pa.pDC != nullptr); SWR_CONTEXT* pContext = pa.pDC->pContext; @@ -1043,6 +1055,19 @@ public: uint32_t numInvoc = _mm_popcnt_u32(primMask); UPDATE_STAT_FE(CInvocations, numInvoc); + // Read back viewport index if required + simd16scalari viewportIdx = _simd16_set1_epi32(0); + if (state.backendState.readViewportArrayIndex) + { + simd16vector vpiAttrib[NumVertsPerPrim]; + pa.Assemble_simd16(VERTEX_SGV_SLOT, vpiAttrib); + + // OOB indices => forced to zero. + simd16scalari vpai = _simd16_castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]); + simd16scalari vNumViewports = _simd16_set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS); + simd16scalari vClearMask = _simd16_cmplt_epi32(vpai, vNumViewports); + viewportIdx = _simd16_and_si(vClearMask, vpai); + } ComputeClipCodes(prim, viewportIdx); // cull prims with NAN coords @@ -1070,7 +1095,7 @@ public: AR_BEGIN(FEGuardbandClip, pa.pDC->drawId); // we have to clip tris, execute the clipper, which will also // call the binner - ClipSimd(vMask16(primMask), vMask16(clipMask), pa, primId, viewportIdx); + ClipSimd(vMask16(primMask), vMask16(clipMask), pa, primId); AR_END(FEGuardbandClip, 1); } else if (validMask) @@ -1079,7 +1104,7 @@ public: UPDATE_STAT_FE(CPrimitives, _mm_popcnt_u32(validMask)); // forward valid prims directly to binner - pfnBinner(this->pDC, pa, this->workerId, prim, validMask, primId, viewportIdx); + pfnBinner(this->pDC, pa, this->workerId, prim, validMask, primId); } } @@ -1854,12 +1879,12 @@ private: // pipeline stage functions -void ClipTriangles(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId, simdscalari viewportIdx); -void ClipLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId, simdscalari viewportIdx); -void ClipPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId, simdscalari viewportIdx); +void ClipTriangles(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId); +void ClipLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId); +void ClipPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId); #if USE_SIMD16_FRONTEND -void SIMDAPI ClipTriangles_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId, simd16scalari viewportIdx); -void SIMDAPI ClipLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId, simd16scalari viewportIdx); -void SIMDAPI ClipPoints_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId, simd16scalari viewportIdx); +void SIMDAPI ClipTriangles_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId); +void SIMDAPI ClipLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId); +void SIMDAPI ClipPoints_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId); #endif diff --git a/src/gallium/drivers/swr/rasterizer/core/context.h b/src/gallium/drivers/swr/rasterizer/core/context.h index f60ddfd..81bf9ff 100644 --- a/src/gallium/drivers/swr/rasterizer/core/context.h +++ b/src/gallium/drivers/swr/rasterizer/core/context.h @@ -214,12 +214,12 @@ struct PA_STATE; // function signature for pipeline stages that execute after primitive assembly typedef void(*PFN_PROCESS_PRIMS)(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], - uint32_t primMask, simdscalari primID, simdscalari viewportIdx); + uint32_t primMask, simdscalari primID); #if ENABLE_AVX512_SIMD16 // function signature for pipeline stages that execute after primitive assembly typedef void(SIMDAPI *PFN_PROCESS_PRIMS_SIMD16)(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], - uint32_t primMask, simd16scalari primID, simd16scalari viewportIdx); + uint32_t primMask, simd16scalari primID); #endif OSALIGNLINE(struct) API_STATE diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp index c11a35a..1cd166d 100644 --- a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp @@ -950,48 +950,11 @@ static void GeometryShaderStage( #if USE_SIMD16_FRONTEND simd16scalari vPrimId = _simd16_set1_epi32(pPrimitiveId[inputPrim]); - // use viewport array index if GS declares it as an output attribute. Otherwise use index 0. - simd16scalari vViewPortIdx; - if (state.gsState.emitsViewportArrayIndex) - { - simd16vector vpiAttrib[3]; - gsPa.Assemble_simd16(VERTEX_SGV_SLOT, vpiAttrib); - - // OOB indices => forced to zero. - simd16scalari vpai = _simd16_castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]); - simd16scalari vNumViewports = _simd16_set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS); - simd16scalari vClearMask = _simd16_cmplt_epi32(vpai, vNumViewports); - vViewPortIdx = _simd16_and_si(vClearMask, vpai); - } - else - { - vViewPortIdx = _simd16_set1_epi32(0); - } - gsPa.useAlternateOffset = false; - pfnClipFunc(pDC, gsPa, workerId, attrib_simd16, GenMask(gsPa.NumPrims()), vPrimId, vViewPortIdx); + pfnClipFunc(pDC, gsPa, workerId, attrib_simd16, GenMask(gsPa.NumPrims()), vPrimId); #else simdscalari vPrimId = _simd_set1_epi32(pPrimitiveId[inputPrim]); - - // use viewport array index if GS declares it as an output attribute. Otherwise use index 0. - simdscalari vViewPortIdx; - if (state.gsState.emitsViewportArrayIndex) - { - simdvector vpiAttrib[3]; - gsPa.Assemble(VERTEX_SGV_SLOT, vpiAttrib); - simdscalari vpai = _simd_castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]); - - // OOB indices => forced to zero. - simdscalari vNumViewports = _simd_set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS); - simdscalari vClearMask = _simd_cmplt_epi32(vpai, vNumViewports); - vViewPortIdx = _simd_and_si(vClearMask, vpai); - } - else - { - vViewPortIdx = _simd_set1_epi32(0); - } - - pfnClipFunc(pDC, gsPa, workerId, attrib, GenMask(gsPa.NumPrims()), vPrimId, vViewPortIdx); + pfnClipFunc(pDC, gsPa, workerId, attrib, GenMask(gsPa.NumPrims()), vPrimId); #endif } } @@ -1340,10 +1303,10 @@ static void TessellationStages( SWR_ASSERT(pfnClipFunc); #if USE_SIMD16_FRONTEND tessPa.useAlternateOffset = false; - pfnClipFunc(pDC, tessPa, workerId, prim_simd16, GenMask(numPrims), primID, _simd16_set1_epi32(0)); + pfnClipFunc(pDC, tessPa, workerId, prim_simd16, GenMask(numPrims), primID); #else pfnClipFunc(pDC, tessPa, workerId, prim, - GenMask(tessPa.NumPrims()), _simd_set1_epi32(dsContext.PrimitiveID), _simd_set1_epi32(0)); + GenMask(tessPa.NumPrims()), _simd_set1_epi32(dsContext.PrimitiveID)); #endif } } @@ -1702,7 +1665,7 @@ void ProcessDraw( SWR_ASSERT(pDC->pState->pfnProcessPrims_simd16); pa.useAlternateOffset = false; - pDC->pState->pfnProcessPrims_simd16(pDC, pa, workerId, prim_simd16, GenMask(numPrims), primID, _simd16_setzero_si()); + pDC->pState->pfnProcessPrims_simd16(pDC, pa, workerId, prim_simd16, GenMask(numPrims), primID); } } } @@ -1864,7 +1827,7 @@ void ProcessDraw( SWR_ASSERT(pDC->pState->pfnProcessPrims); pDC->pState->pfnProcessPrims(pDC, pa, workerId, prim, - GenMask(pa.NumPrims()), pa.GetPrimID(work.startPrimID), _simd_set1_epi32(0)); + GenMask(pa.NumPrims()), pa.GetPrimID(work.startPrimID)); } } } diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.h b/src/gallium/drivers/swr/rasterizer/core/frontend.h index 65b7f02..3c2361e 100644 --- a/src/gallium/drivers/swr/rasterizer/core/frontend.h +++ b/src/gallium/drivers/swr/rasterizer/core/frontend.h @@ -388,10 +388,10 @@ PFN_PROCESS_PRIMS_SIMD16 GetBinTrianglesFunc_simd16(bool IsConservative); #endif struct PA_STATE_BASE; // forward decl -void BinPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[3], uint32_t primMask, simdscalari primID, simdscalari viewportIdx); -void BinLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[3], uint32_t primMask, simdscalari primID, simdscalari viewportIdx); +void BinPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[3], uint32_t primMask, simdscalari primID); +void BinLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[3], uint32_t primMask, simdscalari primID); #if USE_SIMD16_FRONTEND -void SIMDAPI BinPoints_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[3], uint32_t primMask, simd16scalari primID, simd16scalari viewportIdx); -void SIMDAPI BinLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[3], uint32_t primMask, simd16scalari primID, simd16scalari viewportIdx); +void SIMDAPI BinPoints_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[3], uint32_t primMask, simd16scalari primID); +void SIMDAPI BinLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[3], uint32_t primMask, simd16scalari primID); #endif diff --git a/src/gallium/drivers/swr/rasterizer/core/state.h b/src/gallium/drivers/swr/rasterizer/core/state.h index 94a5071..2440d44 100644 --- a/src/gallium/drivers/swr/rasterizer/core/state.h +++ b/src/gallium/drivers/swr/rasterizer/core/state.h @@ -710,9 +710,6 @@ struct SWR_GS_STATE // instance count uint32_t instanceCount; - // geometry shader emits ViewportArrayIndex - bool emitsViewportArrayIndex; - // if true, geometry shader emits a single stream, with separate cut buffer. // if false, geometry shader emits vertices for multiple streams to the stream buffer, with a separate StreamID buffer // to map vertices to streams @@ -1049,6 +1046,7 @@ struct SWR_BACKEND_STATE SWR_ATTRIB_SWIZZLE swizzleMap[32]; bool readRenderTargetArrayIndex; // Forward render target array index from last FE stage to the backend + bool readViewportArrayIndex; // Read viewport array index from last FE stage during binning }; diff --git a/src/gallium/drivers/swr/swr_shader.cpp b/src/gallium/drivers/swr/swr_shader.cpp index f4029be..dfc54fa 100644 --- a/src/gallium/drivers/swr/swr_shader.cpp +++ b/src/gallium/drivers/swr/swr_shader.cpp @@ -547,8 +547,6 @@ BuilderSWR::CompileGS(struct swr_context *ctx, swr_jit_gs_key &key) pGS->maxNumVerts = info->properties[TGSI_PROPERTY_GS_MAX_OUTPUT_VERTICES]; pGS->instanceCount = info->properties[TGSI_PROPERTY_GS_INVOCATIONS]; - pGS->emitsViewportArrayIndex = info->writes_viewport_index; - // XXX: single stream for now... pGS->isSingleStream = true; pGS->singleStreamID = 0; diff --git a/src/gallium/drivers/swr/swr_state.cpp b/src/gallium/drivers/swr/swr_state.cpp index 19d961f..c87393c 100644 --- a/src/gallium/drivers/swr/swr_state.cpp +++ b/src/gallium/drivers/swr/swr_state.cpp @@ -1755,12 +1755,12 @@ swr_update_derived(struct pipe_context *pipe, (ctx->rasterizer->flatshade ? ctx->fs->flatConstantMask : 0); backendState.pointSpriteTexCoordMask = ctx->fs->pointSpriteMask; - if (ctx->gs) - backendState.readRenderTargetArrayIndex = - ctx->gs->info.base.writes_layer; - else - backendState.readRenderTargetArrayIndex = - ctx->vs->info.base.writes_layer; + struct tgsi_shader_info *pLastFE = + ctx->gs ? + &ctx->gs->info.base : + &ctx->vs->info.base; + backendState.readRenderTargetArrayIndex = pLastFE->writes_layer; + backendState.readViewportArrayIndex = pLastFE->writes_viewport_index; SwrSetBackendState(ctx->swrContext, &backendState); -- 2.7.4