swr/rasterizer: cleanups for tessellation
authorJan Zielinski <jan.zielinski@intel.com>
Wed, 24 Jul 2019 10:10:27 +0000 (12:10 +0200)
committerJan Zielinski <jan.zielinski@intel.com>
Tue, 30 Jul 2019 13:39:18 +0000 (13:39 +0000)
This commit introduces small fixes in preparation for tessellation
support.

Reviewed-by: Bruce Cherniak <bruce.cherniak@intel.com>
src/gallium/drivers/swr/rasterizer/core/frontend.cpp
src/gallium/drivers/swr/rasterizer/core/state.h

index 5eda4d7..816b84e 100644 (file)
@@ -583,8 +583,9 @@ static void StreamOut(
     {
         if (state.soBuffer[i].pWriteOffset)
         {
-            bool nullTileAccessed = false;
-            void* pWriteOffset = pDC->pContext->pfnTranslateGfxptrForWrite(GetPrivateState(pDC), soContext.pBuffer[i]->pWriteOffset, &nullTileAccessed);
+            bool  nullTileAccessed = false;
+            void* pWriteOffset     = pDC->pContext->pfnTranslateGfxptrForWrite(
+                GetPrivateState(pDC), soContext.pBuffer[i]->pWriteOffset, &nullTileAccessed);
             *((uint32_t*)pWriteOffset) = soContext.pBuffer[i]->streamOffset * sizeof(uint32_t);
         }
 
@@ -786,21 +787,20 @@ void TransposeSOAtoAOS(uint8_t* pDst, uint8_t* pSrc, uint32_t numVerts, uint32_t
         {
             auto attribGatherX = SIMD_T::mask_i32gather_ps(
                 SIMD_T::setzero_ps(), (const float*)pSrcBase, vGatherOffsets, vMask);
-            auto attribGatherY = SIMD_T::mask_i32gather_ps(
-                SIMD_T::setzero_ps(),
-                (const float*)(pSrcBase + sizeof(float)),
-                vGatherOffsets,
-                vMask);
-            auto attribGatherZ = SIMD_T::mask_i32gather_ps(
-                SIMD_T::setzero_ps(),
-                (const float*)(pSrcBase + sizeof(float) * 2),
-                vGatherOffsets,
-                vMask);
-            auto attribGatherW = SIMD_T::mask_i32gather_ps(
-                SIMD_T::setzero_ps(),
-                (const float*)(pSrcBase + sizeof(float) * 3),
-                vGatherOffsets,
-                vMask);
+            auto attribGatherY = SIMD_T::mask_i32gather_ps(SIMD_T::setzero_ps(),
+                                                           (const float*)(pSrcBase + sizeof(float)),
+                                                           vGatherOffsets,
+                                                           vMask);
+            auto attribGatherZ =
+                SIMD_T::mask_i32gather_ps(SIMD_T::setzero_ps(),
+                                          (const float*)(pSrcBase + sizeof(float) * 2),
+                                          vGatherOffsets,
+                                          vMask);
+            auto attribGatherW =
+                SIMD_T::mask_i32gather_ps(SIMD_T::setzero_ps(),
+                                          (const float*)(pSrcBase + sizeof(float) * 3),
+                                          vGatherOffsets,
+                                          vMask);
 
             SIMD_T::maskstore_ps((float*)pDstBase, viMask, attribGatherX);
             SIMD_T::maskstore_ps((float*)(pDstBase + sizeof(Float<SIMD_T>)), viMask, attribGatherY);
@@ -1235,10 +1235,12 @@ static INLINE void AllocateGsBuffers(DRAW_CONTEXT*    pDC,
 struct TessellationThreadLocalData
 {
     SWR_HS_CONTEXT hsContext;
-    ScalarPatch    patchData[KNOB_SIMD_WIDTH];
     void*          pTxCtx;
     size_t         tsCtxSize;
 
+    uint8_t*    pHSOutput;
+    size_t      hsOutputAllocSize;
+
     simdscalar* pDSOutput;
     size_t      dsOutputAllocSize;
 };
@@ -1340,9 +1342,9 @@ static void TessellationStages(DRAW_CONTEXT* pDC,
     }
 
 #endif
-    SWR_HS_CONTEXT& hsContext = gt_pTessellationThreadData->hsContext;
-    hsContext.pCPout          = gt_pTessellationThreadData->patchData;
-    hsContext.PrimitiveID     = primID;
+    SWR_HS_CONTEXT& hsContext       = gt_pTessellationThreadData->hsContext;
+    hsContext.PrimitiveID           = primID;
+    hsContext.outputSize = tsState.hsAllocationSize;
 
     uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, false);
     // Max storage for one attribute for an entire simdprimitive
@@ -1351,17 +1353,29 @@ static void TessellationStages(DRAW_CONTEXT* pDC,
     // assemble all attributes for the input primitives
     for (uint32_t slot = 0; slot < tsState.numHsInputAttribs; ++slot)
     {
-        uint32_t attribSlot = tsState.vertexAttribOffset + slot;
+        uint32_t attribSlot = tsState.srcVertexAttribOffset + slot;
         pa.Assemble(attribSlot, simdattrib);
 
         for (uint32_t i = 0; i < numVertsPerPrim; ++i)
         {
-            hsContext.vert[i].attrib[VERTEX_ATTRIB_START_SLOT + slot] = simdattrib[i];
+            hsContext.vert[i].attrib[tsState.vertexAttribOffset + slot] = simdattrib[i];
         }
     }
 
+    // Allocate HS output storage
+    uint32_t requiredAllocSize = KNOB_SIMD_WIDTH * tsState.hsAllocationSize;
+
+    if (requiredAllocSize > gt_pTessellationThreadData->hsOutputAllocSize)
+    {
+        AlignedFree(gt_pTessellationThreadData->pHSOutput);
+        gt_pTessellationThreadData->pHSOutput = (uint8_t*)AlignedMalloc(requiredAllocSize, 64);
+        gt_pTessellationThreadData->hsOutputAllocSize = requiredAllocSize;
+    }
+
+    hsContext.pCPout = (ScalarPatch*)gt_pTessellationThreadData->pHSOutput;
+
 #if defined(_DEBUG)
-    memset(hsContext.pCPout, 0x90, sizeof(ScalarPatch) * KNOB_SIMD_WIDTH);
+    //memset(hsContext.pCPout, 0x90, sizeof(ScalarPatch) * KNOB_SIMD_WIDTH);
 #endif
 
 #if USE_SIMD16_FRONTEND
@@ -1383,10 +1397,15 @@ static void TessellationStages(DRAW_CONTEXT* pDC,
 
     for (uint32_t p = 0; p < numPrims; ++p)
     {
+        ScalarPatch* pCPout = (ScalarPatch*)(gt_pTessellationThreadData->pHSOutput + tsState.hsAllocationSize * p);
+
+        SWR_TESSELLATION_FACTORS tessFactors;
+        tessFactors                    = hsContext.pCPout[p].tessFactors;
+
         // Run Tessellator
         SWR_TS_TESSELLATED_DATA tsData = {0};
         RDTSC_BEGIN(pDC->pContext->pBucketMgr, FETessellation, pDC->drawId);
-        TSTessellate(tsCtx, hsContext.pCPout[p].tessFactors, tsData);
+        TSTessellate(tsCtx, tessFactors, tsData);
         AR_EVENT(TessPrimCount(1));
         RDTSC_END(pDC->pContext->pBucketMgr, FETessellation, 0);
 
@@ -1423,7 +1442,7 @@ static void TessellationStages(DRAW_CONTEXT* pDC,
         // Run Domain Shader
         SWR_DS_CONTEXT dsContext;
         dsContext.PrimitiveID           = pPrimId[p];
-        dsContext.pCpIn                 = &hsContext.pCPout[p];
+        dsContext.pCpIn                 = pCPout;
         dsContext.pDomainU              = (simdscalar*)tsData.pDomainPointsU;
         dsContext.pDomainV              = (simdscalar*)tsData.pDomainPointsV;
         dsContext.pOutputData           = gt_pTessellationThreadData->pDSOutput;
index b6734e2..8b24c43 100644 (file)
@@ -169,8 +169,8 @@ enum SWR_INNER_TESSFACTOR_ID
 enum SWR_OUTER_TESSFACTOR_ID
 {
     SWR_QUAD_U_EQ0_TRI_U_LINE_DETAIL,
-    SWR_QUAD_V_EQ0_TRI_V_LINE_DENSITY,
-    SWR_QUAD_U_EQ1_TRI_W,
+    SWR_QUAD_U_EQ1_TRI_V_LINE_DENSITY,
+    SWR_QUAD_V_EQ0_TRI_W,
     SWR_QUAD_V_EQ1,
 
     SWR_NUM_OUTER_TESS_FACTORS,
@@ -281,8 +281,11 @@ struct SWR_TESSELLATION_FACTORS
 {
     float OuterTessFactors[SWR_NUM_OUTER_TESS_FACTORS];
     float InnerTessFactors[SWR_NUM_INNER_TESS_FACTORS];
+    float pad[2];
 };
 
+SWR_STATIC_ASSERT(sizeof(SWR_TESSELLATION_FACTORS) == 32);
+
 #define MAX_NUM_VERTS_PER_PRIM 32 // support up to 32 control point patches
 struct ScalarPatch
 {
@@ -300,6 +303,7 @@ struct SWR_HS_CONTEXT
     simdvertex       vert[MAX_NUM_VERTS_PER_PRIM]; // IN: (SIMD) input primitive data
     simdscalari      PrimitiveID;                  // IN: (SIMD) primitive ID generated from the draw call
     simdscalari      mask;                         // IN: Active mask for shader
+    uint32_t         outputSize;                   // IN: Size of HS output (per lane)
     ScalarPatch*     pCPout;                       // OUT: Output control point patch SIMD-sized-array of SCALAR patches
     SWR_SHADER_STATS stats;                        // OUT: shader statistics used for archrast.
 };
@@ -818,11 +822,16 @@ struct SWR_TS_STATE
 
     uint32_t numHsInputAttribs;
     uint32_t numHsOutputAttribs;
+    uint32_t hsAllocationSize; // Size of HS output in bytes, per lane
+
     uint32_t numDsOutputAttribs;
     uint32_t dsAllocationSize;
     uint32_t dsOutVtxAttribOffset;
 
     // Offset to the start of the attributes of the input vertices, in simdvector units
+    uint32_t srcVertexAttribOffset;
+
+    // Offset to the start of the attributes expected by the hull shader
     uint32_t vertexAttribOffset;
 };