swr/rast: simdlib cleanup, clipper stack space fixes

author Alok Hota <alok.hota@intel.com>

Thu, 13 Sep 2018 21:12:12 +0000 (16:12 -0500)

committer Alok Hota <alok.hota@intel.com>

Fri, 15 Feb 2019 20:54:23 +0000 (14:54 -0600)
author Alok Hota <alok.hota@intel.com>
Thu, 13 Sep 2018 21:12:12 +0000 (16:12 -0500)
committer Alok Hota <alok.hota@intel.com>
Fri, 15 Feb 2019 20:54:23 +0000 (14:54 -0600)
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx.inl

index 0c5795c..9d190bc 100644 (file)
--- a/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx.inl
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx.inl
@@ -415,7 +415,7 @@ SIMD_WRAPPER_2(unpacklo_ps);
  //-----------------------------------------------------------------------
  // Load / store operations
  //-----------------------------------------------------------------------
-template <ScaleFactor ScaleT>
+template <ScaleFactor ScaleT = ScaleFactor::SF_1>
  static SIMDINLINE Float SIMDCALL
                          i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
  {
@@ -462,7 +462,7 @@ static SIMDINLINE Integer SIMDCALL
  }
  
  // for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
-template <ScaleFactor ScaleT>
+template <ScaleFactor ScaleT = ScaleFactor::SF_1>
  static SIMDINLINE Float SIMDCALL
                          mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask)
  {
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx2.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx2.inl

index 35f9175..0da66eb 100644 (file)
--- a/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx2.inl
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx2.inl
@@ -48,7 +48,7 @@ static SIMDINLINE Integer SIMDCALL srlv_epi32(Integer vA, Integer vB) // return
      return _mm_srlv_epi32(vA, vB);
  }
  
-template <ScaleFactor ScaleT>
+template <ScaleFactor ScaleT = ScaleFactor::SF_1>
  static SIMDINLINE Float SIMDCALL
                          i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
  {
@@ -56,7 +56,7 @@ static SIMDINLINE Float SIMDCALL
  }
  
  // for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
-template <ScaleFactor ScaleT>
+template <ScaleFactor ScaleT = ScaleFactor::SF_1>
  static SIMDINLINE Float SIMDCALL
                          mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask)
  {
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx512.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx512.inl

index 2ce3caa..b076daa 100644 (file)
--- a/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx512.inl
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx512.inl
@@ -276,7 +276,7 @@ static SIMDINLINE Integer SIMDCALL
      return __conv(_mm512_maskz_loadu_epi32(__mmask16(0xf), p));
  }
  
-template <ScaleFactor ScaleT>
+template <ScaleFactor ScaleT = ScaleFactor::SF_1>
  static SIMDINLINE Float SIMDCALL
                          i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
  {
@@ -285,7 +285,7 @@ static SIMDINLINE Float SIMDCALL
  }
  
  // for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
-template <ScaleFactor ScaleT>
+template <ScaleFactor ScaleT = ScaleFactor::SF_1>
  static SIMDINLINE Float SIMDCALL
                          mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask)
  {
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx.inl

index 4ac0f95..232f43f 100644 (file)
--- a/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx.inl
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx.inl
@@ -574,7 +574,7 @@ SIMD_WRAPPER_2(unpacklo_ps);
  //-----------------------------------------------------------------------
  // Load / store operations
  //-----------------------------------------------------------------------
-template <ScaleFactor ScaleT>
+template <ScaleFactor ScaleT = ScaleFactor::SF_1>
  static SIMDINLINE Float SIMDCALL
                          i32gather_ps(float const* p, Integer const& idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
  {
@@ -591,6 +591,13 @@ static SIMDINLINE Float SIMDCALL
      return vResult;
  }
  
+template <ScaleFactor ScaleT = ScaleFactor::SF_1>
+static SIMDINLINE Float SIMDCALL
+sw_i32gather_ps(float const* p, Integer const& idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
+{
+    return i32gather_ps<ScaleT>(p, idx);
+}
+
  static SIMDINLINE Float SIMDCALL
                          load1_ps(float const* p) // return *p    (broadcast 1 value to all elements)
  {
@@ -621,7 +628,7 @@ static SIMDINLINE Integer SIMDCALL
  }
  
  // for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
-template <ScaleFactor ScaleT>
+template <ScaleFactor ScaleT = ScaleFactor::SF_1>
  static SIMDINLINE Float SIMDCALL
                          mask_i32gather_ps(Float const& old, float const* p, Integer const& idx, Float const& mask)
  {
@@ -641,6 +648,13 @@ static SIMDINLINE Float SIMDCALL
      return vResult;
  }
  
+template <ScaleFactor ScaleT = ScaleFactor::SF_1>
+static SIMDINLINE Float SIMDCALL
+sw_mask_i32gather_ps(Float const& old, float const* p, Integer const& idx, Float const& mask)
+{
+    return mask_i32gather_ps<ScaleT>(old, p, idx, mask);
+}
+
  static SIMDINLINE void SIMDCALL maskstore_ps(float* p, Integer const& mask, Float const& src)
  {
      _mm256_maskstore_ps(p, mask, src);
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx2.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx2.inl

index 59a61cf..49650d5 100644 (file)
--- a/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx2.inl
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx2.inl
@@ -206,7 +206,7 @@ SIMD_IWRAPPER_2(unpacklo_epi8);
  //-----------------------------------------------------------------------
  // Load / store operations
  //-----------------------------------------------------------------------
-template <ScaleFactor ScaleT>
+template <ScaleFactor ScaleT = ScaleFactor::SF_1>
  static SIMDINLINE Float SIMDCALL
                          i32gather_ps(float const* p, Integer const& idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
  {
@@ -214,7 +214,7 @@ static SIMDINLINE Float SIMDCALL
  }
  
  // for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
-template <ScaleFactor ScaleT>
+template <ScaleFactor ScaleT = ScaleFactor::SF_1>
  static SIMDINLINE Float SIMDCALL
                          mask_i32gather_ps(Float const& old, float const* p, Integer const& idx, Float const& mask)
  {
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx512.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx512.inl

index 7906098..4c883b1 100644 (file)
--- a/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx512.inl
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx512.inl
@@ -277,7 +277,7 @@ static SIMDINLINE Integer SIMDCALL
      return __conv(_mm512_maskz_loadu_epi32(__mmask16(0xff), p));
  }
  
-template <ScaleFactor ScaleT>
+template <ScaleFactor ScaleT = ScaleFactor::SF_1>
  static SIMDINLINE Float SIMDCALL
                          i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
  {
@@ -286,7 +286,7 @@ static SIMDINLINE Float SIMDCALL
  }
  
  // for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
-template <ScaleFactor ScaleT>
+template <ScaleFactor ScaleT = ScaleFactor::SF_1>
  static SIMDINLINE Float SIMDCALL
                          mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask)
  {
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512.inl

index e9e908a..5053275 100644 (file)
--- a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512.inl
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512.inl
@@ -492,7 +492,7 @@ SIMD_WRAPPER_2(unpacklo_ps);
  //-----------------------------------------------------------------------
  // Load / store operations
  //-----------------------------------------------------------------------
-template <ScaleFactor ScaleT>
+template <ScaleFactor ScaleT = ScaleFactor::SF_1>
  static SIMDINLINE Float SIMDCALL
                          i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
  {
@@ -529,11 +529,11 @@ static SIMDINLINE Integer SIMDCALL
  }
  
  // for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
-template <ScaleFactor ScaleT>
+template <ScaleFactor ScaleT = ScaleFactor::SF_1>
  static SIMDINLINE Float SIMDCALL
                          mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask)
  {
-    __mmask16 k = _mm512_cmpneq_ps_mask(mask, setzero_ps());
+    __mmask16 k = _mm512_test_epi32_mask(castps_si(mask), set1_epi32(0x80000000));
  
      return _mm512_mask_i32gather_ps(old, k, idx, p, static_cast<int>(ScaleT));
  }
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_emu.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_emu.inl

index 91705f2..f25d834 100644 (file)
--- a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_emu.inl
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_emu.inl
@@ -631,7 +631,7 @@ SIMD_WRAPPER_2(unpacklo_ps);
  //-----------------------------------------------------------------------
  // Load / store operations
  //-----------------------------------------------------------------------
-template <ScaleFactor ScaleT>
+template <ScaleFactor ScaleT = ScaleFactor::SF_1>
  static SIMDINLINE Float SIMDCALL
                          i32gather_ps(float const* p, Integer const& idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
  {
@@ -641,6 +641,16 @@ static SIMDINLINE Float SIMDCALL
      };
  }
  
+template <ScaleFactor ScaleT = ScaleFactor::SF_1>
+static SIMDINLINE Float SIMDCALL
+                        sw_i32gather_ps(float const* p, Integer const& idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
+{
+    return Float{
+        SIMD256T::template sw_i32gather_ps<ScaleT>(p, idx.v8[0]),
+        SIMD256T::template sw_i32gather_ps<ScaleT>(p, idx.v8[1]),
+    };
+}
+
  static SIMDINLINE Float SIMDCALL
                          load1_ps(float const* p) // return *p    (broadcast 1 value to all elements)
  {
@@ -677,7 +687,7 @@ static SIMDINLINE Integer SIMDCALL
  }
  
  // for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
-template <ScaleFactor ScaleT>
+template <ScaleFactor ScaleT = ScaleFactor::SF_1>
  static SIMDINLINE Float SIMDCALL
                          mask_i32gather_ps(Float const& old, float const* p, Integer const& idx, Float const& mask)
  {
@@ -687,6 +697,16 @@ static SIMDINLINE Float SIMDCALL
      };
  }
  
+template <ScaleFactor ScaleT = ScaleFactor::SF_1>
+static SIMDINLINE Float SIMDCALL
+                        sw_mask_i32gather_ps(Float const& old, float const* p, Integer const& idx, Float const& mask)
+{
+    return Float{
+        SIMD256T::template sw_mask_i32gather_ps<ScaleT>(old.v8[0], p, idx.v8[0], mask.v8[0]),
+        SIMD256T::template sw_mask_i32gather_ps<ScaleT>(old.v8[1], p, idx.v8[1], mask.v8[1]),
+    };
+}
+
  static SIMDINLINE void SIMDCALL maskstore_ps(float* p, Integer const& mask, Float const& src)
  {
      SIMD256T::maskstore_ps(p, mask.v8[0], src.v8[0]);
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_interface.hpp b/src/gallium/drivers/swr/rasterizer/common/simdlib_interface.hpp

index 7902bcb..85c722c 100644 (file)
--- a/src/gallium/drivers/swr/rasterizer/common/simdlib_interface.hpp
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_interface.hpp
@@ -273,7 +273,7 @@ struct SIMD256 // or SIMD4 or SIMD16
          SF_8,   // Scale offset by 8
      };
  
-    template<ScaleFactor ScaleT>
+    template<ScaleFactor ScaleT = ScaleFactor::SF_1>
      static Float    i32gather_ps(float const* p, Integer idx);  // return *(float*)(((int8*)p) + (idx * ScaleT))
      static Float    load1_ps(float const *p);                   // return *p    (broadcast 1 value to all elements)
      static Float    load_ps(float const *p);                    // return *p    (loads SIMD width elements from memory)
diff --git a/src/gallium/drivers/swr/rasterizer/core/clip.cpp b/src/gallium/drivers/swr/rasterizer/core/clip.cpp

index 8c53fca..87be5bc 100644 (file)
--- a/src/gallium/drivers/swr/rasterizer/core/clip.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/clip.cpp
@@ -31,12 +31,6 @@
  #include "common/os.h"
  #include "core/clip.h"
  
-// Temp storage used by the clipper
-THREAD SIMDVERTEX_T<SIMD256> tlsTempVertices[7];
-#if USE_SIMD16_FRONTEND
-THREAD SIMDVERTEX_T<SIMD512> tlsTempVertices_simd16[7];
-#endif
-
  float ComputeInterpFactor(float boundaryCoord0, float boundaryCoord1)
  {
      return (boundaryCoord0 / (boundaryCoord0 - boundaryCoord1));
diff --git a/src/gallium/drivers/swr/rasterizer/core/clip.h b/src/gallium/drivers/swr/rasterizer/core/clip.h

index 7b4ed58..33c1653 100644 (file)
--- a/src/gallium/drivers/swr/rasterizer/core/clip.h
+++ b/src/gallium/drivers/swr/rasterizer/core/clip.h
@@ -32,12 +32,6 @@
  #include "core/pa.h"
  #include "rdtsc_core.h"
  
-// Temp storage used by the clipper
-extern THREAD SIMDVERTEX_T<SIMD256> tlsTempVertices[7];
-#if USE_SIMD16_FRONTEND
-extern THREAD SIMDVERTEX_T<SIMD512> tlsTempVertices_simd16[7];
-#endif
-
  enum SWR_CLIPCODES
  {
  // Shift clip codes out of the mantissa to prevent denormalized values when used in float compare.
@@ -314,41 +308,36 @@ struct SimdHelper<SIMD512>
          return SIMD512::cmp_ps_mask<SIMD16::CompareType::EQ_OQ>(a, b);
      }
  };
-
  #endif
-// Temp storage used by the clipper
-template <typename SIMD_T>
-struct ClipHelper
-{
-};
-
-template <>
-struct ClipHelper<SIMD256>
-{
-    static SIMDVERTEX_T<SIMD256>* GetTempVertices() { return tlsTempVertices; }
-};
-
-#if USE_SIMD16_FRONTEND
-template <>
-struct ClipHelper<SIMD512>
-{
-    static SIMDVERTEX_T<SIMD512>* GetTempVertices() { return tlsTempVertices_simd16; }
-};
  
-#endif
-template <typename SIMD_T, uint32_t NumVertsPerPrim>
+template <typename SIMD_T, uint32_t NumVertsPerPrimT>
  class Clipper
  {
  public:
      INLINE Clipper(uint32_t in_workerId, DRAW_CONTEXT* in_pDC) :
          workerId(in_workerId), pDC(in_pDC), state(GetApiState(in_pDC))
      {
-        static_assert(NumVertsPerPrim >= 1 && NumVertsPerPrim <= 3, "Invalid NumVertsPerPrim");
+        static_assert(NumVertsPerPrimT >= 1 && NumVertsPerPrimT <= 3, "Invalid NumVertsPerPrim");
+        THREAD_DATA &thread_data = in_pDC->pContext->threadPool.pThreadData[workerId];
+
+        if (thread_data.clipperData == nullptr)
+        {
+            // 7 vertex temp data
+            // 7 post-clipped vertices
+            // 2 transposed verts for binning
+            size_t alloc_size = sizeof(SIMDVERTEX_T<SIMD_T>) * (7 + 7 + 2);
+            thread_data.clipperData = AlignedMalloc(alloc_size, KNOB_SIMD16_BYTES);
+        }
+        SWR_ASSERT(thread_data.clipperData);
+
+        this->clippedVerts = (SIMDVERTEX_T<SIMD_T>*)thread_data.clipperData;
+        this->tmpVerts = this->clippedVerts + 7;
+        this->transposedVerts = this->tmpVerts + 7;
      }
  
      void ComputeClipCodes(Vec4<SIMD_T> vertex[], const Integer<SIMD_T>& viewportIndexes)
      {
-        for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
+        for (uint32_t i = 0; i < NumVertsPerPrimT; ++i)
          {
              ::ComputeClipCodes<SIMD_T>(state, vertex[i], clipCodes[i], viewportIndexes);
          }
@@ -358,7 +347,7 @@ public:
      {
          Float<SIMD_T> result = clipCodes[0];
  
-        for (uint32_t i = 1; i < NumVertsPerPrim; ++i)
+        for (uint32_t i = 1; i < NumVertsPerPrimT; ++i)
          {
              result = SIMD_T::and_ps(result, clipCodes[i]);
          }
@@ -370,7 +359,7 @@ public:
      {
          Float<SIMD_T> result = clipCodes[0];
  
-        for (uint32_t i = 1; i < NumVertsPerPrim; ++i)
+        for (uint32_t i = 1; i < NumVertsPerPrimT; ++i)
          {
              result = SIMD_T::or_ps(result, clipCodes[i]);
          }
@@ -393,7 +382,7 @@ public:
      {
          Float<SIMD_T> vNanMask = SIMD_T::setzero_ps();
  
-        for (uint32_t e = 0; e < NumVertsPerPrim; ++e)
+        for (uint32_t e = 0; e < NumVertsPerPrimT; ++e)
          {
              Float<SIMD_T> vNan01 =
                  SIMD_T::template cmp_ps<SIMD_T::CompareType::UNORD_Q>(prim[e].v[0], prim[e].v[1]);
@@ -428,7 +417,7 @@ public:
              uint32_t component = index & 0x3;
  
              Float<SIMD_T> vCullMaskElem = SIMD_T::set1_ps(-1.0f);
-            for (uint32_t e = 0; e < NumVertsPerPrim; ++e)
+            for (uint32_t e = 0; e < NumVertsPerPrimT; ++e)
              {
                  Float<SIMD_T> vCullComp;
                  if (slot == 0)
@@ -457,7 +446,7 @@ public:
              uint32_t component = index & 0x3;
  
              Float<SIMD_T> vCullMaskElem = SIMD_T::set1_ps(-1.0f);
-            for (uint32_t e = 0; e < NumVertsPerPrim; ++e)
+            for (uint32_t e = 0; e < NumVertsPerPrimT; ++e)
              {
                  Float<SIMD_T> vClipComp;
                  if (slot == 0)
@@ -491,7 +480,7 @@ public:
                    const Integer<SIMD_T>& vRtIdx)
      {
          // input/output vertex store for clipper
-        SIMDVERTEX_T<SIMD_T> vertices[7]; // maximum 7 verts generated per triangle
+        SIMDVERTEX_T<SIMD_T>* vertices = this->clippedVerts;
  
          uint32_t constantInterpMask = state.backendState.constantInterpolationMask;
          uint32_t provokingVertex    = 0;
@@ -502,8 +491,8 @@ public:
          ///@todo: line topology for wireframe?
  
          // assemble pos
-        Vec4<SIMD_T> tmpVector[NumVertsPerPrim];
-        for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
+        Vec4<SIMD_T> tmpVector[NumVertsPerPrimT];
+        for (uint32_t i = 0; i < NumVertsPerPrimT; ++i)
          {
              vertices[i].attrib[VERTEX_POSITION_SLOT] = prim[i];
          }
@@ -526,14 +515,14 @@ public:
              // vertex values to all edges
              if (CheckBit(constantInterpMask, slot))
              {
-                for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
+                for (uint32_t i = 0; i < NumVertsPerPrimT; ++i)
                  {
                      vertices[i].attrib[inputSlot] = tmpVector[provokingVertex];
                  }
              }
              else
              {
-                for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
+                for (uint32_t i = 0; i < NumVertsPerPrimT; ++i)
                  {
                      vertices[i].attrib[inputSlot] = tmpVector[i];
                  }
@@ -545,7 +534,7 @@ public:
          if (state.backendState.clipDistanceMask & 0xf)
          {
              pa.Assemble(vertexClipCullSlot, tmpVector);
-            for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
+            for (uint32_t i = 0; i < NumVertsPerPrimT; ++i)
              {
                  vertices[i].attrib[vertexClipCullSlot] = tmpVector[i];
              }
@@ -554,7 +543,7 @@ public:
          if (state.backendState.clipDistanceMask & 0xf0)
          {
              pa.Assemble(vertexClipCullSlot + 1, tmpVector);
-            for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
+            for (uint32_t i = 0; i < NumVertsPerPrimT; ++i)
              {
                  vertices[i].attrib[vertexClipCullSlot + 1] = tmpVector[i];
              }
@@ -565,12 +554,12 @@ public:
          Integer<SIMD_T> vNumClippedVerts =
              ClipPrims((float*)&vertices[0], vPrimMask, vClipMask, numAttribs);
  
-        BinnerChooser<SIMD_T> binner(NumVertsPerPrim,
+        BinnerChooser<SIMD_T> binner(NumVertsPerPrimT,
                                       pa.pDC->pState->state.rastState.conservativeRast);
  
          // set up new PA for binning clipped primitives
          PRIMITIVE_TOPOLOGY clipTopology = TOP_UNKNOWN;
-        if (NumVertsPerPrim == 3)
+        if (NumVertsPerPrimT == 3)
          {
              clipTopology = TOP_TRIANGLE_FAN;
  
@@ -584,7 +573,7 @@ public:
                  clipTopology = TOP_RECT_LIST;
              }
          }
-        else if (NumVertsPerPrim == 2)
+        else if (NumVertsPerPrimT == 2)
          {
              clipTopology = TOP_LINE_LIST;
          }
@@ -614,25 +603,16 @@ public:
  
          uint32_t numClippedPrims = 0;
  
-        // tranpose clipper output so that each lane's vertices are in SIMD order
+        // transpose clipper output so that each lane's vertices are in SIMD order
          // set aside space for 2 vertices, as the PA will try to read up to 16 verts
          // for triangle fan
+        SIMDVERTEX_T<SIMD_T>*  transposedPrims = this->transposedVerts;
  
-#if defined(_DEBUG)
-        // TODO: need to increase stack size, allocating SIMD16-widened transposedPrims causes stack
-        // overflow in debug builds
-        SIMDVERTEX_T<SIMD_T>* transposedPrims = reinterpret_cast<SIMDVERTEX_T<SIMD_T>*>(
-            AlignedMalloc(sizeof(SIMDVERTEX_T<SIMD_T>) * 2, 64));
-
-#else
-        SIMDVERTEX_T<SIMD_T>  transposedPrims[2];
-
-#endif
          uint32_t              numInputPrims = pa.NumPrims();
          for (uint32_t inputPrim = 0; inputPrim < numInputPrims; ++inputPrim)
          {
              uint32_t numEmittedVerts = pVertexCount[inputPrim];
-            if (numEmittedVerts < NumVertsPerPrim)
+            if (numEmittedVerts < NumVertsPerPrimT)
              {
                  continue;
              }
@@ -648,27 +628,23 @@ public:
              // for triangle fan
  
              // transpose pos
-            uint8_t* pBase = reinterpret_cast<uint8_t*>(&vertices[0].attrib[VERTEX_POSITION_SLOT]) +
-                             sizeof(float) * inputPrim;
-
-#if 0
-            // TEMPORARY WORKAROUND for bizarre VS2015 code-gen bug
-            static const float *dummy = reinterpret_cast<const float *>(pBase);
+            float const* pBase =
+                reinterpret_cast<float const*>(&vertices[0].attrib[VERTEX_POSITION_SLOT]) +
+                inputPrim;
  
-#endif
              for (uint32_t c = 0; c < 4; ++c)
              {
-                SIMD256::Float temp = SIMD256::template mask_i32gather_ps<ScaleFactor<SIMD_T>(1)>(
-                    SIMD256::setzero_ps(), reinterpret_cast<const float*>(pBase), vOffsets, vMask);
+                SIMD256::Float temp =
+                    SIMD256::mask_i32gather_ps(SIMD256::setzero_ps(), pBase, vOffsets, vMask);
                  transposedPrims[0].attrib[VERTEX_POSITION_SLOT][c] =
                      SimdHelper<SIMD_T>::insert_lo_ps(temp);
-                pBase += sizeof(Float<SIMD_T>);
+                pBase = PtrAdd(pBase, sizeof(Float<SIMD_T>));
              }
  
              // transpose attribs
-            pBase =
-                reinterpret_cast<uint8_t*>(&vertices[0].attrib[backendState.vertexAttribOffset]) +
-                sizeof(float) * inputPrim;
+            pBase = reinterpret_cast<float const*>(
+                        &vertices[0].attrib[backendState.vertexAttribOffset]) +
+                    inputPrim;
  
              for (uint32_t attrib = 0; attrib < numAttribs; ++attrib)
              {
@@ -677,14 +653,10 @@ public:
                  for (uint32_t c = 0; c < 4; ++c)
                  {
                      SIMD256::Float temp =
-                        SIMD256::template mask_i32gather_ps<ScaleFactor<SIMD_T>(1)>(
-                            SIMD256::setzero_ps(),
-                            reinterpret_cast<const float*>(pBase),
-                            vOffsets,
-                            vMask);
+                        SIMD256::mask_i32gather_ps(SIMD256::setzero_ps(), pBase, vOffsets, vMask);
                      transposedPrims[0].attrib[attribSlot][c] =
                          SimdHelper<SIMD_T>::insert_lo_ps(temp);
-                    pBase += sizeof(Float<SIMD_T>);
+                    pBase = PtrAdd(pBase, sizeof(Float<SIMD_T>));
                  }
              }
  
@@ -692,39 +664,32 @@ public:
              uint32_t vertexClipCullSlot = backendState.vertexClipCullOffset;
              if (state.backendState.clipDistanceMask & 0x0f)
              {
-                pBase = reinterpret_cast<uint8_t*>(&vertices[0].attrib[vertexClipCullSlot]) +
-                        sizeof(float) * inputPrim;
+                pBase = reinterpret_cast<float const*>(&vertices[0].attrib[vertexClipCullSlot]) +
+                        inputPrim;
  
                  for (uint32_t c = 0; c < 4; ++c)
                  {
                      SIMD256::Float temp =
-                        SIMD256::template mask_i32gather_ps<ScaleFactor<SIMD_T>(1)>(
-                            SIMD256::setzero_ps(),
-                            reinterpret_cast<const float*>(pBase),
-                            vOffsets,
-                            vMask);
+                        SIMD256::mask_i32gather_ps(SIMD256::setzero_ps(), pBase, vOffsets, vMask);
                      transposedPrims[0].attrib[vertexClipCullSlot][c] =
                          SimdHelper<SIMD_T>::insert_lo_ps(temp);
-                    pBase += sizeof(Float<SIMD_T>);
+                    pBase = PtrAdd(pBase, sizeof(Float<SIMD_T>));
                  }
              }
  
              if (state.backendState.clipDistanceMask & 0xf0)
              {
-                pBase = reinterpret_cast<uint8_t*>(&vertices[0].attrib[vertexClipCullSlot + 1]) +
-                        sizeof(float) * inputPrim;
+                pBase =
+                    reinterpret_cast<float const*>(&vertices[0].attrib[vertexClipCullSlot + 1]) +
+                    inputPrim;
  
                  for (uint32_t c = 0; c < 4; ++c)
                  {
                      SIMD256::Float temp =
-                        SIMD256::template mask_i32gather_ps<ScaleFactor<SIMD_T>(1)>(
-                            SIMD256::setzero_ps(),
-                            reinterpret_cast<const float*>(pBase),
-                            vOffsets,
-                            vMask);
+                        SIMD256::mask_i32gather_ps(SIMD256::setzero_ps(), pBase, vOffsets, vMask);
                      transposedPrims[0].attrib[vertexClipCullSlot + 1][c] =
                          SimdHelper<SIMD_T>::insert_lo_ps(temp);
-                    pBase += sizeof(Float<SIMD_T>);
+                    pBase = PtrAdd(pBase, sizeof(Float<SIMD_T>));
                  }
              }
  
@@ -734,7 +699,7 @@ public:
                                  numEmittedVerts,
                                  SWR_VTX_NUM_SLOTS,
                                  true,
-                                NumVertsPerPrim,
+                                NumVertsPerPrimT,
                                  clipTopology);
              clipPA.viewportArrayActive = pa.viewportArrayActive;
              clipPA.rtArrayActive       = pa.rtArrayActive;
@@ -751,7 +716,7 @@ public:
              {
                  do
                  {
-                    Vec4<SIMD_T> attrib[NumVertsPerPrim];
+                    Vec4<SIMD_T> attrib[NumVertsPerPrimT];
  
                      bool assemble = clipPA.Assemble(VERTEX_POSITION_SLOT, attrib);
  
@@ -765,10 +730,6 @@ public:
              }
          }
  
-#if defined(_DEBUG)
-        AlignedFree(transposedPrims);
-
-#endif
          // update global pipeline stat
          UPDATE_STAT_FE(CPrimitives, numClippedPrims);
      }
@@ -811,7 +772,7 @@ public:
  
          // skip clipping for points
          uint32_t clipMask = 0;
-        if (NumVertsPerPrim != 1)
+        if (NumVertsPerPrimT != 1)
          {
              clipMask = validMask & ComputeClipMask();
          }
@@ -905,8 +866,7 @@ private:
          Integer<SIMD_T> vOffsets = ComputeOffsets(attrib, vIndices, component);
          Float<SIMD_T>   vSrc     = SIMD_T::setzero_ps();
  
-        return SIMD_T::template mask_i32gather_ps<ScaleFactor<SIMD_T>(1)>(
-            vSrc, pBuffer, vOffsets, vMask);
+        return SIMD_T::mask_i32gather_ps(vSrc, pBuffer, vOffsets, vMask);
      }
  
      void ScatterComponent(const float*           pBuffer,
@@ -1278,15 +1238,15 @@ private:
                                int                  numAttribs)
      {
          // temp storage
-        float* pTempVerts = reinterpret_cast<float*>(ClipHelper<SIMD_T>::GetTempVertices());
+        float* pTempVerts = reinterpret_cast<float*>(this->tmpVerts);
  
          // zero out num input verts for non-active lanes
-        Integer<SIMD_T> vNumInPts = SIMD_T::set1_epi32(NumVertsPerPrim);
+        Integer<SIMD_T> vNumInPts = SIMD_T::set1_epi32(NumVertsPerPrimT);
          vNumInPts = SIMD_T::blendv_epi32(SIMD_T::setzero_si(), vNumInPts, vClipMask);
  
          // clip prims to frustum
          Integer<SIMD_T> vNumOutPts;
-        if (NumVertsPerPrim == 3)
+        if (NumVertsPerPrimT == 3)
          {
              vNumOutPts = ClipTriToPlane<FRUSTUM_NEAR>(pVertices, vNumInPts, numAttribs, pTempVerts);
              vNumOutPts = ClipTriToPlane<FRUSTUM_FAR>(pTempVerts, vNumOutPts, numAttribs, pVertices);
@@ -1300,7 +1260,7 @@ private:
          }
          else
          {
-            SWR_ASSERT(NumVertsPerPrim == 2);
+            SWR_ASSERT(NumVertsPerPrimT == 2);
              vNumOutPts =
                  ClipLineToPlane<FRUSTUM_NEAR>(pVertices, vNumInPts, numAttribs, pTempVerts);
              vNumOutPts =
@@ -1318,7 +1278,7 @@ private:
          // restore num verts for non-clipped, active lanes
          Float<SIMD_T> vNonClippedMask = SIMD_T::andnot_ps(vClipMask, vPrimMask);
          vNumOutPts =
-            SIMD_T::blendv_epi32(vNumOutPts, SIMD_T::set1_epi32(NumVertsPerPrim), vNonClippedMask);
+            SIMD_T::blendv_epi32(vNumOutPts, SIMD_T::set1_epi32(NumVertsPerPrimT), vNonClippedMask);
  
          return vNumOutPts;
      }
@@ -1326,7 +1286,10 @@ private:
      const uint32_t   workerId{0};
      DRAW_CONTEXT*    pDC{nullptr};
      const API_STATE& state;
-    Float<SIMD_T>    clipCodes[NumVertsPerPrim];
+    Float<SIMD_T>    clipCodes[NumVertsPerPrimT];
+    SIMDVERTEX_T<SIMD_T>* clippedVerts;
+    SIMDVERTEX_T<SIMD_T>* tmpVerts;
+    SIMDVERTEX_T<SIMD_T>* transposedVerts;
  };
  
  // pipeline stage functions
diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp

index 24db527..6ba6784 100644 (file)
--- a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
@@ -782,19 +782,19 @@ void TransposeSOAtoAOS(uint8_t* pDst, uint8_t* pSrc, uint32_t numVerts, uint32_t
  
          for (uint32_t a = 0; a < numAttribs; ++a)
          {
-            auto attribGatherX = SIMD_T::template mask_i32gather_ps<ScaleFactor<SIMD_T>(1)>(
+            auto attribGatherX = SIMD_T::mask_i32gather_ps(
                  SIMD_T::setzero_ps(), (const float*)pSrcBase, vGatherOffsets, vMask);
-            auto attribGatherY = SIMD_T::template mask_i32gather_ps<ScaleFactor<SIMD_T>(1)>(
+            auto attribGatherY = SIMD_T::mask_i32gather_ps(
                  SIMD_T::setzero_ps(),
                  (const float*)(pSrcBase + sizeof(float)),
                  vGatherOffsets,
                  vMask);
-            auto attribGatherZ = SIMD_T::template mask_i32gather_ps<ScaleFactor<SIMD_T>(1)>(
+            auto attribGatherZ = SIMD_T::mask_i32gather_ps(
                  SIMD_T::setzero_ps(),
                  (const float*)(pSrcBase + sizeof(float) * 2),
                  vGatherOffsets,
                  vMask);
-            auto attribGatherW = SIMD_T::template mask_i32gather_ps<ScaleFactor<SIMD_T>(1)>(
+            auto attribGatherW = SIMD_T::mask_i32gather_ps(
                  SIMD_T::setzero_ps(),
                  (const float*)(pSrcBase + sizeof(float) * 3),
                  vGatherOffsets,
diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.h b/src/gallium/drivers/swr/rasterizer/core/threads.h

index d0f4b30..3072bbc 100644 (file)
--- a/src/gallium/drivers/swr/rasterizer/core/threads.h
+++ b/src/gallium/drivers/swr/rasterizer/core/threads.h
@@ -45,7 +45,8 @@ struct THREAD_DATA
      uint32_t     numaId;             // NUMA node id
      uint32_t     coreId;             // Core id
      uint32_t     htId;               // Hyperthread id
-    uint32_t     workerId;
+    uint32_t     workerId;           // index of worker in total thread data
+    void*        clipperData;        // pointer to hang clipper-private data on
      SWR_CONTEXT* pContext;
      bool         forceBindProcGroup; // Only useful when MAX_WORKER_THREADS is set.
  };
author	Alok Hota <alok.hota@intel.com>
	Thu, 13 Sep 2018 21:12:12 +0000 (16:12 -0500)
committer	Alok Hota <alok.hota@intel.com>
	Fri, 15 Feb 2019 20:54:23 +0000 (14:54 -0600)
src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx.inl		patch \| blob \| history
src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx2.inl		patch \| blob \| history
src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx512.inl		patch \| blob \| history
src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx.inl		patch \| blob \| history
src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx2.inl		patch \| blob \| history
src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx512.inl		patch \| blob \| history
src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512.inl		patch \| blob \| history
src/gallium/drivers/swr/rasterizer/common/simdlib_512_emu.inl		patch \| blob \| history
src/gallium/drivers/swr/rasterizer/common/simdlib_interface.hpp		patch \| blob \| history
src/gallium/drivers/swr/rasterizer/core/clip.cpp		patch \| blob \| history
src/gallium/drivers/swr/rasterizer/core/clip.h		patch \| blob \| history
src/gallium/drivers/swr/rasterizer/core/frontend.cpp		patch \| blob \| history
src/gallium/drivers/swr/rasterizer/core/threads.h		patch \| blob \| history