swr: fix _BitScanForward64 on unix
authorMichel Zou <xantares09@hotmail.com>
Tue, 15 Sep 2020 19:08:06 +0000 (21:08 +0200)
committerMarge Bot <eric+marge@anholt.net>
Fri, 18 Sep 2020 06:24:00 +0000 (06:24 +0000)
it must apply to 64 bits types, and use the ctzll intrinsic instead of ctz

Reviewed-by: Jose Fonseca <jfonseca@vmware.com>
Reviewed-by: Krzysztof Raszkowski <krzysztof.raszkowski@intel.com>
Reviewed-by: Jan Zielinski <jan.zielinski@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/6705>

15 files changed:
src/gallium/drivers/swr/rasterizer/common/intrin.h
src/gallium/drivers/swr/rasterizer/common/os.h
src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx.inl
src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx.inl
src/gallium/drivers/swr/rasterizer/core/api.cpp
src/gallium/drivers/swr/rasterizer/core/backend_impl.h
src/gallium/drivers/swr/rasterizer/core/backend_sample.cpp
src/gallium/drivers/swr/rasterizer/core/backend_singlesample.cpp
src/gallium/drivers/swr/rasterizer/core/binner.cpp
src/gallium/drivers/swr/rasterizer/core/clip.h
src/gallium/drivers/swr/rasterizer/core/frontend.cpp
src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp
src/gallium/drivers/swr/rasterizer/core/rasterizer_impl.h
src/gallium/drivers/swr/rasterizer/jitter/shader_lib/Scatter.cpp
src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp

index 4c413ca..95b462b 100644 (file)
@@ -70,7 +70,7 @@ UINT pdep_u32(UINT a, UINT mask)
 
     // copied from http://wm.ite.pl/articles/pdep-soft-emu.html
     // using bsf instead of funky loop
-    DWORD maskIndex;
+    unsigned long maskIndex = 0;
     while (_BitScanForward(&maskIndex, mask))
     {
         // 1. isolate lowest set bit of mask
@@ -100,7 +100,7 @@ UINT pext_u32(UINT a, UINT mask)
     return _pext_u32(a, mask);
 #else
     UINT     result = 0;
-    DWORD    maskIndex;
+    unsigned long maskIndex;
     uint32_t currentBit = 0;
     while (_BitScanForward(&maskIndex, mask))
     {
index 14a613e..a37b9f4 100644 (file)
@@ -224,33 +224,30 @@ static INLINE void _mm256_storeu2_m128i(__m128i* hi, __m128i* lo, __m256i a)
 #endif
 #endif
 
-inline unsigned char _BitScanForward(unsigned long* Index, unsigned long Mask)
+inline unsigned char _BitScanForward64(unsigned long* Index, uint64_t Mask)
 {
-    *Index = __builtin_ctz(Mask);
+    *Index = __builtin_ctzll(Mask);
     return (Mask != 0);
 }
 
-inline unsigned char _BitScanForward(unsigned int* Index, unsigned int Mask)
+inline unsigned char _BitScanForward(unsigned long* Index, uint32_t Mask)
 {
     *Index = __builtin_ctz(Mask);
     return (Mask != 0);
 }
 
-inline unsigned char _BitScanReverse(unsigned long* Index, unsigned long Mask)
+inline unsigned char _BitScanReverse64(unsigned long* Index, uint64_t Mask)
 {
-    *Index = 63 - __builtin_clz(Mask);
+    *Index = 63 - __builtin_clzll(Mask);
     return (Mask != 0);
 }
 
-inline unsigned char _BitScanReverse(unsigned int* Index, unsigned int Mask)
+inline unsigned char _BitScanReverse(unsigned long* Index, uint32_t Mask)
 {
     *Index = 31 - __builtin_clz(Mask);
     return (Mask != 0);
 }
 
-#define _BitScanForward64 _BitScanForward
-#define _BitScanReverse64 _BitScanReverse
-
 inline void* AlignedMalloc(size_t size, size_t alignment)
 {
     void* ret;
index 9d190bc..83ce967 100644 (file)
@@ -469,7 +469,7 @@ static SIMDINLINE Float SIMDCALL
     uint32_t* pOffsets = (uint32_t*)&idx;
     Float     vResult  = old;
     float*    pResult  = (float*)&vResult;
-    DWORD     index;
+    unsigned long index;
     uint32_t  umask = movemask_ps(mask);
     while (_BitScanForward(&index, umask))
     {
index b5046e4..d0c3ecd 100644 (file)
@@ -635,7 +635,7 @@ static SIMDINLINE Float SIMDCALL
     uint32_t* pOffsets = (uint32_t*)&idx;
     Float     vResult  = old;
     float*    pResult  = (float*)&vResult;
-    DWORD     index;
+    unsigned long index = 0;
     uint32_t  umask = movemask_ps(mask);
     while (_BitScanForward(&index, umask))
     {
index 31f9fe2..107277b 100644 (file)
@@ -987,7 +987,7 @@ void SetupPipeline(DRAW_CONTEXT* pDC)
             streamMasks |= pState->state.soState.streamMasks[i];
         }
 
-        DWORD maxAttrib;
+        unsigned long maxAttrib;
         if (_BitScanReverse64(&maxAttrib, streamMasks))
         {
             pState->state.feNumAttributes =
@@ -1027,7 +1027,7 @@ void SetupPipeline(DRAW_CONTEXT* pDC)
     // Disable hottile for surfaces with no writes
     if (psState.pfnPixelShader != nullptr)
     {
-        DWORD    rt;
+        unsigned long rt;
         uint32_t rtMask = pState->state.psState.renderTargetMask;
         while (_BitScanForward(&rt, rtMask))
         {
index c6f3b02..868419c 100644 (file)
@@ -609,7 +609,7 @@ inline void SetupRenderBuffers(uint8_t*             pColorBuffer[SWR_NUM_RENDERT
                                uint32_t             colorHotTileMask,
                                RenderOutputBuffers& renderBuffers)
 {
-    DWORD index;
+    unsigned long index;
     while (_BitScanForward(&index, colorHotTileMask))
     {
         assert(index < SWR_NUM_RENDERTARGETS);
@@ -937,7 +937,7 @@ INLINE void OutputMerger8x2(DRAW_CONTEXT*   pDC,
     simdvector blendSrc;
     simdvector blendOut;
 
-    DWORD rt;
+    unsigned long rt;
     while (_BitScanForward(&rt, renderTargetMask))
     {
         renderTargetMask &= ~(1 << rt);
@@ -1250,7 +1250,7 @@ void BackendPixelRate(DRAW_CONTEXT*        pDC,
 
             if (useAlternateOffset)
             {
-                DWORD    rt;
+                unsigned long rt;
                 uint32_t rtMask = state.colorHottileEnable;
                 while (_BitScanForward(&rt, rtMask))
                 {
index 39e078b..7881d36 100644 (file)
@@ -302,7 +302,7 @@ void BackendSampleRate(DRAW_CONTEXT*        pDC,
 
             if (useAlternateOffset)
             {
-                DWORD    rt;
+                unsigned long rt;
                 uint32_t rtMask = state.colorHottileEnable;
                 while (_BitScanForward(&rt, rtMask))
                 {
index 01cb26b..06f78c4 100644 (file)
@@ -285,7 +285,7 @@ void BackendSingleSample(DRAW_CONTEXT*        pDC,
 
             if (useAlternateOffset)
             {
-                DWORD    rt;
+                unsigned long rt;
                 uint32_t rtMask = state.colorHottileEnable;
                 while (_BitScanForward(&rt, rtMask))
                 {
index 75aa467..3673228 100644 (file)
@@ -179,7 +179,7 @@ INLINE void ProcessAttributes(
             uint32_t mask = backendState.swizzleMap[i].componentOverrideMask;
             if (mask)
             {
-                DWORD comp;
+                unsigned long comp;
                 while (_BitScanForward(&comp, mask))
                 {
                     mask &= ~(1 << comp);
@@ -245,7 +245,7 @@ void ProcessUserClipDist(const SWR_BACKEND_STATE& state,
                          float*                   pRecipW,
                          float*                   pUserClipBuffer)
 {
-    DWORD    clipDist;
+    unsigned long clipDist;
     uint32_t clipDistMask = state.clipDistanceMask;
     while (_BitScanForward(&clipDist, clipDistMask))
     {
@@ -1122,7 +1122,7 @@ endBinTriangles:
     TransposeVertices(vHorizW, vRecipW0, vRecipW1, vRecipW2);
 
     // scan remaining valid triangles and bin each separately
-    while (_BitScanForward((DWORD*)&triIndex, triMask))
+    while (_BitScanForward((unsigned long*)&triIndex, triMask))
     {
         uint32_t linkageCount     = state.backendState.numAttributes;
         uint32_t numScalarAttribs = linkageCount * 4;
@@ -1363,7 +1363,7 @@ void BinPostSetupPointsImpl(DRAW_CONTEXT*          pDC,
         const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState;
 
         // scan remaining valid triangles and bin each separately
-        while (_BitScanForward((DWORD*)&primIndex, primMask))
+        while (_BitScanForward((unsigned long*)&primIndex, primMask))
         {
             uint32_t linkageCount     = backendState.numAttributes;
             uint32_t numScalarAttribs = linkageCount * 4;
@@ -1519,7 +1519,7 @@ void BinPostSetupPointsImpl(DRAW_CONTEXT*          pDC,
         // scan remaining valid prims and bin each separately
         const SWR_BACKEND_STATE& backendState = state.backendState;
         uint32_t                 primIndex;
-        while (_BitScanForward((DWORD*)&primIndex, primMask))
+        while (_BitScanForward((unsigned long*)&primIndex, primMask))
         {
             uint32_t linkageCount     = backendState.numAttributes;
             uint32_t numScalarAttribs = linkageCount * 4;
@@ -1818,8 +1818,8 @@ void BinPostSetupLinesImpl(DRAW_CONTEXT*          pDC,
     TransposeVertices(vHorizW, vRecipW0, vRecipW1, SIMD_T::setzero_ps());
 
     // scan remaining valid prims and bin each separately
-    uint32_t primIndex;
-    while (_BitScanForward((DWORD*)&primIndex, primMask))
+    unsigned long primIndex;
+    while (_BitScanForward(&primIndex, primMask))
     {
         uint32_t linkageCount     = state.backendState.numAttributes;
         uint32_t numScalarAttribs = linkageCount * 4;
index 1965274..d7186ca 100644 (file)
@@ -409,7 +409,7 @@ public:
         pa.Assemble(vertexClipCullOffset, vClipCullDistLo);
         pa.Assemble(vertexClipCullOffset + 1, vClipCullDistHi);
 
-        DWORD index;
+        unsigned long index;
         while (_BitScanForward(&index, cullMask))
         {
             cullMask &= ~(1 << index);
@@ -881,7 +881,7 @@ private:
         const uint32_t* pOffsets = reinterpret_cast<const uint32_t*>(&vOffsets);
         const float*    pSrc     = reinterpret_cast<const float*>(&vSrc);
         uint32_t        mask     = SIMD_T::movemask_ps(vMask);
-        DWORD           lane;
+        unsigned long  lane;
         while (_BitScanForward(&lane, mask))
         {
             mask &= ~(1 << lane);
index 13aa89e..65bde13 100644 (file)
@@ -534,7 +534,7 @@ static void StreamOut(
 
     for (uint32_t primIndex = 0; primIndex < numPrims; ++primIndex)
     {
-        DWORD    slot   = 0;
+        unsigned long slot = 0;
         uint64_t soMask = soState.streamMasks[streamIndex];
 
         // Write all entries into primitive data buffer for SOS.
index 4f1d8cc..c14cd56 100644 (file)
@@ -382,7 +382,7 @@ void RasterizeTriPoint(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile,
 
         // overwrite texcoord for point sprites
         uint32_t texCoordMask   = backendState.pointSpriteTexCoordMask;
-        DWORD    texCoordAttrib = 0;
+        unsigned long texCoordAttrib = 0;
 
         while (_BitScanForward(&texCoordAttrib, texCoordMask))
         {
@@ -424,7 +424,7 @@ void RasterizeTriPoint(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile,
     if (isPointSpriteTexCoordEnabled)
     {
         uint32_t texCoordMask   = backendState.pointSpriteTexCoordMask;
-        DWORD    texCoordAttrib = 0;
+        unsigned long texCoordAttrib = 0;
 
         while (_BitScanForward(&texCoordAttrib, texCoordMask))
         {
index b81baa9..2153fe6 100644 (file)
@@ -1511,7 +1511,7 @@ void GetRenderHotTiles(DRAW_CONTEXT*        pDC,
 template <typename RT>
 INLINE void StepRasterTileX(uint32_t colorHotTileMask, RenderOutputBuffers& buffers)
 {
-    DWORD rt = 0;
+    unsigned long rt = 0;
     while (_BitScanForward(&rt, colorHotTileMask))
     {
         colorHotTileMask &= ~(1 << rt);
@@ -1527,7 +1527,7 @@ INLINE void StepRasterTileY(uint32_t             colorHotTileMask,
                             RenderOutputBuffers& buffers,
                             RenderOutputBuffers& startBufferRow)
 {
-    DWORD rt = 0;
+    unsigned long rt = 0;
     while (_BitScanForward(&rt, colorHotTileMask))
     {
         colorHotTileMask &= ~(1 << rt);
index de81154..925d57f 100644 (file)
@@ -39,7 +39,7 @@ extern "C" void ScatterPS_256(uint8_t* pBase, SIMD256::Integer vIndices, SIMD256
     SIMD256::store_ps(src, vSrc);
     SIMD256::store_si((SIMD256::Integer*)indices, vIndices);
 
-    DWORD index;
+    unsigned long index;
     while (_BitScanForward(&index, mask))
     {
         mask &= ~(1 << index);
index 60259f0..72e1261 100644 (file)
@@ -91,7 +91,7 @@ struct StreamOutJit : public BuilderGfxMem
     Value* PackMask(uint32_t bitmask)
     {
         std::vector<Constant*> indices(4, C(0));
-        DWORD                  index;
+        unsigned long          index;
         uint32_t               elem = 0;
         while (_BitScanForward(&index, bitmask))
         {