swr/rast: Use gather instruction for i32gather_ps on simd16/avx512
author Tim Rowley <timothy.o.rowley@intel.com>
Mon, 13 Nov 2017 21:11:21 +0000 (15:11 -0600)
committer Tim Rowley <timothy.o.rowley@intel.com>
Tue, 14 Nov 2017 17:39:02 +0000 (11:39 -0600)
Speed up avx512 platforms; fixes performance regression caused
by the switch to simdlib.

Reviewed-by: Bruce Cherniak <bruce.cherniak@intel.com>
Cc: mesa-stable@lists.freedesktop.org
src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512.inl

index 95e4c31..c13b9f6 100644 (file)
@@ -484,17 +484,7 @@ SIMD_WRAPPER_2(unpacklo_ps);
 template<ScaleFactor ScaleT>
 static SIMDINLINE Float SIMDCALL i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
 {
-    uint32_t *pOffsets = (uint32_t*)&idx;
-    Float vResult;
-    float* pResult = (float*)&vResult;
-    for (uint32_t i = 0; i < SIMD_WIDTH; ++i)
-    {
-        uint32_t offset = pOffsets[i];
-        offset = offset * static_cast<uint32_t>(ScaleT);
-        pResult[i] = *(float const*)(((uint8_t const*)p + offset));
-    }
-
-    return vResult;
+    return _mm512_i32gather_ps(idx, p, static_cast<int>(ScaleT));
 }
 
 static SIMDINLINE Float SIMDCALL load1_ps(float const *p)  // return *p    (broadcast 1 value to all elements)