From abd4aa68cc1a7d8a20547069c617388eedb3673e Mon Sep 17 00:00:00 2001 From: Tim Rowley Date: Wed, 24 Feb 2016 13:34:50 -0600 Subject: [PATCH] swr: [rasterizer core] backend reorganization --- .../drivers/swr/rasterizer/common/simdintrin.h | 62 +++++++ src/gallium/drivers/swr/rasterizer/core/api.cpp | 38 +++-- .../drivers/swr/rasterizer/core/backend.cpp | 186 ++------------------- src/gallium/drivers/swr/rasterizer/core/backend.h | 171 +++++++++++++++++-- src/gallium/drivers/swr/rasterizer/core/context.h | 1 + .../drivers/swr/rasterizer/core/rasterizer.cpp | 8 +- src/gallium/drivers/swr/rasterizer/core/state.h | 8 +- .../drivers/swr/rasterizer/memory/tilingtraits.h | 58 +------ 8 files changed, 276 insertions(+), 256 deletions(-) diff --git a/src/gallium/drivers/swr/rasterizer/common/simdintrin.h b/src/gallium/drivers/swr/rasterizer/common/simdintrin.h index 8fa6d9e..9022094 100644 --- a/src/gallium/drivers/swr/rasterizer/common/simdintrin.h +++ b/src/gallium/drivers/swr/rasterizer/common/simdintrin.h @@ -136,6 +136,8 @@ __m256i func(__m256i a, __m256i b)\ #define _simd_add_epi8 _simdemu_add_epi8 #define _simd_cmpeq_epi64 _simdemu_cmpeq_epi64 #define _simd_cmpgt_epi64 _simdemu_cmpgt_epi64 +#define _simd_cmpgt_epi8 _simdemu_cmpgt_epi8 +#define _simd_cmpgt_epi16 _simdemu_cmpgt_epi16 #define _simd_movemask_epi8 _simdemu_movemask_epi8 SIMD_EMU_EPI(_simdemu_mul_epi32, _mm_mul_epi32) @@ -158,6 +160,8 @@ SIMD_EMU_EPI(_simdemu_subs_epu8, _mm_subs_epu8) SIMD_EMU_EPI(_simdemu_add_epi8, _mm_add_epi8) SIMD_EMU_EPI(_simdemu_cmpeq_epi64, _mm_cmpeq_epi64) SIMD_EMU_EPI(_simdemu_cmpgt_epi64, _mm_cmpgt_epi64) +SIMD_EMU_EPI(_simdemu_cmpgt_epi8, _mm_cmpgt_epi8) +SIMD_EMU_EPI(_simdemu_cmpgt_epi16, _mm_cmpgt_epi16) #define _simd_unpacklo_epi32(a, b) _mm256_castps_si256(_mm256_unpacklo_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))) #define _simd_unpackhi_epi32(a, b) _mm256_castps_si256(_mm256_unpackhi_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))) @@ -295,6 +299,8 @@ int _simdemu_movemask_epi8(__m256i a) #define _simd_cmpeq_epi64 _mm256_cmpeq_epi64 #define _simd_cmpgt_epi64 _mm256_cmpgt_epi64 +#define _simd_cmpgt_epi8 _mm256_cmpgt_epi8 +#define _simd_cmpgt_epi16 _mm256_cmpgt_epi16 #define _simd_movemask_epi8 _mm256_movemask_epi8 #endif @@ -783,5 +789,61 @@ static INLINE simdscalar InterpolateComponent(simdscalar vI, simdscalar vJ, cons return vplaneps(vA, vB, vC, vI, vJ); } +INLINE +UINT pdep_u32(UINT a, UINT mask) +{ +#if KNOB_ARCH==KNOB_ARCH_AVX2 + return _pdep_u32(a, mask); +#else + UINT result = 0; + + // copied from http://wm.ite.pl/articles/pdep-soft-emu.html + // using bsf instead of funky loop + DWORD maskIndex; + while (_BitScanForward(&maskIndex, mask)) + { + // 1. isolate lowest set bit of mask + const UINT lowest = 1 << maskIndex; + + // 2. populate LSB from src + const UINT LSB = (UINT)((int)(a << 31) >> 31); + + // 3. copy bit from mask + result |= LSB & lowest; + + // 4. clear lowest bit + mask &= ~lowest; + + // 5. prepare for next iteration + a >>= 1; + } + + return result; +#endif +} + +INLINE +UINT pext_u32(UINT a, UINT mask) +{ +#if KNOB_ARCH==KNOB_ARCH_AVX2 + return _pext_u32(a, mask); +#else + UINT result = 0; + DWORD maskIndex; + uint32_t currentBit = 0; + while (_BitScanForward(&maskIndex, mask)) + { + // 1. isolate lowest set bit of mask + const UINT lowest = 1 << maskIndex; + + // 2. copy bit from mask + result |= ((a & lowest) > 0) << currentBit++; + + // 3. clear lowest bit + mask &= ~lowest; + } + return result; +#endif +} #endif//__SWR_SIMDINTRIN_H__ diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp b/src/gallium/drivers/swr/rasterizer/core/api.cpp index e18f9e7..f2061e6 100644 --- a/src/gallium/drivers/swr/rasterizer/core/api.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp @@ -721,16 +721,25 @@ void SetupMacroTileScissors(DRAW_CONTEXT *pDC) pState->scissorInFixedPoint.bottom = bottom * FIXED_POINT_SCALE - 1; } } - +// templated backend function tables +extern PFN_BACKEND_FUNC gBackendNullPs[SWR_MULTISAMPLE_TYPE_MAX]; +extern PFN_BACKEND_FUNC gBackendSingleSample[2][2]; +extern PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_MAX][SWR_MSAA_SAMPLE_PATTERN_MAX][SWR_INPUT_COVERAGE_MAX][2][2]; +extern PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_MAX][SWR_INPUT_COVERAGE_MAX][2]; +extern PFN_OUTPUT_MERGER gBackendOutputMergerTable[SWR_NUM_RENDERTARGETS + 1][SWR_MULTISAMPLE_TYPE_MAX]; +extern PFN_CALC_PIXEL_BARYCENTRICS gPixelBarycentricTable[2]; +extern PFN_CALC_SAMPLE_BARYCENTRICS gSampleBarycentricTable[2]; +extern PFN_CALC_CENTROID_BARYCENTRICS gCentroidBarycentricTable[SWR_MULTISAMPLE_TYPE_MAX][2][2][2]; void SetupPipeline(DRAW_CONTEXT *pDC) { DRAW_STATE* pState = pDC->pState; const SWR_RASTSTATE &rastState = pState->state.rastState; + const SWR_PS_STATE &psState = pState->state.psState; BACKEND_FUNCS& backendFuncs = pState->backendFuncs; const uint32_t forcedSampleCount = (rastState.bForcedSampleCount) ? 1 : 0; // setup backend - if (pState->state.psState.pfnPixelShader == nullptr) + if (psState.pfnPixelShader == nullptr) { backendFuncs.pfnBackend = gBackendNullPs[pState->state.rastState.sampleCount]; // always need to generate I & J per sample for Z interpolation @@ -739,41 +748,40 @@ void SetupPipeline(DRAW_CONTEXT *pDC) else { const bool bMultisampleEnable = ((rastState.sampleCount > SWR_MULTISAMPLE_1X) || rastState.bForcedSampleCount) ? 1 : 0; - const uint32_t centroid = ((pState->state.psState.barycentricsMask & SWR_BARYCENTRIC_CENTROID_MASK) > 0) ? 1 : 0; + const uint32_t centroid = ((psState.barycentricsMask & SWR_BARYCENTRIC_CENTROID_MASK) > 0) ? 1 : 0; // currently only support 'normal' input coverage - SWR_ASSERT(pState->state.psState.inputCoverage == SWR_INPUT_COVERAGE_NORMAL || - pState->state.psState.inputCoverage == SWR_INPUT_COVERAGE_NONE); + SWR_ASSERT(psState.inputCoverage == SWR_INPUT_COVERAGE_NORMAL || + psState.inputCoverage == SWR_INPUT_COVERAGE_NONE); - SWR_BARYCENTRICS_MASK barycentricsMask = (SWR_BARYCENTRICS_MASK)pState->state.psState.barycentricsMask; + SWR_BARYCENTRICS_MASK barycentricsMask = (SWR_BARYCENTRICS_MASK)psState.barycentricsMask; // select backend function - switch(pState->state.psState.shadingRate) + switch(psState.shadingRate) { case SWR_SHADING_RATE_PIXEL: if(bMultisampleEnable) { // always need to generate I & J per sample for Z interpolation barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_SAMPLE_MASK); - backendFuncs.pfnBackend = gBackendPixelRateTable[rastState.sampleCount][rastState.samplePattern][pState->state.psState.inputCoverage][centroid][forcedSampleCount]; - backendFuncs.pfnOutputMerger = gBackendOutputMergerTable[pState->state.psState.numRenderTargets][pState->state.blendState.sampleCount]; + backendFuncs.pfnBackend = gBackendPixelRateTable[rastState.sampleCount][rastState.samplePattern][psState.inputCoverage][centroid][forcedSampleCount]; + backendFuncs.pfnOutputMerger = gBackendOutputMergerTable[psState.numRenderTargets][pState->state.blendState.sampleCount]; } else { // always need to generate I & J per pixel for Z interpolation barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_PIXEL_MASK); - backendFuncs.pfnBackend = gBackendSingleSample[pState->state.psState.inputCoverage][centroid]; - backendFuncs.pfnOutputMerger = gBackendOutputMergerTable[pState->state.psState.numRenderTargets][SWR_MULTISAMPLE_1X]; + backendFuncs.pfnBackend = gBackendSingleSample[psState.inputCoverage][centroid]; + backendFuncs.pfnOutputMerger = gBackendOutputMergerTable[psState.numRenderTargets][SWR_MULTISAMPLE_1X]; } break; case SWR_SHADING_RATE_SAMPLE: SWR_ASSERT(rastState.samplePattern == SWR_MSAA_STANDARD_PATTERN); // always need to generate I & J per sample for Z interpolation barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_SAMPLE_MASK); - backendFuncs.pfnBackend = gBackendSampleRateTable[rastState.sampleCount][pState->state.psState.inputCoverage][centroid]; - backendFuncs.pfnOutputMerger = gBackendOutputMergerTable[pState->state.psState.numRenderTargets][pState->state.blendState.sampleCount]; + backendFuncs.pfnBackend = gBackendSampleRateTable[rastState.sampleCount][psState.inputCoverage][centroid]; + backendFuncs.pfnOutputMerger = gBackendOutputMergerTable[psState.numRenderTargets][pState->state.blendState.sampleCount]; break; - case SWR_SHADING_RATE_COARSE: default: SWR_ASSERT(0 && "Invalid shading rate"); break; @@ -864,7 +872,7 @@ void SetupPipeline(DRAW_CONTEXT *pDC) uint32_t numRTs = pState->state.psState.numRenderTargets; pState->state.colorHottileEnable = 0; - if(pState->state.psState.pfnPixelShader != nullptr) + if (psState.pfnPixelShader != nullptr) { for (uint32_t rt = 0; rt < numRTs; ++rt) { diff --git a/src/gallium/drivers/swr/rasterizer/core/backend.cpp b/src/gallium/drivers/swr/rasterizer/core/backend.cpp index 8c1858b..b8f1e5a 100644 --- a/src/gallium/drivers/swr/rasterizer/core/backend.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/backend.cpp @@ -418,11 +418,10 @@ void ProcessInvalidateTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t mac } #if KNOB_SIMD_WIDTH == 8 -const __m256 vQuadCenterOffsetsX = { 0.5, 1.5, 0.5, 1.5, 2.5, 3.5, 2.5, 3.5 }; -const __m256 vQuadCenterOffsetsY = { 0.5, 0.5, 1.5, 1.5, 0.5, 0.5, 1.5, 1.5 }; -const __m256 vQuadULOffsetsX ={0.0, 1.0, 0.0, 1.0, 2.0, 3.0, 2.0, 3.0}; -const __m256 vQuadULOffsetsY ={0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0}; -#define MASK 0xff +const __m256 vCenterOffsetsX = {0.5, 1.5, 0.5, 1.5, 2.5, 3.5, 2.5, 3.5}; +const __m256 vCenterOffsetsY = {0.5, 0.5, 1.5, 1.5, 0.5, 0.5, 1.5, 1.5}; +const __m256 vULOffsetsX = {0.0, 1.0, 0.0, 1.0, 2.0, 3.0, 2.0, 3.0}; +const __m256 vULOffsetsY = {0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0}; #else #error Unsupported vector width #endif @@ -457,155 +456,6 @@ simdmask ComputeUserClipMask(uint8_t clipMask, float* pUserClipBuffer, simdscala return _simd_movemask_ps(vClipMask); } -template -INLINE void generateInputCoverage(const uint64_t *const coverageMask, uint32_t (&inputMask)[KNOB_SIMD_WIDTH], const uint32_t sampleMask) -{ - - // will need to update for avx512 - assert(KNOB_SIMD_WIDTH == 8); - - __m256i mask[2]; - __m256i sampleCoverage[2]; - if(bIsStandardPattern) - { - __m256i src = _mm256_set1_epi32(0); - __m256i index0 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0), index1; - - if(MultisampleTraits::numSamples == 1) - { - mask[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, -1); - } - else if(MultisampleTraits::numSamples == 2) - { - mask[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1); - } - else if(MultisampleTraits::numSamples == 4) - { - mask[0] = _mm256_set_epi32(0, 0, 0, 0, -1, -1, -1, -1); - } - else if(MultisampleTraits::numSamples == 8) - { - mask[0] = _mm256_set1_epi32(-1); - } - else if(MultisampleTraits::numSamples == 16) - { - mask[0] = _mm256_set1_epi32(-1); - mask[1] = _mm256_set1_epi32(-1); - index1 = _mm256_set_epi32(15, 14, 13, 12, 11, 10, 9, 8); - } - - // gather coverage for samples 0-7 - sampleCoverage[0] = _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src), (const float*)coverageMask, index0, _mm256_castsi256_ps(mask[0]), 8)); - if(MultisampleTraits::numSamples > 8) - { - // gather coverage for samples 8-15 - sampleCoverage[1] = _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src), (const float*)coverageMask, index1, _mm256_castsi256_ps(mask[1]), 8)); - } - } - else - { - // center coverage is the same for all samples; just broadcast to the sample slots - uint32_t centerCoverage = ((uint32_t)(*coverageMask) & MASK); - if(MultisampleTraits::numSamples == 1) - { - sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, centerCoverage); - } - else if(MultisampleTraits::numSamples == 2) - { - sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, centerCoverage, centerCoverage); - } - else if(MultisampleTraits::numSamples == 4) - { - sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, centerCoverage, centerCoverage, centerCoverage, centerCoverage); - } - else if(MultisampleTraits::numSamples == 8) - { - sampleCoverage[0] = _mm256_set1_epi32(centerCoverage); - } - else if(MultisampleTraits::numSamples == 16) - { - sampleCoverage[0] = _mm256_set1_epi32(centerCoverage); - sampleCoverage[1] = _mm256_set1_epi32(centerCoverage); - } - } - - mask[0] = _mm256_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0xC, 0x8, 0x4, 0x0, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0xC, 0x8, 0x4, 0x0); - // pull out the the 8bit 4x2 coverage for samples 0-7 into the lower 32 bits of each 128bit lane - __m256i packedCoverage0 = _simd_shuffle_epi8(sampleCoverage[0], mask[0]); - - __m256i packedCoverage1; - if(MultisampleTraits::numSamples > 8) - { - // pull out the the 8bit 4x2 coverage for samples 8-15 into the lower 32 bits of each 128bit lane - packedCoverage1 = _simd_shuffle_epi8(sampleCoverage[1], mask[0]); - } - -#if (KNOB_ARCH == KNOB_ARCH_AVX) - // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane - __m256i hiToLow = _mm256_permute2f128_si256(packedCoverage0, packedCoverage0, 0x83); - __m256 shufRes = _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow), _mm256_castsi256_ps(hiToLow), _MM_SHUFFLE(1, 1, 0, 1)); - packedCoverage0 = _mm256_castps_si256(_mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0), shufRes, 0xFE)); - - __m256i packedSampleCoverage; - if(MultisampleTraits::numSamples > 8) - { - // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane - hiToLow = _mm256_permute2f128_si256(packedCoverage1, packedCoverage1, 0x83); - shufRes = _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow), _mm256_castsi256_ps(hiToLow), _MM_SHUFFLE(1, 1, 0, 1)); - shufRes = _mm256_blend_ps(_mm256_castsi256_ps(packedCoverage1), shufRes, 0xFE); - packedCoverage1 = _mm256_castps_si256(_mm256_castpd_ps(_mm256_shuffle_pd(_mm256_castps_pd(shufRes), _mm256_castps_pd(shufRes), 0x01))); - packedSampleCoverage = _mm256_castps_si256(_mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0), _mm256_castsi256_ps(packedCoverage1), 0xFC)); - } - else - { - packedSampleCoverage = packedCoverage0; - } -#else - __m256i permMask = _mm256_set_epi32(0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x4, 0x0); - // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane - packedCoverage0 = _mm256_permutevar8x32_epi32(packedCoverage0, permMask); - - __m256i packedSampleCoverage; - if(MultisampleTraits::numSamples > 8) - { - permMask = _mm256_set_epi32(0x7, 0x7, 0x7, 0x7, 0x4, 0x0, 0x7, 0x7); - // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane - packedCoverage1 = _mm256_permutevar8x32_epi32(packedCoverage1, permMask); - - // blend coverage masks for samples 0-7 and samples 8-15 into single 128 bit lane - packedSampleCoverage = _mm256_blend_epi32(packedCoverage0, packedCoverage1, 0x0C); - } - else - { - packedSampleCoverage = packedCoverage0; - } -#endif - - for(int32_t i = KNOB_SIMD_WIDTH - 1; i >= 0; i--) - { - // convert packed sample coverage masks into single coverage masks for all samples for each pixel in the 4x2 - inputMask[i] = _simd_movemask_epi8(packedSampleCoverage); - - if(!bForcedSampleCount) - { - // input coverage has to be anded with sample mask if MSAA isn't forced on - inputMask[i] &= sampleMask; - } - - // shift to the next pixel in the 4x2 - packedSampleCoverage = _simd_slli_epi32(packedSampleCoverage, 1); - } -} - -template -INLINE void generateInputCoverage(const uint64_t *const coverageMask, __m256 &inputCoverage, const uint32_t sampleMask) -{ - uint32_t inputMask[KNOB_SIMD_WIDTH]; - generateInputCoverage(coverageMask, inputMask, sampleMask); - inputCoverage = _simd_castsi_ps(_mm256_set_epi32(inputMask[7], inputMask[6], inputMask[5], inputMask[4], inputMask[3], inputMask[2], inputMask[1], inputMask[0])); -} - template INLINE void CalcPixelBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext) { @@ -889,9 +739,9 @@ void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint3 for(uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM) { // UL pixel corner - psContext.vY.UL = _simd_add_ps(vQuadULOffsetsY, _simd_set1_ps((float)yy)); + psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps((float)yy)); // pixel center - psContext.vY.center = _simd_add_ps(vQuadCenterOffsetsY, _simd_set1_ps((float)yy)); + psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps((float)yy)); for(uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM) { @@ -903,9 +753,9 @@ void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint3 if(coverageMask & MASK) { RDTSC_START(BEBarycentric); - psContext.vX.UL = _simd_add_ps(vQuadULOffsetsX, _simd_set1_ps((float)xx)); + psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps((float)xx)); // pixel center - psContext.vX.center = _simd_add_ps(vQuadCenterOffsetsX, _simd_set1_ps((float)xx)); + psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps((float)xx)); backendFuncs.pfnCalcPixelBarycentrics(coeffs, psContext); @@ -1082,15 +932,15 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_ for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM) { // UL pixel corner - psContext.vY.UL = _simd_add_ps(vQuadULOffsetsY, _simd_set1_ps((float)yy)); + psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps((float)yy)); // pixel center - psContext.vY.center = _simd_add_ps(vQuadCenterOffsetsY, _simd_set1_ps((float)yy)); + psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps((float)yy)); for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM) { - psContext.vX.UL = _simd_add_ps(vQuadULOffsetsX, _simd_set1_ps((float)xx)); + psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps((float)xx)); // pixel center - psContext.vX.center = _simd_add_ps(vQuadCenterOffsetsX, _simd_set1_ps((float)xx)); + psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps((float)xx)); RDTSC_START(BEBarycentric); backendFuncs.pfnCalcPixelBarycentrics(coeffs, psContext); @@ -1318,14 +1168,14 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t for(uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM) { - psContext.vY.UL = _simd_add_ps(vQuadULOffsetsY, _simd_set1_ps((float)yy)); - psContext.vY.center = _simd_add_ps(vQuadCenterOffsetsY, _simd_set1_ps((float)yy)); + psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps((float)yy)); + psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps((float)yy)); for(uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM) { simdscalar vZ[MultisampleTraits::numSamples]; - psContext.vX.UL = _simd_add_ps(vQuadULOffsetsX, _simd_set1_ps((float)xx)); + psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps((float)xx)); // set pixel center positions - psContext.vX.center = _simd_add_ps(vQuadCenterOffsetsX, _simd_set1_ps((float)xx)); + psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps((float)xx)); if (bInputCoverage) { @@ -1585,12 +1435,12 @@ void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM) { // UL pixel corner - simdscalar vYSamplePosUL = _simd_add_ps(vQuadULOffsetsY, _simd_set1_ps((float)yy)); + simdscalar vYSamplePosUL = _simd_add_ps(vULOffsetsY, _simd_set1_ps((float)yy)); for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM) { // UL pixel corners - simdscalar vXSamplePosUL = _simd_add_ps(vQuadULOffsetsX, _simd_set1_ps((float)xx)); + simdscalar vXSamplePosUL = _simd_add_ps(vULOffsetsX, _simd_set1_ps((float)xx)); // iterate over active samples unsigned long sample = 0; diff --git a/src/gallium/drivers/swr/rasterizer/core/backend.h b/src/gallium/drivers/swr/rasterizer/core/backend.h index 53089e5..91b8ccc 100644 --- a/src/gallium/drivers/swr/rasterizer/core/backend.h +++ b/src/gallium/drivers/swr/rasterizer/core/backend.h @@ -29,7 +29,8 @@ #pragma once #include "common/os.h" -#include "core/context.h" +#include "core/context.h" +#include "core/multisample.h" void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId); void ProcessSyncBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData); @@ -39,6 +40,9 @@ void ProcessStoreTileBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile void ProcessInvalidateTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData); void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers); void InitClearTilesTable(); +simdmask ComputeUserClipMask(uint8_t clipMask, float* pUserClipBuffer, simdscalar vI, simdscalar vJ); +void InitBackendFuncTables(); +void InitCPSFuncTables(); enum SWR_BACKEND_FUNCS { @@ -47,13 +51,160 @@ enum SWR_BACKEND_FUNCS SWR_BACKEND_MSAA_SAMPLE_RATE, SWR_BACKEND_FUNCS_MAX, }; -void InitBackendFuncTables(); -extern PFN_BACKEND_FUNC gBackendNullPs[SWR_MULTISAMPLE_TYPE_MAX]; -extern PFN_BACKEND_FUNC gBackendSingleSample[2][2]; -extern PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_MAX][SWR_MSAA_SAMPLE_PATTERN_MAX][SWR_INPUT_COVERAGE_MAX][2][2]; -extern PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_MAX][SWR_INPUT_COVERAGE_MAX][2]; -extern PFN_OUTPUT_MERGER gBackendOutputMergerTable[SWR_NUM_RENDERTARGETS+1][SWR_MULTISAMPLE_TYPE_MAX]; -extern PFN_CALC_PIXEL_BARYCENTRICS gPixelBarycentricTable[2]; -extern PFN_CALC_SAMPLE_BARYCENTRICS gSampleBarycentricTable[2]; -extern PFN_CALC_CENTROID_BARYCENTRICS gCentroidBarycentricTable[SWR_MULTISAMPLE_TYPE_MAX][2][2][2]; +#if KNOB_SIMD_WIDTH == 8 +extern const __m256 vCenterOffsetsX; +extern const __m256 vCenterOffsetsY; +extern const __m256 vULOffsetsX; +extern const __m256 vULOffsetsY; +#define MASK 0xff +#endif + +template +INLINE void generateInputCoverage(const uint64_t *const coverageMask, uint32_t (&inputMask)[KNOB_SIMD_WIDTH], const uint32_t sampleMask) +{ + + // will need to update for avx512 + assert(KNOB_SIMD_WIDTH == 8); + + __m256i mask[2]; + __m256i sampleCoverage[2]; + if(bIsStandardPattern) + { + __m256i src = _mm256_set1_epi32(0); + __m256i index0 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0), index1; + + if(MultisampleTraits::numSamples == 1) + { + mask[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, -1); + } + else if(MultisampleTraits::numSamples == 2) + { + mask[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1); + } + else if(MultisampleTraits::numSamples == 4) + { + mask[0] = _mm256_set_epi32(0, 0, 0, 0, -1, -1, -1, -1); + } + else if(MultisampleTraits::numSamples == 8) + { + mask[0] = _mm256_set1_epi32(-1); + } + else if(MultisampleTraits::numSamples == 16) + { + mask[0] = _mm256_set1_epi32(-1); + mask[1] = _mm256_set1_epi32(-1); + index1 = _mm256_set_epi32(15, 14, 13, 12, 11, 10, 9, 8); + } + + // gather coverage for samples 0-7 + sampleCoverage[0] = _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src), (const float*)coverageMask, index0, _mm256_castsi256_ps(mask[0]), 8)); + if(MultisampleTraits::numSamples > 8) + { + // gather coverage for samples 8-15 + sampleCoverage[1] = _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src), (const float*)coverageMask, index1, _mm256_castsi256_ps(mask[1]), 8)); + } + } + else + { + // center coverage is the same for all samples; just broadcast to the sample slots + uint32_t centerCoverage = ((uint32_t)(*coverageMask) & MASK); + if(MultisampleTraits::numSamples == 1) + { + sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, centerCoverage); + } + else if(MultisampleTraits::numSamples == 2) + { + sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, centerCoverage, centerCoverage); + } + else if(MultisampleTraits::numSamples == 4) + { + sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, centerCoverage, centerCoverage, centerCoverage, centerCoverage); + } + else if(MultisampleTraits::numSamples == 8) + { + sampleCoverage[0] = _mm256_set1_epi32(centerCoverage); + } + else if(MultisampleTraits::numSamples == 16) + { + sampleCoverage[0] = _mm256_set1_epi32(centerCoverage); + sampleCoverage[1] = _mm256_set1_epi32(centerCoverage); + } + } + + mask[0] = _mm256_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0xC, 0x8, 0x4, 0x0, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0xC, 0x8, 0x4, 0x0); + // pull out the the 8bit 4x2 coverage for samples 0-7 into the lower 32 bits of each 128bit lane + __m256i packedCoverage0 = _simd_shuffle_epi8(sampleCoverage[0], mask[0]); + + __m256i packedCoverage1; + if(MultisampleTraits::numSamples > 8) + { + // pull out the the 8bit 4x2 coverage for samples 8-15 into the lower 32 bits of each 128bit lane + packedCoverage1 = _simd_shuffle_epi8(sampleCoverage[1], mask[0]); + } + +#if (KNOB_ARCH == KNOB_ARCH_AVX) + // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane + __m256i hiToLow = _mm256_permute2f128_si256(packedCoverage0, packedCoverage0, 0x83); + __m256 shufRes = _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow), _mm256_castsi256_ps(hiToLow), _MM_SHUFFLE(1, 1, 0, 1)); + packedCoverage0 = _mm256_castps_si256(_mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0), shufRes, 0xFE)); + + __m256i packedSampleCoverage; + if(MultisampleTraits::numSamples > 8) + { + // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane + hiToLow = _mm256_permute2f128_si256(packedCoverage1, packedCoverage1, 0x83); + shufRes = _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow), _mm256_castsi256_ps(hiToLow), _MM_SHUFFLE(1, 1, 0, 1)); + shufRes = _mm256_blend_ps(_mm256_castsi256_ps(packedCoverage1), shufRes, 0xFE); + packedCoverage1 = _mm256_castps_si256(_mm256_castpd_ps(_mm256_shuffle_pd(_mm256_castps_pd(shufRes), _mm256_castps_pd(shufRes), 0x01))); + packedSampleCoverage = _mm256_castps_si256(_mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0), _mm256_castsi256_ps(packedCoverage1), 0xFC)); + } + else + { + packedSampleCoverage = packedCoverage0; + } +#else + __m256i permMask = _mm256_set_epi32(0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x4, 0x0); + // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane + packedCoverage0 = _mm256_permutevar8x32_epi32(packedCoverage0, permMask); + + __m256i packedSampleCoverage; + if(MultisampleTraits::numSamples > 8) + { + permMask = _mm256_set_epi32(0x7, 0x7, 0x7, 0x7, 0x4, 0x0, 0x7, 0x7); + // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane + packedCoverage1 = _mm256_permutevar8x32_epi32(packedCoverage1, permMask); + + // blend coverage masks for samples 0-7 and samples 8-15 into single 128 bit lane + packedSampleCoverage = _mm256_blend_epi32(packedCoverage0, packedCoverage1, 0x0C); + } + else + { + packedSampleCoverage = packedCoverage0; + } +#endif + + for(int32_t i = KNOB_SIMD_WIDTH - 1; i >= 0; i--) + { + // convert packed sample coverage masks into single coverage masks for all samples for each pixel in the 4x2 + inputMask[i] = _simd_movemask_epi8(packedSampleCoverage); + + if(!bForcedSampleCount) + { + // input coverage has to be anded with sample mask if MSAA isn't forced on + inputMask[i] &= sampleMask; + } + + // shift to the next pixel in the 4x2 + packedSampleCoverage = _simd_slli_epi32(packedSampleCoverage, 1); + } +} + +template +INLINE void generateInputCoverage(const uint64_t *const coverageMask, __m256 &inputCoverage, const uint32_t sampleMask) +{ + uint32_t inputMask[KNOB_SIMD_WIDTH]; + generateInputCoverage(coverageMask, inputMask, sampleMask); + inputCoverage = _simd_castsi_ps(_mm256_set_epi32(inputMask[7], inputMask[6], inputMask[5], inputMask[4], inputMask[3], inputMask[2], inputMask[1], inputMask[0])); +} diff --git a/src/gallium/drivers/swr/rasterizer/core/context.h b/src/gallium/drivers/swr/rasterizer/core/context.h index d75d975..523e7ac 100644 --- a/src/gallium/drivers/swr/rasterizer/core/context.h +++ b/src/gallium/drivers/swr/rasterizer/core/context.h @@ -83,6 +83,7 @@ struct SWR_TRIANGLE_DESC float *pUserClipBuffer; uint64_t coverageMask[SWR_MAX_NUM_MULTISAMPLES]; + uint64_t anyCoveredSamples; TRI_FLAGS triFlags; }; diff --git a/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp b/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp index 07c9eed..52fb7c8 100644 --- a/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp @@ -752,7 +752,7 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, for (uint32_t tileX = tX; tileX <= maxX; ++tileX) { - uint64_t anyCoveredSamples = 0; + triDesc.anyCoveredSamples = 0; // is the corner of the edge outside of the raster tile? (vEdge < 0) int mask0, mask1, mask2; @@ -785,7 +785,7 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, triDesc.coverageMask[sampleNum] = 0xffffffffffffffffULL; if ((mask0 & mask1 & mask2) == 0xf) { - anyCoveredSamples = triDesc.coverageMask[sampleNum]; + triDesc.anyCoveredSamples = triDesc.coverageMask[sampleNum]; // trivial accept, all 4 corners of all 3 edges are negative // i.e. raster tile completely inside triangle RDTSC_EVENT(BETrivialAccept, 1, 0); @@ -840,7 +840,7 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, } RDTSC_STOP(BERasterizePartial, 0, 0); - anyCoveredSamples |= triDesc.coverageMask[sampleNum]; + triDesc.anyCoveredSamples |= triDesc.coverageMask[sampleNum]; } } else @@ -861,7 +861,7 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, } else #endif - if(anyCoveredSamples) + if(triDesc.anyCoveredSamples) { RDTSC_START(BEPixelBackend); backendFuncs.pfnBackend(pDC, workerId, tileX << KNOB_TILE_X_DIM_SHIFT, tileY << KNOB_TILE_Y_DIM_SHIFT, triDesc, renderBuffers); diff --git a/src/gallium/drivers/swr/rasterizer/core/state.h b/src/gallium/drivers/swr/rasterizer/core/state.h index 2758555..a71eb6d 100644 --- a/src/gallium/drivers/swr/rasterizer/core/state.h +++ b/src/gallium/drivers/swr/rasterizer/core/state.h @@ -307,6 +307,8 @@ struct PixelPositions simdscalar centroid; }; +#define SWR_MAX_NUM_MULTISAMPLES 16 + ////////////////////////////////////////////////////////////////////////// /// SWR_PS_CONTEXT /// @brief Input to pixel shader. @@ -338,6 +340,7 @@ struct SWR_PS_CONTEXT uint32_t frontFace; // IN: front- 1, back- 0 uint32_t primID; // IN: primitive ID uint32_t sampleIndex; // IN: sampleIndex + }; ////////////////////////////////////////////////////////////////////////// @@ -748,7 +751,6 @@ struct SWR_RENDER_TARGET_BLEND_STATE }; static_assert(sizeof(SWR_RENDER_TARGET_BLEND_STATE) == 1, "Invalid SWR_RENDER_TARGET_BLEND_STATE size"); -#define SWR_MAX_NUM_MULTISAMPLES 16 enum SWR_MULTISAMPLE_COUNT { SWR_MULTISAMPLE_1X = 0, @@ -786,6 +788,7 @@ typedef void(__cdecl *PFN_GS_FUNC)(HANDLE hPrivateData, SWR_GS_CONTEXT* pGsConte typedef void(__cdecl *PFN_CS_FUNC)(HANDLE hPrivateData, SWR_CS_CONTEXT* pCsContext); typedef void(__cdecl *PFN_SO_FUNC)(SWR_STREAMOUT_CONTEXT& soContext); typedef void(__cdecl *PFN_PIXEL_KERNEL)(HANDLE hPrivateData, SWR_PS_CONTEXT *pContext); +typedef void(__cdecl *PFN_CPIXEL_KERNEL)(HANDLE hPrivateData, SWR_PS_CONTEXT *pContext); typedef void(__cdecl *PFN_BLEND_JIT_FUNC)(const SWR_BLEND_STATE*, simdvector&, simdvector&, uint32_t, BYTE*, simdvector&, simdscalari*, simdscalari*); ////////////////////////////////////////////////////////////////////////// @@ -941,6 +944,7 @@ struct SWR_BACKEND_STATE uint8_t numComponents[KNOB_NUM_ATTRIBUTES]; }; + union SWR_DEPTH_STENCIL_STATE { struct @@ -980,7 +984,6 @@ enum SWR_SHADING_RATE { SWR_SHADING_RATE_PIXEL, SWR_SHADING_RATE_SAMPLE, - SWR_SHADING_RATE_COARSE, SWR_SHADING_RATE_MAX, }; @@ -1024,4 +1027,5 @@ struct SWR_PS_STATE uint32_t barycentricsMask : 3; // which type(s) of barycentric coords does the PS interpolate attributes with uint32_t usesUAV : 1; // pixel shader accesses UAV uint32_t forceEarlyZ : 1; // force execution of early depth/stencil test + }; diff --git a/src/gallium/drivers/swr/rasterizer/memory/tilingtraits.h b/src/gallium/drivers/swr/rasterizer/memory/tilingtraits.h index 50f8e57..381ac89 100644 --- a/src/gallium/drivers/swr/rasterizer/memory/tilingtraits.h +++ b/src/gallium/drivers/swr/rasterizer/memory/tilingtraits.h @@ -28,6 +28,7 @@ #pragma once #include "core/state.h" +#include "common/simdintrin.h" template struct TilingTraits @@ -130,63 +131,6 @@ template struct TilingTraits static UINT GetPdepY() { return 0x1ea; } }; -INLINE -UINT pdep_u32(UINT a, UINT mask) -{ -#if KNOB_ARCH==KNOB_ARCH_AVX2 - return _pdep_u32(a, mask); -#else - UINT result = 0; - - // copied from http://wm.ite.pl/articles/pdep-soft-emu.html - // using bsf instead of funky loop - DWORD maskIndex; - while (_BitScanForward(&maskIndex, mask)) - { - // 1. isolate lowest set bit of mask - const UINT lowest = 1 << maskIndex; - - // 2. populate LSB from src - const UINT LSB = (UINT)((int)(a << 31) >> 31); - - // 3. copy bit from mask - result |= LSB & lowest; - - // 4. clear lowest bit - mask &= ~lowest; - - // 5. prepare for next iteration - a >>= 1; - } - - return result; -#endif -} - -INLINE -UINT pext_u32(UINT a, UINT mask) -{ -#if KNOB_ARCH==KNOB_ARCH_AVX2 - return _pext_u32(a, mask); -#else - UINT result = 0; - DWORD maskIndex; - uint32_t currentBit = 0; - while (_BitScanForward(&maskIndex, mask)) - { - // 1. isolate lowest set bit of mask - const UINT lowest = 1 << maskIndex; - - // 2. copy bit from mask - result |= ((a & lowest) > 0) << currentBit++; - - // 3. clear lowest bit - mask &= ~lowest; - } - return result; -#endif -} - ////////////////////////////////////////////////////////////////////////// /// @brief Computes the tileID for 2D tiled surfaces /// @param pitch - surface pitch in bytes -- 2.7.4