From 5dd9ad1570056e2e93689bae16ea78fc8b714448 Mon Sep 17 00:00:00 2001 From: Jan Zielinski Date: Wed, 24 Jul 2019 12:25:27 +0200 Subject: [PATCH] swr/rasterizer: Better implementation of scatter Added support for avx512 scatter instruction. Non-avx512 will now call into a C function to do the scatter emulation. This has better jit compile performance than the previous approach of jitting scalar loops. Reviewed-by: Bruce Cherniak --- src/gallium/drivers/swr/Makefile.sources | 1 + src/gallium/drivers/swr/meson.build | 1 + .../swr/rasterizer/codegen/gen_llvm_ir_macros.py | 1 + .../swr/rasterizer/jitter/builder_gfx_mem.cpp | 12 +- .../drivers/swr/rasterizer/jitter/builder_mem.cpp | 10 + .../rasterizer/jitter/functionpasses/lower_x86.cpp | 230 ++++++++++++++------- .../swr/rasterizer/jitter/shader_lib/Scatter.cpp | 49 +++++ 7 files changed, 225 insertions(+), 79 deletions(-) create mode 100644 src/gallium/drivers/swr/rasterizer/jitter/shader_lib/Scatter.cpp diff --git a/src/gallium/drivers/swr/Makefile.sources b/src/gallium/drivers/swr/Makefile.sources index b298356..720bd59 100644 --- a/src/gallium/drivers/swr/Makefile.sources +++ b/src/gallium/drivers/swr/Makefile.sources @@ -156,6 +156,7 @@ JITTER_CXX_SOURCES := \ rasterizer/jitter/streamout_jit.cpp \ rasterizer/jitter/streamout_jit.h \ rasterizer/jitter/shader_lib/DebugOutput.cpp \ + rasterizer/jitter/shader_lib/Scatter.cpp \ rasterizer/jitter/functionpasses/passes.h \ rasterizer/jitter/functionpasses/lower_x86.cpp diff --git a/src/gallium/drivers/swr/meson.build b/src/gallium/drivers/swr/meson.build index 9e07724..6587475 100644 --- a/src/gallium/drivers/swr/meson.build +++ b/src/gallium/drivers/swr/meson.build @@ -82,6 +82,7 @@ files_swr_mesa = files( 'rasterizer/jitter/streamout_jit.cpp', 'rasterizer/jitter/streamout_jit.h', 'rasterizer/jitter/shader_lib/DebugOutput.cpp', + 'rasterizer/jitter/shader_lib/Scatter.cpp', 'rasterizer/jitter/functionpasses/lower_x86.cpp', 'rasterizer/memory/SurfaceState.h' ) diff --git a/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py b/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py index ac685ad..0cd7ae7 100644 --- a/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py +++ b/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py @@ -45,6 +45,7 @@ intrinsics = [ ['VGATHERPD', ['src', 'pBase', 'indices', 'mask', 'scale'], 'src'], ['VGATHERPS', ['src', 'pBase', 'indices', 'mask', 'scale'], 'src'], ['VGATHERDD', ['src', 'pBase', 'indices', 'mask', 'scale'], 'src'], + ['VSCATTERPS', ['pBase', 'mask', 'indices', 'src', 'scale'], 'src'], ['VRCPPS', ['a'], 'a'], ['VROUND', ['a', 'rounding'], 'a'], ['BEXTR_32', ['src', 'control'], 'src'], diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_gfx_mem.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder_gfx_mem.cpp index b3d0b70..adf8924 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/builder_gfx_mem.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_gfx_mem.cpp @@ -237,7 +237,8 @@ namespace SwrJit return Builder::MASKED_LOAD(Ptr, Align, Mask, PassThru, Name, Ty, usage); } - StoreInst* BuilderGfxMem::STORE(Value *Val, Value *Ptr, bool isVolatile, Type* Ty, JIT_MEM_CLIENT usage) + StoreInst* + BuilderGfxMem::STORE(Value* Val, Value* Ptr, bool isVolatile, Type* Ty, JIT_MEM_CLIENT usage) { AssertGFXMemoryParams(Ptr, usage); @@ -245,7 +246,11 @@ namespace SwrJit return Builder::STORE(Val, Ptr, isVolatile, Ty, usage); } - StoreInst* BuilderGfxMem::STORE(Value* Val, Value* BasePtr, const std::initializer_list& offset, Type* Ty, JIT_MEM_CLIENT usage) + StoreInst* BuilderGfxMem::STORE(Value* Val, + Value* BasePtr, + const std::initializer_list& offset, + Type* Ty, + JIT_MEM_CLIENT usage) { AssertGFXMemoryParams(BasePtr, usage); @@ -253,7 +258,8 @@ namespace SwrJit return Builder::STORE(Val, BasePtr, offset, Ty, usage); } - CallInst* BuilderGfxMem::MASKED_STORE(Value *Val, Value *Ptr, unsigned Align, Value *Mask, Type* Ty, JIT_MEM_CLIENT usage) + CallInst* BuilderGfxMem::MASKED_STORE( + Value* Val, Value* Ptr, unsigned Align, Value* Mask, Type* Ty, JIT_MEM_CLIENT usage) { AssertGFXMemoryParams(Ptr, usage); diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp index 90a0e03..267c544 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp @@ -647,6 +647,10 @@ namespace SwrJit { AssertMemoryUsageParams(pDst, usage); + SWR_ASSERT(vSrc->getType()->getVectorElementType()->isFloatTy()); + VSCATTERPS(pDst, vMask, vOffsets, vSrc, C(1)); + return; + /* Scatter algorithm while(Index = BitScanForward(mask)) @@ -657,6 +661,10 @@ namespace SwrJit */ + /* + + // Reference implementation kept around for reference + BasicBlock* pCurBB = IRB()->GetInsertBlock(); Function* pFunc = pCurBB->getParent(); Type* pSrcTy = vSrc->getType()->getVectorElementType(); @@ -744,5 +752,7 @@ namespace SwrJit // Move builder to beginning of post loop IRB()->SetInsertPoint(pPostLoop, pPostLoop->begin()); + + */ } } // namespace SwrJit diff --git a/src/gallium/drivers/swr/rasterizer/jitter/functionpasses/lower_x86.cpp b/src/gallium/drivers/swr/rasterizer/jitter/functionpasses/lower_x86.cpp index c34959d..2196aaf 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/functionpasses/lower_x86.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/functionpasses/lower_x86.cpp @@ -32,8 +32,12 @@ #include "passes.h" #include "JitManager.h" +#include "common/simdlib.hpp" + #include +extern "C" void ScatterPS_256(uint8_t*, SIMD256::Integer, SIMD256::Float, uint8_t, uint32_t); + namespace llvm { // foward declare the initializer @@ -88,6 +92,8 @@ namespace SwrJit Instruction* VGATHER_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst); Instruction* + VSCATTER_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst); + Instruction* VROUND_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst); Instruction* VHSUB_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst); @@ -102,88 +108,61 @@ namespace SwrJit static Intrinsic::ID DOUBLE = (Intrinsic::ID)-1; + // clang-format off static std::map intrinsicMap2[] = { - // 256 wide 512 wide + // 256 wide 512 wide { // AVX - {"meta.intrinsic.VRCPPS", {{Intrinsic::x86_avx_rcp_ps_256, DOUBLE}, NO_EMU}}, - {"meta.intrinsic.VPERMPS", - {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VPERM_EMU}}, - {"meta.intrinsic.VPERMD", - {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VPERM_EMU}}, - {"meta.intrinsic.VGATHERPD", - {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}}, - {"meta.intrinsic.VGATHERPS", - {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}}, - {"meta.intrinsic.VGATHERDD", - {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}}, - {"meta.intrinsic.VCVTPD2PS", - {{Intrinsic::x86_avx_cvt_pd2_ps_256, Intrinsic::not_intrinsic}, NO_EMU}}, - {"meta.intrinsic.VCVTPH2PS", - {{Intrinsic::x86_vcvtph2ps_256, Intrinsic::not_intrinsic}, NO_EMU}}, - {"meta.intrinsic.VROUND", {{Intrinsic::x86_avx_round_ps_256, DOUBLE}, NO_EMU}}, - {"meta.intrinsic.VHSUBPS", {{Intrinsic::x86_avx_hsub_ps_256, DOUBLE}, NO_EMU}}, + {"meta.intrinsic.VRCPPS", {{Intrinsic::x86_avx_rcp_ps_256, DOUBLE}, NO_EMU}}, + {"meta.intrinsic.VPERMPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VPERM_EMU}}, + {"meta.intrinsic.VPERMD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VPERM_EMU}}, + {"meta.intrinsic.VGATHERPD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}}, + {"meta.intrinsic.VGATHERPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}}, + {"meta.intrinsic.VGATHERDD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}}, + {"meta.intrinsic.VSCATTERPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VSCATTER_EMU}}, + {"meta.intrinsic.VCVTPD2PS", {{Intrinsic::x86_avx_cvt_pd2_ps_256, Intrinsic::not_intrinsic}, NO_EMU}}, + {"meta.intrinsic.VCVTPH2PS", {{Intrinsic::x86_vcvtph2ps_256, Intrinsic::not_intrinsic}, NO_EMU}}, + {"meta.intrinsic.VROUND", {{Intrinsic::x86_avx_round_ps_256, DOUBLE}, NO_EMU}}, + {"meta.intrinsic.VHSUBPS", {{Intrinsic::x86_avx_hsub_ps_256, DOUBLE}, NO_EMU}}, }, { // AVX2 - {"meta.intrinsic.VRCPPS", {{Intrinsic::x86_avx_rcp_ps_256, DOUBLE}, NO_EMU}}, - {"meta.intrinsic.VPERMPS", - {{Intrinsic::x86_avx2_permps, Intrinsic::not_intrinsic}, VPERM_EMU}}, - {"meta.intrinsic.VPERMD", - {{Intrinsic::x86_avx2_permd, Intrinsic::not_intrinsic}, VPERM_EMU}}, - {"meta.intrinsic.VGATHERPD", - {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}}, - {"meta.intrinsic.VGATHERPS", - {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}}, - {"meta.intrinsic.VGATHERDD", - {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}}, - {"meta.intrinsic.VCVTPD2PS", {{Intrinsic::x86_avx_cvt_pd2_ps_256, DOUBLE}, NO_EMU}}, - {"meta.intrinsic.VCVTPH2PS", - {{Intrinsic::x86_vcvtph2ps_256, Intrinsic::not_intrinsic}, NO_EMU}}, - {"meta.intrinsic.VROUND", {{Intrinsic::x86_avx_round_ps_256, DOUBLE}, NO_EMU}}, - {"meta.intrinsic.VHSUBPS", {{Intrinsic::x86_avx_hsub_ps_256, DOUBLE}, NO_EMU}}, + {"meta.intrinsic.VRCPPS", {{Intrinsic::x86_avx_rcp_ps_256, DOUBLE}, NO_EMU}}, + {"meta.intrinsic.VPERMPS", {{Intrinsic::x86_avx2_permps, Intrinsic::not_intrinsic}, VPERM_EMU}}, + {"meta.intrinsic.VPERMD", {{Intrinsic::x86_avx2_permd, Intrinsic::not_intrinsic}, VPERM_EMU}}, + {"meta.intrinsic.VGATHERPD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}}, + {"meta.intrinsic.VGATHERPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}}, + {"meta.intrinsic.VGATHERDD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}}, + {"meta.intrinsic.VSCATTERPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VSCATTER_EMU}}, + {"meta.intrinsic.VCVTPD2PS", {{Intrinsic::x86_avx_cvt_pd2_ps_256, DOUBLE}, NO_EMU}}, + {"meta.intrinsic.VCVTPH2PS", {{Intrinsic::x86_vcvtph2ps_256, Intrinsic::not_intrinsic}, NO_EMU}}, + {"meta.intrinsic.VROUND", {{Intrinsic::x86_avx_round_ps_256, DOUBLE}, NO_EMU}}, + {"meta.intrinsic.VHSUBPS", {{Intrinsic::x86_avx_hsub_ps_256, DOUBLE}, NO_EMU}}, }, { // AVX512 - {"meta.intrinsic.VRCPPS", - {{Intrinsic::x86_avx512_rcp14_ps_256, Intrinsic::x86_avx512_rcp14_ps_512}, NO_EMU}}, + {"meta.intrinsic.VRCPPS", {{Intrinsic::x86_avx512_rcp14_ps_256, Intrinsic::x86_avx512_rcp14_ps_512}, NO_EMU}}, #if LLVM_VERSION_MAJOR < 7 - {"meta.intrinsic.VPERMPS", - {{Intrinsic::x86_avx512_mask_permvar_sf_256, - Intrinsic::x86_avx512_mask_permvar_sf_512}, - NO_EMU}}, - {"meta.intrinsic.VPERMD", - {{Intrinsic::x86_avx512_mask_permvar_si_256, - Intrinsic::x86_avx512_mask_permvar_si_512}, - NO_EMU}}, + {"meta.intrinsic.VPERMPS", {{Intrinsic::x86_avx512_mask_permvar_sf_256, Intrinsic::x86_avx512_mask_permvar_sf_512}, NO_EMU}}, + {"meta.intrinsic.VPERMD", {{Intrinsic::x86_avx512_mask_permvar_si_256, Intrinsic::x86_avx512_mask_permvar_si_512}, NO_EMU}}, #else - {"meta.intrinsic.VPERMPS", - {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VPERM_EMU}}, - {"meta.intrinsic.VPERMD", - {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VPERM_EMU}}, + {"meta.intrinsic.VPERMPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VPERM_EMU}}, + {"meta.intrinsic.VPERMD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VPERM_EMU}}, #endif - {"meta.intrinsic.VGATHERPD", - {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}}, - {"meta.intrinsic.VGATHERPS", - {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}}, - {"meta.intrinsic.VGATHERDD", - {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}}, + {"meta.intrinsic.VGATHERPD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}}, + {"meta.intrinsic.VGATHERPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}}, + {"meta.intrinsic.VGATHERDD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}}, + {"meta.intrinsic.VSCATTERPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VSCATTER_EMU}}, #if LLVM_VERSION_MAJOR < 7 - {"meta.intrinsic.VCVTPD2PS", - {{Intrinsic::x86_avx512_mask_cvtpd2ps_256, Intrinsic::x86_avx512_mask_cvtpd2ps_512}, - NO_EMU}}, + {"meta.intrinsic.VCVTPD2PS", {{Intrinsic::x86_avx512_mask_cvtpd2ps_256, Intrinsic::x86_avx512_mask_cvtpd2ps_512}, NO_EMU}}, #else - {"meta.intrinsic.VCVTPD2PS", - {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VCONVERT_EMU}}, + {"meta.intrinsic.VCVTPD2PS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VCONVERT_EMU}}, #endif - {"meta.intrinsic.VCVTPH2PS", - {{Intrinsic::x86_avx512_mask_vcvtph2ps_256, Intrinsic::x86_avx512_mask_vcvtph2ps_512}, - NO_EMU}}, - {"meta.intrinsic.VROUND", - {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VROUND_EMU}}, - {"meta.intrinsic.VHSUBPS", - {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VHSUB_EMU}}, + {"meta.intrinsic.VCVTPH2PS", {{Intrinsic::x86_avx512_mask_vcvtph2ps_256, Intrinsic::x86_avx512_mask_vcvtph2ps_512}, NO_EMU}}, + {"meta.intrinsic.VROUND", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VROUND_EMU}}, + {"meta.intrinsic.VHSUBPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VHSUB_EMU}}, }}; + // clang-format on struct LowerX86 : public FunctionPass { @@ -209,6 +188,27 @@ namespace SwrJit SWR_ASSERT(false, "Unsupported AVX architecture."); mTarget = AVX; } + + // Setup scatter function for 256 wide + uint32_t curWidth = B->mVWidth; + B->SetTargetWidth(8); + std::vector args = { + B->mInt8PtrTy, // pBase + B->mSimdInt32Ty, // vIndices + B->mSimdFP32Ty, // vSrc + B->mInt8Ty, // mask + B->mInt32Ty // scale + }; + + FunctionType* pfnScatterTy = FunctionType::get(B->mVoidTy, args, false); + mPfnScatter256 = cast( + B->JM()->mpCurrentModule->getOrInsertFunction("ScatterPS_256", pfnScatterTy)); + if (sys::DynamicLibrary::SearchForAddressOfSymbol("ScatterPS_256") == nullptr) + { + sys::DynamicLibrary::AddSymbol("ScatterPS_256", (void*)&ScatterPS_256); + } + + B->SetTargetWidth(curWidth); } // Try to decipher the vector type of the instruction. This does not work properly @@ -392,23 +392,39 @@ namespace SwrJit virtual bool runOnFunction(Function& F) { std::vector toRemove; + std::vector bbs; + + // Make temp copy of the basic blocks and instructions, as the intrinsic + // replacement code might invalidate the iterators + for (auto& b : F.getBasicBlockList()) + { + bbs.push_back(&b); + } - for (auto& BB : F.getBasicBlockList()) + for (auto* BB : bbs) { - for (auto& I : BB.getInstList()) + std::vector insts; + for (auto& i : BB->getInstList()) { - if (CallInst* pCallInst = dyn_cast(&I)) + insts.push_back(&i); + } + + for (auto* I : insts) + { + if (CallInst* pCallInst = dyn_cast(I)) { Function* pFunc = pCallInst->getCalledFunction(); if (pFunc) { if (pFunc->getName().startswith("meta.intrinsic")) { - B->IRB()->SetInsertPoint(&I); + B->IRB()->SetInsertPoint(I); Instruction* pReplace = ProcessIntrinsic(pCallInst); - SWR_ASSERT(pReplace); toRemove.push_back(pCallInst); - pCallInst->replaceAllUsesWith(pReplace); + if (pReplace) + { + pCallInst->replaceAllUsesWith(pReplace); + } } } } @@ -428,10 +444,9 @@ namespace SwrJit virtual void getAnalysisUsage(AnalysisUsage& AU) const {} JitManager* JM() { return B->JM(); } - - Builder* B; - - TargetArch mTarget; + Builder* B; + TargetArch mTarget; + Function* mPfnScatter256; static char ID; ///< Needed by LLVM to generate ID for FunctionPass. }; @@ -639,6 +654,69 @@ namespace SwrJit return cast(v32Gather); } + Instruction* + VSCATTER_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst) + { + Builder* B = pThis->B; + auto pBase = pCallInst->getArgOperand(0); + auto vi1Mask = pCallInst->getArgOperand(1); + auto vi32Indices = pCallInst->getArgOperand(2); + auto v32Src = pCallInst->getArgOperand(3); + auto i32Scale = pCallInst->getArgOperand(4); + + if (arch != AVX512) + { + // Call into C function to do the scatter. This has significantly better compile perf + // compared to jitting scatter loops for every scatter + if (width == W256) + { + auto mask = B->BITCAST(vi1Mask, B->mInt8Ty); + B->CALL(pThis->mPfnScatter256, {pBase, vi32Indices, v32Src, mask, i32Scale}); + } + else + { + // Need to break up 512 wide scatter to two 256 wide + auto maskLo = B->VSHUFFLE(vi1Mask, vi1Mask, B->C({0, 1, 2, 3, 4, 5, 6, 7})); + auto indicesLo = + B->VSHUFFLE(vi32Indices, vi32Indices, B->C({0, 1, 2, 3, 4, 5, 6, 7})); + auto srcLo = B->VSHUFFLE(v32Src, v32Src, B->C({0, 1, 2, 3, 4, 5, 6, 7})); + + auto mask = B->BITCAST(maskLo, B->mInt8Ty); + B->CALL(pThis->mPfnScatter256, {pBase, indicesLo, srcLo, mask, i32Scale}); + + auto maskHi = B->VSHUFFLE(vi1Mask, vi1Mask, B->C({8, 9, 10, 11, 12, 13, 14, 15})); + auto indicesHi = + B->VSHUFFLE(vi32Indices, vi32Indices, B->C({8, 9, 10, 11, 12, 13, 14, 15})); + auto srcHi = B->VSHUFFLE(v32Src, v32Src, B->C({8, 9, 10, 11, 12, 13, 14, 15})); + + mask = B->BITCAST(maskHi, B->mInt8Ty); + B->CALL(pThis->mPfnScatter256, {pBase, indicesHi, srcHi, mask, i32Scale}); + } + return nullptr; + } + + Value* iMask; + Function* pX86IntrinFunc; + if (width == W256) + { + // No direct intrinsic supported in llvm to scatter 8 elem with 32bit indices, but we + // can use the scatter of 8 elements with 64bit indices + pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, + Intrinsic::x86_avx512_scatter_qps_512); + + auto vi32IndicesExt = B->Z_EXT(vi32Indices, B->mSimdInt64Ty); + iMask = B->BITCAST(vi1Mask, B->mInt8Ty); + B->CALL(pX86IntrinFunc, {pBase, iMask, vi32IndicesExt, v32Src, i32Scale}); + } + else if (width == W512) + { + pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, + Intrinsic::x86_avx512_scatter_dps_512); + iMask = B->BITCAST(vi1Mask, B->mInt16Ty); + B->CALL(pX86IntrinFunc, {pBase, iMask, vi32Indices, v32Src, i32Scale}); + } + return nullptr; + } // No support for vroundps in avx512 (it is available in kncni), so emulate with avx // instructions diff --git a/src/gallium/drivers/swr/rasterizer/jitter/shader_lib/Scatter.cpp b/src/gallium/drivers/swr/rasterizer/jitter/shader_lib/Scatter.cpp new file mode 100644 index 0000000..de81154 --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/jitter/shader_lib/Scatter.cpp @@ -0,0 +1,49 @@ +/**************************************************************************** + * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * @file Scatter.cpp + * + * @brief Shader support library implementation for scatter emulation + * + * Notes: + * + ******************************************************************************/ +#include +#include "common/os.h" +#include "common/simdlib.hpp" + +extern "C" void ScatterPS_256(uint8_t* pBase, SIMD256::Integer vIndices, SIMD256::Float vSrc, uint8_t mask, uint32_t scale) +{ + OSALIGN(float, 32) src[8]; + OSALIGN(uint32_t, 32) indices[8]; + + SIMD256::store_ps(src, vSrc); + SIMD256::store_si((SIMD256::Integer*)indices, vIndices); + + DWORD index; + while (_BitScanForward(&index, mask)) + { + mask &= ~(1 << index); + + *(float*)(pBase + indices[index] * scale) = src[index]; + } +} -- 2.7.4