From 670a99c233541b21420151f3c664dad712287457 Mon Sep 17 00:00:00 2001 From: George Kyriazis Date: Thu, 5 Apr 2018 17:51:02 -0500 Subject: [PATCH] swr/rast: add cvt instructions in x86 lowering pass Support generic VCVTPD2PS and VCVTPH2PS in x86 lowering pass. As part of this, the meta intrinsic table in gen_llvm_ir_macros.py no longer names an x86 intrinsic per entry; each entry's return type is now given either as one of its arguments or as an explicit type expression, and the dedicated 16-wide gather variants (VGATHERPS_16/VGATHERDD_16 and the GATHERPS_16/GATHERDD_16 builder wrappers) are removed. Reviewed-by: Bruce Cherniak --- .../swr/rasterizer/codegen/gen_llvm_ir_macros.py | 70 ++++++++++++---------- .../drivers/swr/rasterizer/jitter/builder_mem.cpp | 14 ----- .../drivers/swr/rasterizer/jitter/builder_mem.h | 3 - .../drivers/swr/rasterizer/jitter/fetch_jit.cpp | 6 +- .../rasterizer/jitter/functionpasses/lower_x86.cpp | 14 ++--- 5 files changed, 48 insertions(+), 59 deletions(-) diff --git a/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py b/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py index 2636e60..4a7d2e9 100644 --- a/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py +++ b/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py @@ -42,28 +42,26 @@ inst_aliases = { } intrinsics = [ - ['VGATHERPD', 'x86_avx2_gather_d_pd_256', ['src', 'pBase', 'indices', 'mask', 'scale'], 'mSimd4FP64Ty'], - ['VGATHERPS', 'x86_avx2_gather_d_ps_256', ['src', 'pBase', 'indices', 'mask', 'scale'], 'mSimdFP32Ty'], - ['VGATHERPS_16', 'x86_avx512_gather_dps_512', ['src', 'pBase', 'indices', 'mask', 'scale'], 'mSimd16FP32Ty'], - ['VGATHERDD', 'x86_avx2_gather_d_d_256', ['src', 'pBase', 'indices', 'mask', 'scale'], 'mSimdInt32Ty'], - ['VGATHERDD_16', 'x86_avx512_gather_dpi_512', ['src', 'pBase', 'indices', 'mask', 'scale'], 'mSimd16Int32Ty'], - ['VRCPPS', 'x86_avx_rcp_ps_256', ['a'], 'mSimdFP32Ty'], - ['VROUND', 'x86_avx_round_ps_256', ['a', 'rounding'], 'mSimdFP32Ty'], - ['BEXTR_32', 'x86_bmi_bextr_32', ['src', 'control'], 'mInt32Ty'], - ['VPSHUFB', 'x86_avx2_pshuf_b', ['a', 'b'], 'mSimd32Int8Ty'], - ['VPERMD', 'x86_avx2_permd', ['a', 'idx'], 'mSimdInt32Ty'], - ['VPERMPS', 'x86_avx2_permps', ['idx', 'a'], 'mSimdFP32Ty'], - ['VCVTPD2PS', 'x86_avx_cvt_pd2_ps_256', 
['a'], 'mSimdFP32Ty'], - ['VCVTPH2PS', 'x86_vcvtph2ps_256', ['a'], 'mSimdFP32Ty'], - ['VCVTPS2PH', 'x86_vcvtps2ph_256', ['a', 'round'], 'mSimdFP16Ty'], - ['VHSUBPS', 'x86_avx_hsub_ps_256', ['a', 'b'], 'mSimdFP32Ty'], - ['VPTESTC', 'x86_avx_ptestc_256', ['a', 'b'], 'mInt32Ty'], - ['VPTESTZ', 'x86_avx_ptestz_256', ['a', 'b'], 'mInt32Ty'], - ['VFMADDPS', 'x86_fma_vfmadd_ps_256', ['a', 'b', 'c'], 'mSimdFP32Ty'], - ['VMOVMSKPS', 'x86_avx_movmsk_ps_256', ['a'], 'mInt32Ty'], - ['VPHADDD', 'x86_avx2_phadd_d', ['a', 'b'], 'mSimdInt32Ty'], - ['PDEP32', 'x86_bmi_pdep_32', ['a', 'b'], 'mInt32Ty'], - ['RDTSC', 'x86_rdtsc', [], 'mInt64Ty'], + ['VGATHERPD', ['src', 'pBase', 'indices', 'mask', 'scale'], 'src'], + ['VGATHERPS', ['src', 'pBase', 'indices', 'mask', 'scale'], 'src'], + ['VGATHERDD', ['src', 'pBase', 'indices', 'mask', 'scale'], 'src'], + ['VRCPPS', ['a'], 'a'], + ['VROUND', ['a', 'rounding'], 'a'], + ['BEXTR_32', ['src', 'control'], 'src'], + ['VPSHUFB', ['a', 'b'], 'a'], + ['VPERMD', ['a', 'idx'], 'a'], + ['VPERMPS', ['idx', 'a'], 'a'], + ['VCVTPD2PS', ['a'], 'VectorType::get(mFP32Ty, a->getType()->getVectorNumElements())'], + ['VCVTPH2PS', ['a'], 'VectorType::get(mFP32Ty, a->getType()->getVectorNumElements())'], + ['VCVTPS2PH', ['a', 'round'], 'mSimdFP16Ty'], + ['VHSUBPS', ['a', 'b'], 'a'], + ['VPTESTC', ['a', 'b'], 'mInt32Ty'], + ['VPTESTZ', ['a', 'b'], 'mInt32Ty'], + ['VFMADDPS', ['a', 'b', 'c'], 'a'], + ['VMOVMSKPS', ['a'], 'mInt32Ty'], + ['VPHADDD', ['a', 'b'], 'a'], + ['PDEP32', ['a', 'b'], 'a'], + ['RDTSC', [], 'mInt64Ty'], ] llvm_intrinsics = [ @@ -231,19 +229,31 @@ def generate_meta_h(output_dir): functions = [] for inst in intrinsics: + name = inst[0] + args = inst[1] + ret = inst[2] + #print('Inst: %s, x86: %s numArgs: %d' % (inst[0], inst[1], len(inst[2]))) - if len(inst[2]) != 0: - declargs = 'Value* ' + ', Value* '.join(inst[2]) - decl = 'Value* %s(%s, const llvm::Twine& name = "")' % (inst[0], declargs) + if len(args) != 0: + declargs = 'Value* ' + ', 
Value* '.join(args) + decl = 'Value* %s(%s, const llvm::Twine& name = "")' % (name, declargs) else: - decl = 'Value* %s(const llvm::Twine& name = "")' % (inst[0]) + decl = 'Value* %s(const llvm::Twine& name = "")' % (name) + + # determine the return type of the intrinsic. It can either be: + # - type of one of the input arguments + # - snippet of code to set the return type + + if ret in args: + returnTy = ret + '->getType()' + else: + returnTy = ret functions.append({ 'decl' : decl, - 'name' : inst[0], - 'intrin' : inst[1], - 'args' : inst[2], - 'returnType': inst[3] + 'name' : name, + 'args' : args, + 'returnType': returnTy }) MakoTemplateWriter.to_file( diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp index eccf0ad..c791278 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp @@ -176,13 +176,6 @@ namespace SwrJit return VGATHERPS(vSrc, pBase, vIndices, vMask, C(scale)); } - Value *Builder::GATHERPS_16(Value *vSrc, Value *pBase, Value *vIndices, Value *vMask, uint8_t scale, JIT_MEM_CLIENT usage) - { - AssertMemoryUsageParams(pBase, usage); - - return VGATHERPS_16(vSrc, pBase, vIndices, vMask, C(scale)); - } - ////////////////////////////////////////////////////////////////////////// /// @brief Generate a masked gather operation in LLVM IR. If not /// supported on the underlying platform, emulate it with loads @@ -198,13 +191,6 @@ namespace SwrJit return VGATHERDD(vSrc, pBase, vIndices, vMask, C(scale)); } - Value *Builder::GATHERDD_16(Value *vSrc, Value *pBase, Value *vIndices, Value *vMask, uint8_t scale, JIT_MEM_CLIENT usage) - { - AssertMemoryUsageParams(pBase, usage); - - return VGATHERDD_16(vSrc, pBase, vIndices, vMask, C(scale)); - } - ////////////////////////////////////////////////////////////////////////// /// @brief Generate a masked gather operation in LLVM IR. 
If not /// supported on the underlying platform, emulate it with loads diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.h b/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.h index f229da3..9ccac4f 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.h +++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.h @@ -78,13 +78,10 @@ void Gather4(const SWR_FORMAT format, Value* pSrcBase, Value* byteOffsets, virtual Value *GATHERPS(Value *src, Value *pBase, Value *indices, Value *mask, uint8_t scale = 1, JIT_MEM_CLIENT usage = MEM_CLIENT_INTERNAL); -Value *GATHERPS_16(Value *src, Value *pBase, Value *indices, Value *mask, uint8_t scale = 1, JIT_MEM_CLIENT usage = MEM_CLIENT_INTERNAL); - void GATHER4PS(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets, Value* mask, Value* vGatherComponents[], bool bPackedOutput, JIT_MEM_CLIENT usage = MEM_CLIENT_INTERNAL); virtual Value *GATHERDD(Value* src, Value* pBase, Value* indices, Value* mask, uint8_t scale = 1, JIT_MEM_CLIENT usage = MEM_CLIENT_INTERNAL); -Value *GATHERDD_16(Value *src, Value *pBase, Value *indices, Value *mask, uint8_t scale = 1, JIT_MEM_CLIENT usage = MEM_CLIENT_INTERNAL); void GATHER4DD(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets, Value* mask, Value* vGatherComponents[], bool bPackedOutput, JIT_MEM_CLIENT usage = MEM_CLIENT_INTERNAL); diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp index 0fbfd21..8d659d0 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp @@ -1563,10 +1563,8 @@ void FetchJit::Shuffle16bpcGather16(Shuffle16bpcArgs &args) if (bFP) { // extract 128 bit lanes to sign extend each component - /// @todo Force 8-wide cvt until we support generic cvt in x86 lowering pass - Function* pCvtPh2Ps = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_vcvtph2ps_256); 
- Value *temp_lo = CALL(pCvtPh2Ps, BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v8x16Ty)); - Value *temp_hi = CALL(pCvtPh2Ps, BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v8x16Ty)); + Value *temp_lo = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v8x16Ty)); + Value *temp_hi = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v8x16Ty)); vVertexElements[currentVertexElement] = JOIN_16(temp_lo, temp_hi); } diff --git a/src/gallium/drivers/swr/rasterizer/jitter/functionpasses/lower_x86.cpp b/src/gallium/drivers/swr/rasterizer/jitter/functionpasses/lower_x86.cpp index 11a2397..b27335f 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/functionpasses/lower_x86.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/functionpasses/lower_x86.cpp @@ -76,8 +76,6 @@ namespace SwrJit {"meta.intrinsic.VROUND", Intrinsic::x86_avx_round_ps_256}, {"meta.intrinsic.BEXTR_32", Intrinsic::x86_bmi_bextr_32}, {"meta.intrinsic.VPSHUFB", Intrinsic::x86_avx2_pshuf_b}, - {"meta.intrinsic.VCVTPD2PS", Intrinsic::x86_avx_cvt_pd2_ps_256}, - {"meta.intrinsic.VCVTPH2PS", Intrinsic::x86_vcvtph2ps_256}, {"meta.intrinsic.VCVTPS2PH", Intrinsic::x86_vcvtps2ph_256}, {"meta.intrinsic.VHSUBPS", Intrinsic::x86_avx_hsub_ps_256}, {"meta.intrinsic.VPTESTC", Intrinsic::x86_avx_ptestc_256}, @@ -101,27 +99,27 @@ namespace SwrJit {"meta.intrinsic.VPERMPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VPERM_EMU}}, {"meta.intrinsic.VPERMD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VPERM_EMU}}, {"meta.intrinsic.VGATHERPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}}, - {"meta.intrinsic.VGATHERPS_16",{{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}}, {"meta.intrinsic.VGATHERDD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}}, - {"meta.intrinsic.VGATHERDD_16",{{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}}, + {"meta.intrinsic.VCVTPD2PS", {{Intrinsic::x86_avx_cvt_pd2_ps_256, 
Intrinsic::not_intrinsic}, NO_EMU}}, + {"meta.intrinsic.VCVTPH2PS", {{Intrinsic::x86_vcvtph2ps_256, Intrinsic::not_intrinsic}, NO_EMU}}, }, { // AVX2 {"meta.intrinsic.VRCPPS", {{Intrinsic::x86_avx_rcp_ps_256, Intrinsic::not_intrinsic}, NO_EMU}}, {"meta.intrinsic.VPERMPS", {{Intrinsic::x86_avx2_permps, Intrinsic::not_intrinsic}, VPERM_EMU}}, {"meta.intrinsic.VPERMD", {{Intrinsic::x86_avx2_permd, Intrinsic::not_intrinsic}, VPERM_EMU}}, {"meta.intrinsic.VGATHERPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}}, - {"meta.intrinsic.VGATHERPS_16",{{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}}, {"meta.intrinsic.VGATHERDD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}}, - {"meta.intrinsic.VGATHERDD_16",{{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}}, + {"meta.intrinsic.VCVTPD2PS", {{Intrinsic::x86_avx_cvt_pd2_ps_256, Intrinsic::not_intrinsic}, NO_EMU}}, + {"meta.intrinsic.VCVTPH2PS", {{Intrinsic::x86_vcvtph2ps_256, Intrinsic::not_intrinsic}, NO_EMU}}, }, { // AVX512 {"meta.intrinsic.VRCPPS", {{Intrinsic::x86_avx512_rcp14_ps_256, Intrinsic::x86_avx512_rcp14_ps_512}, NO_EMU}}, {"meta.intrinsic.VPERMPS", {{Intrinsic::x86_avx512_mask_permvar_sf_256, Intrinsic::x86_avx512_mask_permvar_sf_512}, NO_EMU}}, {"meta.intrinsic.VPERMD", {{Intrinsic::x86_avx512_mask_permvar_si_256, Intrinsic::x86_avx512_mask_permvar_si_512}, NO_EMU}}, {"meta.intrinsic.VGATHERPS", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}}, - {"meta.intrinsic.VGATHERPS_16",{{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}}, {"meta.intrinsic.VGATHERDD", {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}}, - {"meta.intrinsic.VGATHERDD_16",{{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}}, + {"meta.intrinsic.VCVTPD2PS", {{Intrinsic::x86_avx512_mask_cvtpd2ps_256, Intrinsic::x86_avx512_mask_cvtpd2ps_512 }, NO_EMU}}, + {"meta.intrinsic.VCVTPH2PS", 
{{Intrinsic::x86_avx512_mask_vcvtph2ps_256, Intrinsic::x86_avx512_mask_vcvtph2ps_512 }, NO_EMU}}, } }; -- 2.7.4