From 93f38e1f1ae23623a03d456b5b46f19f5c500036 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@intel.com>
Date: Wed, 22 May 2019 21:00:18 +0000
Subject: [PATCH] [X86] Explicitly disable VEXTRACT instruction matching for
 an immediate of 0. Remove a bunch of isel patterns that become unnecessary.

We effectively had a second set of isel patterns that tried to use a
regular store instruction and an extract_subreg instruction, or a masked
move and an extract_subreg. These patterns were intended to override the
matching of VEXTRACT instructions by taking advantage of the priority of
the explicit immediate 0 for the index.

This patch instead just disables matching of the immediate 0 in the
VEXTRACT patterns. This way each of the component pieces of the larger
patterns will match by themselves.

This found a bug of sorts where we didn't use a 128-bit store for a
512->128 extract on KNL. It's unclear what the right thing to do here
is: using the vextract avoids constraining the register allocator to
xmm0-15, but it always results in a longer encoding if the register
allocator ends up choosing xmm0-15 anyway.

llvm-svn: 361431
---
 llvm/lib/Target/X86/X86InstrAVX512.td          | 69 ------------------------
 llvm/lib/Target/X86/X86InstrFragmentsSIMD.td   | 12 +++--
 llvm/lib/Target/X86/X86InstrVecCompiler.td     | 70 --------------------------
 llvm/test/CodeGen/X86/avx512-insert-extract.ll |  4 +-
 4 files changed, 10 insertions(+), 145 deletions(-)

diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
index 7c7c273..97e6969 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -3739,75 +3739,6 @@ let Predicates = [HasVLX] in {
                     (VMOVDQU64Z256mr addr:$dst, VR256X:$src)>;
 }
 
-multiclass masked_move_for_extract<string InstrStr, X86VectorVTInfo From,
-                                   X86VectorVTInfo To, X86VectorVTInfo Cast> {
-  def : Pat<(Cast.VT (vselect Cast.KRCWM:$mask,
-                              (bitconvert
-                               (To.VT (extract_subvector
-                                       (From.VT From.RC:$src), (iPTR 0)))),
-                              To.RC:$src0)),
-            (Cast.VT (!cast<Instruction>(InstrStr#"rrk")
-                      Cast.RC:$src0, Cast.KRCWM:$mask,
-                      (To.VT (EXTRACT_SUBREG From.RC:$src, To.SubRegIdx))))>;
-
-  def : Pat<(Cast.VT (vselect Cast.KRCWM:$mask,
-                              (bitconvert
-                               (To.VT (extract_subvector
-                                       (From.VT From.RC:$src), (iPTR 0)))),
-                              Cast.ImmAllZerosV)),
-            (Cast.VT (!cast<Instruction>(InstrStr#"rrkz")
-                      Cast.KRCWM:$mask,
-                      (To.VT (EXTRACT_SUBREG From.RC:$src, To.SubRegIdx))))>;
-}
-
-
-let Predicates = [HasVLX] in {
-// A masked extract from the first 128-bits of a 256-bit vector can be
-// implemented with masked move.
-defm : masked_move_for_extract<"VMOVDQA64Z128", v4i64x_info, v2i64x_info, v2i64x_info>;
-defm : masked_move_for_extract<"VMOVDQA64Z128", v8i32x_info, v4i32x_info, v2i64x_info>;
-defm : masked_move_for_extract<"VMOVDQA64Z128", v16i16x_info, v8i16x_info, v2i64x_info>;
-defm : masked_move_for_extract<"VMOVDQA64Z128", v32i8x_info, v16i8x_info, v2i64x_info>;
-defm : masked_move_for_extract<"VMOVDQA32Z128", v4i64x_info, v2i64x_info, v4i32x_info>;
-defm : masked_move_for_extract<"VMOVDQA32Z128", v8i32x_info, v4i32x_info, v4i32x_info>;
-defm : masked_move_for_extract<"VMOVDQA32Z128", v16i16x_info, v8i16x_info, v4i32x_info>;
-defm : masked_move_for_extract<"VMOVDQA32Z128", v32i8x_info, v16i8x_info, v4i32x_info>;
-defm : masked_move_for_extract<"VMOVAPDZ128", v4f64x_info, v2f64x_info, v2f64x_info>;
-defm : masked_move_for_extract<"VMOVAPDZ128", v8f32x_info, v4f32x_info, v2f64x_info>;
-defm : masked_move_for_extract<"VMOVAPSZ128", v4f64x_info, v2f64x_info, v4f32x_info>;
-defm : masked_move_for_extract<"VMOVAPSZ128", v8f32x_info, v4f32x_info, v4f32x_info>;
-
-// A masked extract from the first 128-bits of a 512-bit vector can be
-// implemented with masked move.
-defm : masked_move_for_extract<"VMOVDQA64Z128", v8i64_info, v2i64x_info, v2i64x_info>;
-defm : masked_move_for_extract<"VMOVDQA64Z128", v16i32_info, v4i32x_info, v2i64x_info>;
-defm : masked_move_for_extract<"VMOVDQA64Z128", v32i16_info, v8i16x_info, v2i64x_info>;
-defm : masked_move_for_extract<"VMOVDQA64Z128", v64i8_info, v16i8x_info, v2i64x_info>;
-defm : masked_move_for_extract<"VMOVDQA32Z128", v8i64_info, v2i64x_info, v4i32x_info>;
-defm : masked_move_for_extract<"VMOVDQA32Z128", v16i32_info, v4i32x_info, v4i32x_info>;
-defm : masked_move_for_extract<"VMOVDQA32Z128", v32i16_info, v8i16x_info, v4i32x_info>;
-defm : masked_move_for_extract<"VMOVDQA32Z128", v64i8_info, v16i8x_info, v4i32x_info>;
-defm : masked_move_for_extract<"VMOVAPDZ128", v8f64_info, v2f64x_info, v2f64x_info>;
-defm : masked_move_for_extract<"VMOVAPDZ128", v16f32_info, v4f32x_info, v2f64x_info>;
-defm : masked_move_for_extract<"VMOVAPSZ128", v8f64_info, v2f64x_info, v4f32x_info>;
-defm : masked_move_for_extract<"VMOVAPSZ128", v16f32_info, v4f32x_info, v4f32x_info>;
-
-// A masked extract from the first 256-bits of a 512-bit vector can be
-// implemented with masked move.
-defm : masked_move_for_extract<"VMOVDQA64Z256", v8i64_info, v4i64x_info, v4i64x_info>;
-defm : masked_move_for_extract<"VMOVDQA64Z256", v16i32_info, v8i32x_info, v4i64x_info>;
-defm : masked_move_for_extract<"VMOVDQA64Z256", v32i16_info, v16i16x_info, v4i64x_info>;
-defm : masked_move_for_extract<"VMOVDQA64Z256", v64i8_info, v32i8x_info, v4i64x_info>;
-defm : masked_move_for_extract<"VMOVDQA32Z256", v8i64_info, v4i64x_info, v8i32x_info>;
-defm : masked_move_for_extract<"VMOVDQA32Z256", v16i32_info, v8i32x_info, v8i32x_info>;
-defm : masked_move_for_extract<"VMOVDQA32Z256", v32i16_info, v16i16x_info, v8i32x_info>;
-defm : masked_move_for_extract<"VMOVDQA32Z256", v64i8_info, v32i8x_info, v8i32x_info>;
-defm : masked_move_for_extract<"VMOVAPDZ256", v8f64_info, v4f64x_info, v4f64x_info>;
-defm : masked_move_for_extract<"VMOVAPDZ256", v16f32_info, v8f32x_info, v4f64x_info>;
-defm : masked_move_for_extract<"VMOVAPSZ256", v8f64_info, v4f64x_info, v8f32x_info>;
-defm : masked_move_for_extract<"VMOVAPSZ256", v16f32_info, v8f32x_info, v8f32x_info>;
-}
-
 // Move Int Doubleword to Packed Double Int
 //
 let ExeDomain = SSEPackedInt in {
diff --git a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
index e3c7548..c7f1021 100644
--- a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -980,8 +980,10 @@ def INSERT_get_vinsert256_imm : SDNodeXForm<insert_subvector, [{
 
 def vextract128_extract : PatFrag<(ops node:$bigvec, node:$index),
                                   (extract_subvector node:$bigvec,
-                                                     node:$index), [{}],
-                                  EXTRACT_get_vextract128_imm>;
+                                                     node:$index), [{
+  // Index 0 can be handled via extract_subreg.
+  return !isNullConstant(N->getOperand(1));
+}], EXTRACT_get_vextract128_imm>;
 
 def vinsert128_insert : PatFrag<(ops node:$bigvec, node:$smallvec,
                                      node:$index),
@@ -991,8 +993,10 @@ def vinsert128_insert : PatFrag<(ops node:$bigvec, node:$smallvec,
 
 def vextract256_extract : PatFrag<(ops node:$bigvec, node:$index),
                                   (extract_subvector node:$bigvec,
-                                                     node:$index), [{}],
-                                  EXTRACT_get_vextract256_imm>;
+                                                     node:$index), [{
+  // Index 0 can be handled via extract_subreg.
+  return !isNullConstant(N->getOperand(1));
+}], EXTRACT_get_vextract256_imm>;
 
 def vinsert256_insert : PatFrag<(ops node:$bigvec, node:$smallvec,
                                      node:$index),
diff --git a/llvm/lib/Target/X86/X86InstrVecCompiler.td b/llvm/lib/Target/X86/X86InstrVecCompiler.td
index 7cb0ec0..e6e9a92 100644
--- a/llvm/lib/Target/X86/X86InstrVecCompiler.td
+++ b/llvm/lib/Target/X86/X86InstrVecCompiler.td
@@ -98,76 +98,6 @@ defm : subvector_subreg_lowering<VR256, v16i16, VR512, v32i16, sub_ymm>;
 defm : subvector_subreg_lowering<VR256, v32i8,  VR512, v64i8,  sub_ymm>;
 
-multiclass subvector_store_lowering<string AlignedStr, string UnalignedStr,
-                                    RegisterClass RC, ValueType DstTy,
-                                    ValueType SrcTy, SubRegIndex SubIdx> {
-  def : Pat<(alignedstore (DstTy (extract_subvector
-                                  (SrcTy RC:$src), (iPTR 0))), addr:$dst),
-            (!cast<Instruction>("VMOV"#AlignedStr#"mr") addr:$dst,
-             (DstTy (EXTRACT_SUBREG RC:$src, SubIdx)))>;
-
-  def : Pat<(store (DstTy (extract_subvector
-                           (SrcTy RC:$src), (iPTR 0))), addr:$dst),
-            (!cast<Instruction>("VMOV"#UnalignedStr#"mr") addr:$dst,
-             (DstTy (EXTRACT_SUBREG RC:$src, SubIdx)))>;
-}
-
-let Predicates = [HasAVX, NoVLX] in {
-  defm : subvector_store_lowering<"APD", "UPD", VR256X, v2f64, v4f64, sub_xmm>;
-  defm : subvector_store_lowering<"APS", "UPS", VR256X, v4f32, v8f32, sub_xmm>;
-  defm : subvector_store_lowering<"DQA", "DQU", VR256X, v2i64, v4i64, sub_xmm>;
-  defm : subvector_store_lowering<"DQA", "DQU", VR256X, v4i32, v8i32, sub_xmm>;
-  defm : subvector_store_lowering<"DQA", "DQU", VR256X, v8i16, v16i16, sub_xmm>;
-  defm : subvector_store_lowering<"DQA", "DQU", VR256X, v16i8, v32i8, sub_xmm>;
-}
-
-let Predicates = [HasVLX] in {
-  // Special patterns for storing subvector extracts of lower 128-bits
-  // Its cheaper to just use VMOVAPS/VMOVUPS instead of VEXTRACTF128mr
-  defm : subvector_store_lowering<"APDZ128", "UPDZ128", VR256X, v2f64, v4f64,
-                                  sub_xmm>;
-  defm : subvector_store_lowering<"APSZ128", "UPSZ128", VR256X, v4f32, v8f32,
-                                  sub_xmm>;
-  defm : subvector_store_lowering<"DQA64Z128", "DQU64Z128", VR256X, v2i64,
-                                  v4i64, sub_xmm>;
-  defm : subvector_store_lowering<"DQA64Z128", "DQU64Z128", VR256X, v4i32,
-                                  v8i32, sub_xmm>;
-  defm : subvector_store_lowering<"DQA64Z128", "DQU64Z128", VR256X, v8i16,
-                                  v16i16, sub_xmm>;
-  defm : subvector_store_lowering<"DQA64Z128", "DQU64Z128", VR256X, v16i8,
-                                  v32i8, sub_xmm>;
-
-  // Special patterns for storing subvector extracts of lower 128-bits of 512.
-  // Its cheaper to just use VMOVAPS/VMOVUPS instead of VEXTRACTF128mr
-  defm : subvector_store_lowering<"APDZ128", "UPDZ128", VR512, v2f64, v8f64,
-                                  sub_xmm>;
-  defm : subvector_store_lowering<"APSZ128", "UPSZ128", VR512, v4f32, v16f32,
-                                  sub_xmm>;
-  defm : subvector_store_lowering<"DQA64Z128", "DQU64Z128", VR512, v2i64,
-                                  v8i64, sub_xmm>;
-  defm : subvector_store_lowering<"DQA64Z128", "DQU64Z128", VR512, v4i32,
-                                  v16i32, sub_xmm>;
-  defm : subvector_store_lowering<"DQA64Z128", "DQU64Z128", VR512, v8i16,
-                                  v32i16, sub_xmm>;
-  defm : subvector_store_lowering<"DQA64Z128", "DQU64Z128", VR512, v16i8,
-                                  v64i8, sub_xmm>;
-
-  // Special patterns for storing subvector extracts of lower 256-bits of 512.
-  // Its cheaper to just use VMOVAPS/VMOVUPS instead of VEXTRACTF128mr
-  defm : subvector_store_lowering<"APDZ256", "UPDZ256", VR512, v4f64, v8f64,
-                                  sub_ymm>;
-  defm : subvector_store_lowering<"APSZ256", "UPSZ256", VR512, v8f32, v16f32,
-                                  sub_ymm>;
-  defm : subvector_store_lowering<"DQA64Z256", "DQU64Z256", VR512, v4i64,
-                                  v8i64, sub_ymm>;
-  defm : subvector_store_lowering<"DQA64Z256", "DQU64Z256", VR512, v8i32,
-                                  v16i32, sub_ymm>;
-  defm : subvector_store_lowering<"DQA64Z256", "DQU64Z256", VR512, v16i16,
-                                  v32i16, sub_ymm>;
-  defm : subvector_store_lowering<"DQA64Z256", "DQU64Z256", VR512, v32i8,
-                                  v64i8, sub_ymm>;
-}
-
 
 // If we're inserting into an all zeros vector, just use a plain move which
 // will zero the upper bits. A post-isel hook will take care of removing
 // any moves that we can prove are unnecessary.
diff --git a/llvm/test/CodeGen/X86/avx512-insert-extract.ll b/llvm/test/CodeGen/X86/avx512-insert-extract.ll
index 3acba30..b81c829 100644
--- a/llvm/test/CodeGen/X86/avx512-insert-extract.ll
+++ b/llvm/test/CodeGen/X86/avx512-insert-extract.ll
@@ -1514,7 +1514,7 @@ define zeroext i8 @test_extractelement_varible_v2i1(<2 x i64> %a, <2 x i64> %b,
 ; KNL-NEXT:    ## kill: def $xmm0 killed $xmm0 def $zmm0
 ; KNL-NEXT:    vpcmpnleuq %zmm1, %zmm0, %k1
 ; KNL-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; KNL-NEXT:    vextracti32x4 $0, %zmm0, -{{[0-9]+}}(%rsp)
+; KNL-NEXT:    vmovdqa %xmm0, -{{[0-9]+}}(%rsp)
 ; KNL-NEXT:    andl $1, %edi
 ; KNL-NEXT:    movzbl -24(%rsp,%rdi,8), %eax
 ; KNL-NEXT:    andl $1, %eax
@@ -1545,7 +1545,7 @@ define zeroext i8 @test_extractelement_varible_v4i1(<4 x i32> %a, <4 x i32> %b,
 ; KNL-NEXT:    ## kill: def $xmm0 killed $xmm0 def $zmm0
 ; KNL-NEXT:    vpcmpnleud %zmm1, %zmm0, %k1
 ; KNL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; KNL-NEXT:    vextracti32x4 $0, %zmm0, -{{[0-9]+}}(%rsp)
+; KNL-NEXT:    vmovdqa %xmm0, -{{[0-9]+}}(%rsp)
 ; KNL-NEXT:    andl $3, %edi
 ; KNL-NEXT:    movzbl -24(%rsp,%rdi,4), %eax
 ; KNL-NEXT:    andl $1, %eax
-- 
2.7.4
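
Illustrative addendum (not part of the original commit): the LLVM IR below is a
hypothetical reduction of the 512->128 extract-plus-store case discussed in the
commit message; the function and value names are invented for illustration.
Because vextract128_extract now rejects an index of 0, the index-0
extract_subvector is selected as a subregister copy and the store is selected
as an ordinary 128-bit move, so a KNL-like target can emit a vmovdqa store of
an xmm register instead of vextracti32x4 $0, provided the register allocator
picks one of xmm0-15.

; Hypothetical reduction, assuming something like
; llc -mtriple=x86_64-- -mattr=+avx512f (AVX512F without VLX, as on KNL).
define void @store_low_128(<8 x i64> %v, <2 x i64>* %p) {
  ; Take elements 0 and 1: an extract_subvector at index 0 after legalization.
  %lo = shufflevector <8 x i64> %v, <8 x i64> undef, <2 x i32> <i32 0, i32 1>
  ; With immediate-0 VEXTRACT matching disabled, this store matches the plain
  ; 128-bit move store pattern via EXTRACT_SUBREG.
  store <2 x i64> %lo, <2 x i64>* %p
  ret void
}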