From 92ea7a7b4877f931770a2bf2c997db0d4bf8bf86 Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Wed, 18 Jul 2018 07:31:32 +0000
Subject: [PATCH] [X86] Enable commuting of VUNPCKHPD to VMOVHLPS to enable
 load folding by using VMOVLPS with a modified address.

This required an annoying amount of tablegen multiclass changes to make
only VUNPCKHPDZ128rr commutable.

llvm-svn: 337357
---
 llvm/lib/Target/X86/X86InstrAVX512.td          | 31 +++++++++++++++++---
 llvm/lib/Target/X86/X86InstrInfo.cpp           | 20 +++++++++++++----
 llvm/lib/Target/X86/X86InstrSSE.td             |  3 ++-
 llvm/test/CodeGen/X86/fma.ll                   | 24 ++++++++++----------
 llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll |  3 +--
 5 files changed, 51 insertions(+), 30 deletions(-)

diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
index b239f23..2d95925 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -198,7 +198,8 @@ multiclass AVX512_maskable_custom<bits<8> O, Format F,
                                   list<dag> ZeroMaskingPattern,
                                   string MaskingConstraint = "",
                                   bit IsCommutable = 0,
-                                  bit IsKCommutable = 0> {
+                                  bit IsKCommutable = 0,
+                                  bit IsKZCommutable = IsCommutable> {
   let isCommutable = IsCommutable in
     def NAME: AVX512<O, F, Outs, Ins,
                        OpcodeStr#"\t{"#AttSrcAsm#", $dst|"#
@@ ... @@ multiclass AVX512_maskable_custom<bits<8> O, Format F,
   // Zero mask does not add any restrictions to commute operands transformation.
   // So, it is Ok to use IsCommutable instead of IsKCommutable.
-  let isCommutable = IsCommutable in // Prefer over VMOV*rrkz Pat<>
+  let isCommutable = IsKZCommutable in // Prefer over VMOV*rrkz Pat<>
     def NAME#kz: AVX512<O, F, Outs, ZeroMaskingIns,
@@ ... @@ multiclass AVX512_maskable_common<bits<8> O, Format F, X86VectorVTInfo _,
                                   SDNode Select = vselect,
                                   string MaskingConstraint = "",
                                   bit IsCommutable = 0,
-                                  bit IsKCommutable = 0> :
+                                  bit IsKCommutable = 0,
+                                  bit IsKZCommutable = IsCommutable> :
   AVX512_maskable_custom<O, F, Outs, Ins, MaskingIns, ZeroMaskingIns, OpcodeStr,
@@ ... @@ multiclass AVX512_maskable_common<bits<8> O, Format F, X86VectorVTInfo _,
                          [(set _.RC:$dst,
                                (Select _.KRCWM:$mask, RHS, _.ImmAllZerosV))],
                          MaskingConstraint, IsCommutable,
-                         IsKCommutable>;
+                         IsKCommutable, IsKZCommutable>;
 
 // This multiclass generates the unconditional/non-masking, the masking and
 // the zero-masking variant of the vector instruction. In the masking case, the
@@ -276,13 +278,15 @@ multiclass AVX512_maskable<bits<8> O, Format F, X86VectorVTInfo _,
                            string AttSrcAsm, string IntelSrcAsm,
                            dag RHS,
                            bit IsCommutable = 0, bit IsKCommutable = 0,
+                           bit IsKZCommutable = IsCommutable,
                            SDNode Select = vselect> :
   AVX512_maskable_common<O, F, _, Outs, Ins,
                          !con((ins _.KRCWM:$mask), Ins),
                          !con((ins _.KRCWM:$mask), Ins),
                          OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS,
                          (Select _.KRCWM:$mask, RHS, _.RC:$src0),
-                         Select, "$src0 = $dst", IsCommutable, IsKCommutable>;
+                         Select, "$src0 = $dst", IsCommutable, IsKCommutable,
+                         IsKZCommutable>;
 
 // This multiclass generates the unconditional/non-masking, the masking and
 // the zero-masking variant of the scalar instruction.
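The hunks above only thread a new IsKZCommutable bit through the maskable
multiclasses; what makes the whole change sound is a shuffle identity: on
v2f64, unpckhpd and movhlps compute the same pair of high elements with
their sources exchanged. A standalone C++ model of the two shuffles follows
(illustrative helper names, not part of the patch or of LLVM):

#include <array>
#include <cassert>

using V2F64 = std::array<double, 2>;

// unpckhpd a, b -> { a[1], b[1] } (interleave the high elements)
V2F64 unpckhpd(V2F64 a, V2F64 b) { return {a[1], b[1]}; }

// movhlps a, b -> { b[1], a[1] } (high half of b into the low half of a)
V2F64 movhlps(V2F64 a, V2F64 b) { return {b[1], a[1]}; }

int main() {
  V2F64 x{1.0, 2.0}, y{3.0, 4.0};
  // Exchanging the sources turns each shuffle into the other, so either
  // instruction can serve as the commuted form of its partner.
  assert(unpckhpd(x, y) == movhlps(y, x));
  assert(movhlps(x, y) == unpckhpd(y, x));
  return 0;
}

This identity is also why the commute hook later in the patch only has to
swap the opcode, not perform any real operand rotation.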
@@ -292,7 +296,7 @@ multiclass AVX512_maskable_scalar<bits<8> O, Format F, X86VectorVTInfo _,
                                   dag RHS,
                                   bit IsCommutable = 0> :
   AVX512_maskable<O, F, _, Outs, Ins, OpcodeStr, AttSrcAsm, IntelSrcAsm,
-                  RHS, IsCommutable, 0, X86selects>;
+                  RHS, IsCommutable, 0, IsCommutable, X86selects>;
 
 // Similar to AVX512_maskable but in this case one of the source operands
 // ($src1) is already tied to $dst so we just use that for the preserved
@@ -5312,12 +5316,14 @@ defm VMAXCSDZ : avx512_comutable_binop_s<0x5F, "vmaxsd", f64x_info, X86fmaxc,
 
 multiclass avx512_fp_packed<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
                             X86VectorVTInfo _, X86FoldableSchedWrite sched,
-                            bit IsCommutable> {
+                            bit IsCommutable,
+                            bit IsKZCommutable = IsCommutable> {
   let ExeDomain = _.ExeDomain, hasSideEffects = 0 in {
   defm rr: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
                   (ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix,
                   "$src2, $src1", "$src1, $src2",
-                  (_.VT (OpNode _.RC:$src1, _.RC:$src2)), IsCommutable>,
+                  (_.VT (OpNode _.RC:$src1, _.RC:$src2)), IsCommutable, 0,
+                  IsKZCommutable>,
                   EVEX_4V, Sched<[sched]>;
   let mayLoad = 1 in {
   defm rm: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
@@ ... @@
 multiclass avx512_fp_binop_p<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
                              Predicate prd, X86SchedWriteSizes sched,
-                             bit IsCommutable = 0> {
+                             bit IsCommutable = 0,
+                             bit IsPD128Commutable = IsCommutable> {
   let Predicates = [prd] in {
   defm PSZ : avx512_fp_packed<opc, OpcodeStr, OpNode, v16f32_info,
                               sched.PS.ZMM, IsCommutable>, EVEX_V512, PS,
@@ -5380,7 +5387,8 @@ multiclass avx512_fp_binop_p<bits<8> opc, string OpcodeStr, SDPatternOperator Op
                               sched.PS.YMM, IsCommutable>, EVEX_V256, PS,
                               EVEX_CD8<32, CD8VF>;
   defm PDZ128 : avx512_fp_packed<opc, OpcodeStr, OpNode, v2f64x_info,
-                                 sched.PD.XMM, IsCommutable>, EVEX_V128, PD, VEX_W,
+                                 sched.PD.XMM, IsPD128Commutable,
+                                 IsCommutable>, EVEX_V128, PD, VEX_W,
                                  EVEX_CD8<64, CD8VF>;
   defm PDZ256 : avx512_fp_packed<opc, OpcodeStr, OpNode, v4f64x_info,
                                  sched.PD.YMM, IsCommutable>, EVEX_V256, PD, VEX_W,
@@ -6426,6 +6434,7 @@ def VMOVLHPSZrr : AVX512PSI<0x16, MRMSrcReg, (outs VR128X:$dst),
                    "vmovlhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                    [(set VR128X:$dst, (v4f32 (X86Movlhps VR128X:$src1, VR128X:$src2)))]>,
                    Sched<[SchedWriteFShuffle.XMM]>, EVEX_4V;
+let isCommutable = 1 in
 def VMOVHLPSZrr : AVX512PSI<0x12, MRMSrcReg, (outs VR128X:$dst),
                    (ins VR128X:$src1, VR128X:$src2),
                    "vmovhlps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
@@ -10854,7 +10863,7 @@ def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast (loadv2f64 addr:$sr
 //===----------------------------------------------------------------------===//
 
 defm VUNPCKH : avx512_fp_binop_p<0x15, "vunpckh", X86Unpckh, HasAVX512,
-                                 SchedWriteFShuffleSizes>;
+                                 SchedWriteFShuffleSizes, 0, 1>;
 defm VUNPCKL : avx512_fp_binop_p<0x14, "vunpckl", X86Unpckl, HasAVX512,
                                  SchedWriteFShuffleSizes>;
 
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index dd1c658..ee0a6e2 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -1692,14 +1692,22 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
                                                    OpIdx1, OpIdx2);
   }
   case X86::MOVHLPSrr:
-  case X86::UNPCKHPDrr: {
+  case X86::UNPCKHPDrr:
+  case X86::VMOVHLPSrr:
+  case X86::VUNPCKHPDrr:
+  case X86::VMOVHLPSZrr:
+  case X86::VUNPCKHPDZ128rr: {
     assert(Subtarget.hasSSE2() && "Commuting MOVHLP/UNPCKHPD requires SSE2!");
     unsigned Opc = MI.getOpcode();
     switch (Opc) {
-    default: llvm_unreachable("Unreachable!");
-    case X86::MOVHLPSrr: Opc = X86::UNPCKHPDrr; break;
-    case X86::UNPCKHPDrr: Opc = X86::MOVHLPSrr; break;
+    default: llvm_unreachable("Unreachable!");
+    case X86::MOVHLPSrr:       Opc = X86::UNPCKHPDrr;      break;
+    case X86::UNPCKHPDrr:      Opc = X86::MOVHLPSrr;       break;
+    case X86::VMOVHLPSrr:      Opc = X86::VUNPCKHPDrr;     break;
+    case X86::VUNPCKHPDrr:     Opc = X86::VMOVHLPSrr;      break;
+    case X86::VMOVHLPSZrr:     Opc = X86::VUNPCKHPDZ128rr; break;
+    case X86::VUNPCKHPDZ128rr: Opc = X86::VMOVHLPSZrr;     break;
     }
     auto &WorkingMI = cloneIfNew(MI);
     WorkingMI.setDesc(get(Opc));
     return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
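Commuting any instruction in these pairs is therefore "swap the two sources
and rewrite to the partner opcode", which is exactly what the hunk above
does. A minimal sketch of that pattern, using hypothetical types rather
than LLVM's MachineInstr API:

#include <cassert>
#include <utility>

// Hypothetical stand-ins for the real opcodes and machine instruction.
enum Opcode { MOVHLPSrr, UNPCKHPDrr };

struct Instr {
  Opcode Opc;
  int Src1, Src2; // register ids
};

// Commute = swap the sources *and* switch to the partner opcode, because
// MOVHLPS(a, b) == UNPCKHPD(b, a) on v2f64.
Instr commute(Instr MI) {
  MI.Opc = (MI.Opc == MOVHLPSrr) ? UNPCKHPDrr : MOVHLPSrr;
  std::swap(MI.Src1, MI.Src2);
  return MI;
}

int main() {
  Instr I{UNPCKHPDrr, /*Src1=*/0, /*Src2=*/1};
  Instr C = commute(I);
  assert(C.Opc == MOVHLPSrr && C.Src1 == 1 && C.Src2 == 0);
  return 0;
}

One reading of the IsKZCommutable plumbing: vunpckhpd is only commutable
via this opcode swap, and VMOVHLPSZrr (a plain def above, not an
AVX512_maskable instantiation) has no {k}/{z} variants to swap to, so the
zero-masked VUNPCKHPDZ128rrkz must stay non-commutable even though the
unmasked rr form is now marked commutable.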
@@ -1990,6 +1998,10 @@ bool X86InstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1,
     return false;
   case X86::MOVHLPSrr:
   case X86::UNPCKHPDrr:
+  case X86::VMOVHLPSrr:
+  case X86::VUNPCKHPDrr:
+  case X86::VMOVHLPSZrr:
+  case X86::VUNPCKHPDZ128rr:
     if (Subtarget.hasSSE2())
       return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
     return false;
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index 06a8799..e9a9a99 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -812,6 +812,7 @@ let Predicates = [UseAVX] in {
                      [(set VR128:$dst,
                        (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))]>,
                      VEX_4V, Sched<[SchedWriteFShuffle.XMM]>, VEX_WIG;
+  let isCommutable = 1 in
   def VMOVHLPSrr : VPSI<0x12, MRMSrcReg, (outs VR128:$dst),
                      (ins VR128:$src1, VR128:$src2),
                      "movhlps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
@@ -2190,7 +2191,7 @@ defm VUNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, loadv4f32,
                       SchedWriteFShuffle.XMM, SSEPackedSingle>, PS, VEX_4V, VEX_WIG;
 defm VUNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, loadv2f64,
                       VR128, f128mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
-                      SchedWriteFShuffle.XMM, SSEPackedDouble>, PD, VEX_4V, VEX_WIG;
+                      SchedWriteFShuffle.XMM, SSEPackedDouble, 1>, PD, VEX_4V, VEX_WIG;
 defm VUNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, loadv4f32,
                       VR128, f128mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                       SchedWriteFShuffle.XMM, SSEPackedSingle>, PS, VEX_4V, VEX_WIG;
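The load-folding payoff that the fma.ll diffs below check: MOVHLPS has no
memory form, but taking the high 64 bits of a full 16-byte load is the same
as doing a 64-bit low load from the address advanced by 8 bytes, which is
what MOVLPS does; this is the "VMOVLPS with a modified address" from the
subject line. A standalone C++ model of that equivalence (illustrative
names, not LLVM code):

#include <array>
#include <cassert>

using V2F64 = std::array<double, 2>;

// movhlps a, b -> { b[1], a[1] }; the instruction has no memory form.
V2F64 movhlps(V2F64 a, V2F64 b) { return {b[1], a[1]}; }

// movlps a, m64 -> { *m64, a[1] }: replace the low element from memory.
V2F64 movlps(V2F64 a, const double *m) { return {*m, a[1]}; }

int main() {
  double mem[2] = {3.0, 4.0};   // the 16 bytes that used to be vmovaps'd
  V2F64 a{1.0, 2.0};
  V2F64 loaded{mem[0], mem[1]};
  // High half of a full load == 64-bit low load from the address + 8.
  assert(movhlps(a, loaded) == movlps(a, mem + 1));
  return 0;
}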
diff --git a/llvm/test/CodeGen/X86/fma.ll b/llvm/test/CodeGen/X86/fma.ll
index b7a9af7..a8dd97b 100644
--- a/llvm/test/CodeGen/X86/fma.ll
+++ b/llvm/test/CodeGen/X86/fma.ll
@@ -1410,10 +1410,10 @@ define <2 x double> @test_v2f64(<2 x double> %a, <2 x double> %b, <2 x double> %
 ; FMACALL32-NEXT:    ## encoding: [0xc5,0xf9,0x28,0x44,0x24,0x30]
 ; FMACALL32-NEXT:    vmovhpd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf9,0x17,0x44,0x24,0x10]
 ; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x50]
-; FMACALL32-NEXT:    vunpckhpd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf9,0x15,0x44,0x24,0x40]
-; FMACALL32-NEXT:    ## xmm0 = xmm0[1],mem[1]
+; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x40]
+; FMACALL32-NEXT:    vmovlps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload
+; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x12,0x44,0x24,0x58]
+; FMACALL32-NEXT:    ## xmm0 = mem[0,1],xmm0[2,3]
 ; FMACALL32-NEXT:    vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24]
 ; FMACALL32-NEXT:    fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x28]
 ; FMACALL32-NEXT:    calll _fma ## encoding: [0xe8,A,A,A,A]
@@ -1547,10 +1547,10 @@ define <4 x double> @test_v4f64(<4 x double> %a, <4 x double> %b, <4 x double> %
 ; FMACALL32-NEXT:    ## encoding: [0xc5,0xf9,0x28,0x44,0x24,0x70]
 ; FMACALL32-NEXT:    vmovhpd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf9,0x17,0x44,0x24,0x10]
 ; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x60]
-; FMACALL32-NEXT:    vunpckhpd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf9,0x15,0x44,0x24,0x50]
-; FMACALL32-NEXT:    ## xmm0 = xmm0[1],mem[1]
+; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x50]
+; FMACALL32-NEXT:    vmovlps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload
+; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x12,0x44,0x24,0x68]
+; FMACALL32-NEXT:    ## xmm0 = mem[0,1],xmm0[2,3]
 ; FMACALL32-NEXT:    vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24]
 ; FMACALL32-NEXT:    fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x30]
 ; FMACALL32-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
@@ -1809,10 +1809,10 @@ define <8 x double> @test_v8f64(<8 x double> %a, <8 x double> %b, <8 x double> %
 ; FMACALL32-NEXT:    ## encoding: [0xc5,0xf9,0x28,0x84,0x24,0x60,0x01,0x00,0x00]
 ; FMACALL32-NEXT:    vmovhpd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf9,0x17,0x44,0x24,0x10]
 ; FMACALL32-NEXT:    vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0x50,0x01,0x00,0x00]
-; FMACALL32-NEXT:    vunpckhpd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload
-; FMACALL32-NEXT:    ## encoding: [0xc5,0xf9,0x15,0x84,0x24,0x40,0x01,0x00,0x00]
-; FMACALL32-NEXT:    ## xmm0 = xmm0[1],mem[1]
+; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0x40,0x01,0x00,0x00]
+; FMACALL32-NEXT:    vmovlps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload
+; FMACALL32-NEXT:    ## encoding: [0xc5,0xf8,0x12,0x84,0x24,0x58,0x01,0x00,0x00]
+; FMACALL32-NEXT:    ## xmm0 = mem[0,1],xmm0[2,3]
 ; FMACALL32-NEXT:    vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24]
 ; FMACALL32-NEXT:    fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x60]
 ; FMACALL32-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll
index 4012910..4f0db48 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll
@@ -1310,8 +1310,7 @@ define <2 x double> @shuffle_mem_v2f64_31(<2 x double> %a, <2 x double>* %b) {
 ;
 ; AVX-LABEL: shuffle_mem_v2f64_31:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmovaps (%rdi), %xmm1
-; AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
+; AVX-NEXT:    vmovlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
 ; AVX-NEXT:    retq
   %c = load <2 x double>, <2 x double>* %b
   %f = shufflevector <2 x double> %a, <2 x double> %c, <2 x i32> <i32 3, i32 1>
-- 
2.7.4
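End to end, shuffle_mem_v2f64_31 above shows the win: the old
vmovaps + vunpckhpd pair collapses into a single vmovlps from 8(%rdi). A
last standalone model with concrete values (illustrative only, not LLVM
code; the constants are made up):

#include <array>
#include <cassert>

using V2F64 = std::array<double, 2>;

// shufflevector <2 x double> %a, <2 x double> %c, <2 x i32> <i32 3, i32 1>
V2F64 shuffle_3_1(V2F64 a, V2F64 c) { return {c[1], a[1]}; }

int main() {
  double b[2] = {30.0, 40.0};  // the in-memory operand *%b
  V2F64 a{10.0, 20.0};
  V2F64 c{b[0], b[1]};         // %c = load <2 x double>, <2 x double>* %b
  V2F64 folded{b[1], a[1]};    // what "vmovlps 8(%rdi)" produces:
                               // xmm0 = mem[0,1],xmm0[2,3]
  assert(shuffle_3_1(a, c) == folded);
  return 0;
}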