From b4ba3cbda01e710e64948f43cbf9bfdec5ec5855 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Sun, 6 Oct 2019 21:11:45 +0000 Subject: [PATCH] [X86][AVX] Access a scalar float/double as a free extract from a broadcast load (PR43217) If a fp scalar is loaded and then used as both a scalar and a vector broadcast, perform the load as a broadcast and then extract the scalar for 'free' from the 0th element. This involved switching the order of the X86ISD::BROADCAST combines so we only convert to X86ISD::BROADCAST_LOAD once all other canonicalizations have been attempted. Adds a DAGCombinerInfo::recursivelyDeleteUnusedNodes wrapper. Fixes PR43217 Differential Revision: https://reviews.llvm.org/D68544 llvm-svn: 373871 --- llvm/include/llvm/CodeGen/TargetLowering.h | 2 ++ llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 5 +++ llvm/lib/Target/X86/X86ISelLowering.cpp | 35 +++++++++++++------- llvm/test/CodeGen/X86/avx-vbroadcast.ll | 47 +++++++++------------------ 4 files changed, 47 insertions(+), 42 deletions(-) diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index 999c0ea..a5dfb8b 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -3263,6 +3263,8 @@ public: SDValue CombineTo(SDNode *N, SDValue Res, bool AddTo = true); SDValue CombineTo(SDNode *N, SDValue Res0, SDValue Res1, bool AddTo = true); + bool recursivelyDeleteUnusedNodes(SDNode *N); + void CommitTargetLoweringOpt(const TargetLoweringOpt &TLO); }; diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 38fd974..7ea9084 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -761,6 +761,11 @@ CombineTo(SDNode *N, SDValue Res0, SDValue Res1, bool AddTo) { return ((DAGCombiner*)DC)->CombineTo(N, Res0, Res1, AddTo); } +bool TargetLowering::DAGCombinerInfo:: +recursivelyDeleteUnusedNodes(SDNode *N) 
{ + return ((DAGCombiner*)DC)->recursivelyDeleteUnusedNodes(N); +} + void TargetLowering::DAGCombinerInfo:: CommitTargetLoweringOpt(const TargetLowering::TargetLoweringOpt &TLO) { return ((DAGCombiner*)DC)->CommitTargetLoweringOpt(TLO); diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 784bf6d..9150460 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -33429,8 +33429,19 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG, if (Src.getOpcode() == ISD::SCALAR_TO_VECTOR) return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0)); + // Share broadcast with the longest vector and extract low subvector (free). + for (SDNode *User : Src->uses()) + if (User != N.getNode() && + (User->getOpcode() == X86ISD::VBROADCAST || + User->getOpcode() == X86ISD::VBROADCAST_LOAD) && + User->getValueSizeInBits(0) > VT.getSizeInBits()) { + return extractSubVector(SDValue(User, 0), 0, DAG, DL, + VT.getSizeInBits()); + } + // vbroadcast(scalarload X) -> vbroadcast_load X - if (!SrcVT.isVector() && Src.hasOneUse() && + // For float loads, extract other uses of the scalar from the broadcast. + if (!SrcVT.isVector() && (Src.hasOneUse() || VT.isFloatingPoint()) && ISD::isNormalLoad(Src.getNode())) { LoadSDNode *LN = cast<LoadSDNode>(Src); SDVTList Tys = DAG.getVTList(VT, MVT::Other); @@ -33438,17 +33449,19 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG, SDValue BcastLd = DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, LN->getMemoryVT(), LN->getMemOperand()); - DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1)); - return BcastLd; - } - - // Share broadcast with the longest vector and extract low subvector (free). 
- for (SDNode *User : Src->uses()) - if (User != N.getNode() && User->getOpcode() == X86ISD::VBROADCAST && - User->getValueSizeInBits(0) > VT.getSizeInBits()) { - return extractSubVector(SDValue(User, 0), 0, DAG, DL, - VT.getSizeInBits()); + // If the load value is used only by N, replace it via CombineTo N. + bool NoReplaceExtract = Src.hasOneUse(); + DCI.CombineTo(N.getNode(), BcastLd); + if (NoReplaceExtract) { + DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1)); + DCI.recursivelyDeleteUnusedNodes(LN); + } else { + SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SrcVT, BcastLd, + DAG.getIntPtrConstant(0, DL)); + DCI.CombineTo(LN, Scl, BcastLd.getValue(1)); } + return N; // Return N so it doesn't get rechecked! + } return SDValue(); } diff --git a/llvm/test/CodeGen/X86/avx-vbroadcast.ll b/llvm/test/CodeGen/X86/avx-vbroadcast.ll index d3a261e..609c02e 100644 --- a/llvm/test/CodeGen/X86/avx-vbroadcast.ll +++ b/llvm/test/CodeGen/X86/avx-vbroadcast.ll @@ -159,18 +159,14 @@ define <4 x double> @C2(double* %ptr, double* %ptr2) nounwind uwtable readnone s ; X32: ## %bb.0: ## %entry ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X32-NEXT: vmovsd %xmm0, (%eax) -; X32-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] -; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X32-NEXT: vbroadcastsd (%ecx), %ymm0 +; X32-NEXT: vmovlps %xmm0, (%eax) ; X32-NEXT: retl ; ; X64-LABEL: C2: ; X64: ## %bb.0: ## %entry -; X64-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X64-NEXT: vmovsd %xmm0, (%rsi) -; X64-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] -; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X64-NEXT: vbroadcastsd (%rdi), %ymm0 +; X64-NEXT: vmovlps %xmm0, (%rsi) ; X64-NEXT: retq entry: %q = load double, double* %ptr, align 8 @@ -231,18 +227,14 @@ define <8 x float> @D3(float* %ptr, float* %ptr2) nounwind uwtable readnone ssp ; X32: ## %bb.0: ## %entry ; X32-NEXT: movl {{[0-9]+}}(%esp), 
%eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X32-NEXT: vbroadcastss (%ecx), %ymm0 ; X32-NEXT: vmovss %xmm0, (%eax) -; X32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] -; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; X32-NEXT: retl ; ; X64-LABEL: D3: ; X64: ## %bb.0: ## %entry -; X64-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-NEXT: vbroadcastss (%rdi), %ymm0 ; X64-NEXT: vmovss %xmm0, (%rsi) -; X64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] -; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; X64-NEXT: retq entry: %q = load float, float* %ptr, align 4 @@ -285,16 +277,14 @@ define <4 x float> @e2(float* %ptr, float* %ptr2) nounwind uwtable readnone ssp ; X32: ## %bb.0: ## %entry ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X32-NEXT: vbroadcastss (%ecx), %xmm0 ; X32-NEXT: vmovss %xmm0, (%eax) -; X32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] ; X32-NEXT: retl ; ; X64-LABEL: e2: ; X64: ## %bb.0: ## %entry -; X64-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-NEXT: vbroadcastss (%rdi), %xmm0 ; X64-NEXT: vmovss %xmm0, (%rsi) -; X64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] ; X64-NEXT: retq entry: %q = load float, float* %ptr, align 4 @@ -669,16 +659,14 @@ define <2 x double> @I2(double* %ptr, double* %ptr2) nounwind uwtable readnone s ; X32: ## %bb.0: ## %entry ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X32-NEXT: vmovsd %xmm0, (%eax) -; X32-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] +; X32-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; X32-NEXT: vmovlps %xmm0, (%eax) ; X32-NEXT: retl ; ; X64-LABEL: I2: ; X64: ## %bb.0: ## %entry -; X64-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X64-NEXT: vmovsd %xmm0, (%rsi) -; X64-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] +; X64-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] 
+; X64-NEXT: vmovlps %xmm0, (%rsi) ; X64-NEXT: retq entry: %q = load double, double* %ptr, align 4 @@ -884,7 +872,6 @@ define void @broadcast_v16i32(i32* %a, <16 x i32>* %b) { ; ; Broadcast scale factor for xyz vector - slp will have vectorized xy. -; FIXME: Load as a broadcast and then use the scalar 0'th element. ; define double @broadcast_scale_xyz(double* nocapture readonly, double* nocapture readonly) nounwind { ; X32-LABEL: broadcast_scale_xyz: @@ -892,9 +879,8 @@ define double @broadcast_scale_xyz(double* nocapture readonly, double* nocapture ; X32-NEXT: subl $12, %esp ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X32-NEXT: vmovddup {{.*#+}} xmm1 = xmm0[0,0] -; X32-NEXT: vmulpd (%eax), %xmm1, %xmm1 +; X32-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; X32-NEXT: vmulpd (%eax), %xmm0, %xmm1 ; X32-NEXT: vmulsd 16(%eax), %xmm0, %xmm0 ; X32-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] ; X32-NEXT: vaddsd %xmm2, %xmm1, %xmm1 @@ -906,9 +892,8 @@ define double @broadcast_scale_xyz(double* nocapture readonly, double* nocapture ; ; X64-LABEL: broadcast_scale_xyz: ; X64: ## %bb.0: -; X64-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X64-NEXT: vmovddup {{.*#+}} xmm1 = xmm0[0,0] -; X64-NEXT: vmulpd (%rsi), %xmm1, %xmm1 +; X64-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; X64-NEXT: vmulpd (%rsi), %xmm0, %xmm1 ; X64-NEXT: vmulsd 16(%rsi), %xmm0, %xmm0 ; X64-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] ; X64-NEXT: vaddsd %xmm2, %xmm1, %xmm1 -- 2.7.4