From 1358d866593a6745dc510dcaf7c01c5cba9a7000 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Mon, 1 Feb 2016 08:59:30 +0000 Subject: [PATCH] [X86][SSE] Find source of the inserted element of INSERTPS Minor patch to trace back through target shuffles to the source of the inserted element in a (V)INSERTPS shuffle. Differential Revision: http://reviews.llvm.org/D16652 llvm-svn: 259343 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 33 +++++++++++++++++++++++++++++---- llvm/test/CodeGen/X86/sse41.ll | 19 ++++--------------- 2 files changed, 33 insertions(+), 19 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 687879a..1c87add 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -4317,6 +4317,11 @@ static bool isUndefOrEqual(int Val, int CmpVal) { return (Val < 0 || Val == CmpVal); } +/// Val is either the undef or zero sentinel value. +static bool isUndefOrZero(int Val) { + return (Val == SM_SentinelUndef || Val == SM_SentinelZero); +} + /// Return true if every element in Mask, beginning /// from position Pos and ending in Pos+Size, falls within the specified /// sequential range (Low, Low+Size]. or is undef. @@ -23989,6 +23994,7 @@ static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG, SDValue Op1 = N.getOperand(1); SDValue Op2 = N.getOperand(2); unsigned InsertPSMask = cast(Op2)->getZExtValue(); + unsigned SrcIdx = (InsertPSMask >> 6) & 0x3; unsigned DstIdx = (InsertPSMask >> 4) & 0x3; unsigned ZeroMask = InsertPSMask & 0xF; @@ -24002,19 +24008,38 @@ static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG, return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT), DAG.getConstant(InsertPSMask, DL, MVT::i8)); - SmallVector TargetMask; - if (!setTargetShuffleZeroElements(Op0, TargetMask)) + // Attempt to merge insertps Op1 with an inner target shuffle node. 
+ SmallVector<int, 8> TargetMask1; + if (setTargetShuffleZeroElements(Op1, TargetMask1)) { + int M = TargetMask1[SrcIdx]; + if (isUndefOrZero(M)) { + // Zero/UNDEF insertion - zero out element and remove dependency. + InsertPSMask |= (1u << DstIdx); + return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT), + DAG.getConstant(InsertPSMask, DL, MVT::i8)); + } + // Update insertps mask srcidx and reference the source input directly. + assert(0 <= M && M < 8 && "Shuffle index out of range"); + InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6); + Op1 = Op1.getOperand(M < 4 ? 0 : 1); + return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1, + DAG.getConstant(InsertPSMask, DL, MVT::i8)); + } + + // Attempt to merge insertps Op0 with an inner target shuffle node. + SmallVector<int, 8> TargetMask0; + if (!setTargetShuffleZeroElements(Op0, TargetMask0)) return SDValue(); bool Updated = false; bool UseInput00 = false; bool UseInput01 = false; for (int i = 0; i != 4; ++i) { - int M = TargetMask[i]; + int M = TargetMask0[i]; if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) { // No change if element is already zero or the inserted element. continue; - } else if (M < 0) { + } else if (isUndefOrZero(M)) { // If the target mask is undef/zero then we must zero the element. 
InsertPSMask |= (1u << i); Updated = true; diff --git a/llvm/test/CodeGen/X86/sse41.ll b/llvm/test/CodeGen/X86/sse41.ll index 03538fea..a8e83bf 100644 --- a/llvm/test/CodeGen/X86/sse41.ll +++ b/llvm/test/CodeGen/X86/sse41.ll @@ -846,16 +846,12 @@ define <4 x float> @insertps_from_broadcast_loadf32(<4 x float> %a, float* nocap ; X32: ## BB#0: ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X32-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,0] -; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] ; X32-NEXT: retl ; ; X64-LABEL: insertps_from_broadcast_loadf32: ; X64: ## BB#0: -; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X64-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,0] -; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] ; X64-NEXT: retq %1 = getelementptr inbounds float, float* %fb, i64 %index %2 = load float, float* %1, align 4 @@ -871,16 +867,12 @@ define <4 x float> @insertps_from_broadcast_loadv4f32(<4 x float> %a, <4 x float ; X32-LABEL: insertps_from_broadcast_loadv4f32: ; X32: ## BB#0: ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movups (%eax), %xmm1 -; X32-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,0] -; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] ; X32-NEXT: retl ; ; X64-LABEL: insertps_from_broadcast_loadv4f32: ; X64: ## BB#0: -; X64-NEXT: movups (%rdi), %xmm1 -; X64-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,0] -; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] ; X64-NEXT: retq %1 = load <4 x float>, <4 x float>* %b, align 4 %2 = extractelement <4 x float> %1, i32 0 @@ -892,14 +884,12 @@ define <4 x float> @insertps_from_broadcast_loadv4f32(<4 x float> %a, <4 x float ret <4 x float> %7 } -;; FIXME: We're 
emitting an extraneous pshufd/vbroadcast. define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, float* nocapture readonly %fb, i64 %index) { ; X32-LABEL: insertps_from_broadcast_multiple_use: ; X32: ## BB#0: ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: movss {{.*#+}} xmm4 = mem[0],zero,zero,zero -; X32-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,0,0,0] ; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0] ; X32-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0] ; X32-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[0] @@ -912,7 +902,6 @@ define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x fl ; X64-LABEL: insertps_from_broadcast_multiple_use: ; X64: ## BB#0: ; X64-NEXT: movss {{.*#+}} xmm4 = mem[0],zero,zero,zero -; X64-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,0,0,0] ; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0] ; X64-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0] ; X64-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[0] -- 2.7.4