From c6cbecc2c7ff32117a4ea4d30dfb62e95d45978b Mon Sep 17 00:00:00 2001
From: Bill Schmidt
Date: Wed, 20 Feb 2013 20:41:42 +0000
Subject: [PATCH] Additional fixes for bug 15155.

This handles the cases where the 6-bit splat element is odd, converting
to a three-instruction sequence to add or subtract two splats.  With this
fix, the XFAIL in test/CodeGen/PowerPC/vec_constants.ll is removed.

llvm-svn: 175663
---
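Notes:

The expansion relies on simple identities over the 5-bit signed immediate
accepted by vspltis[bhw], which can materialize only the range [-16,15]
directly.  For a 6-bit splat value v outside that range:

  even v in [-32,-18] or [16,30]:  v == v/2 + v/2
  odd  v in [17,31]:               v == (v - 16) - (-16)
  odd  v in [-31,-17]:             v == (v + 16) + (-16)

In each case both immediates fit in [-16,15], so two vspltis plus one
vector add or subtract suffice.  The standalone C++ sketch below is
illustrative only (it is not part of the patch and uses no LLVM APIs);
it checks the identities over the whole 6-bit range:

  // Verify the splat decomposition identities behind VADD_SPLAT.
  #include <cassert>

  int main() {
    for (int V = -32; V <= 31; ++V) {
      if ((V & 1) == 0) {                    // even, mirrors (Elt & 1) == 0
        int Half = V / 2;                    // same as Elt >> 1 for even Elt
        assert(Half >= -16 && Half <= 15);   // fits a vspltis immediate
        assert(Half + Half == V);            // vaddu[bhw]m tmp, tmp
      } else if (V > 0) {                    // odd, positive
        assert(V - 16 >= -16 && V - 16 <= 15);
        assert((V - 16) - (-16) == V);       // vsubu[bhw]m tmp1, tmp2
      } else {                               // odd, negative
        assert(V + 16 >= -16 && V + 16 <= 15);
        assert((V + 16) + (-16) == V);       // vaddu[bhw]m tmp1, tmp2
      }
    }
    return 0;
  }

A splat of +16 is not encodable (the immediate range tops out at 15),
but -16 is; that is why the positive odd case subtracts a splat of -16
and the negative odd case adds one.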
 llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp | 59 ++++++++++++++---
 llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 35 ++++-------
 llvm/lib/Target/PowerPC/PPCISelLowering.h   |  5 +-
 llvm/test/CodeGen/PowerPC/vaddsplat.ll      | 98 +++++++++++++++++++++++++----
 llvm/test/CodeGen/PowerPC/vec_constants.ll  |  1 -
 5 files changed, 149 insertions(+), 49 deletions(-)

diff --git a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
index 01d731a..1453506 100644
--- a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -1323,34 +1323,75 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
                                 SDValue(Tmp, 0), GA);
   }
   case PPCISD::VADD_SPLAT: {
-    // Convert: VADD_SPLAT elt, size
-    // Into:    tmp = VSPLTIS[BHW] elt
-    //          VADDU[BHW]M tmp, tmp
-    // Where:   [BHW] = B for size = 1, H for size = 2, W for size = 4
+    // This expands into one of three sequences, depending on whether
+    // the first operand is odd or even, positive or negative.
     assert(isa<ConstantSDNode>(N->getOperand(0)) &&
            isa<ConstantSDNode>(N->getOperand(1)) &&
            "Invalid operand on VADD_SPLAT!");
+
+    int Elt     = N->getConstantOperandVal(0);
     int EltSize = N->getConstantOperandVal(1);
-    unsigned Opc1, Opc2;
+    unsigned Opc1, Opc2, Opc3;
     EVT VT;
+
     if (EltSize == 1) {
       Opc1 = PPC::VSPLTISB;
       Opc2 = PPC::VADDUBM;
+      Opc3 = PPC::VSUBUBM;
       VT = MVT::v16i8;
     } else if (EltSize == 2) {
       Opc1 = PPC::VSPLTISH;
       Opc2 = PPC::VADDUHM;
+      Opc3 = PPC::VSUBUHM;
       VT = MVT::v8i16;
     } else {
       assert(EltSize == 4 && "Invalid element size on VADD_SPLAT!");
       Opc1 = PPC::VSPLTISW;
       Opc2 = PPC::VADDUWM;
+      Opc3 = PPC::VSUBUWM;
       VT = MVT::v4i32;
     }
-    SDValue Elt = getI32Imm(N->getConstantOperandVal(0));
-    SDNode *Tmp = CurDAG->getMachineNode(Opc1, dl, VT, Elt);
-    SDValue TmpVal = SDValue(Tmp, 0);
-    return CurDAG->getMachineNode(Opc2, dl, VT, TmpVal, TmpVal);
+
+    if ((Elt & 1) == 0) {
+      // Elt is even, in the range [-32,-18] + [16,30].
+      //
+      // Convert: VADD_SPLAT elt, size
+      // Into:    tmp = VSPLTIS[BHW] elt/2
+      //          VADDU[BHW]M tmp, tmp
+      // Where:   [BHW] = B for size = 1, H for size = 2, W for size = 4
+      SDValue EltVal = getI32Imm(Elt >> 1);
+      SDNode *Tmp = CurDAG->getMachineNode(Opc1, dl, VT, EltVal);
+      SDValue TmpVal = SDValue(Tmp, 0);
+      return CurDAG->getMachineNode(Opc2, dl, VT, TmpVal, TmpVal);
+
+    } else if (Elt > 0) {
+      // Elt is odd and positive, in the range [17,31].
+      //
+      // Convert: VADD_SPLAT elt, size
+      // Into:    tmp1 = VSPLTIS[BHW] elt-16
+      //          tmp2 = VSPLTIS[BHW] -16
+      //          VSUBU[BHW]M tmp1, tmp2
+      SDValue EltVal = getI32Imm(Elt - 16);
+      SDNode *Tmp1 = CurDAG->getMachineNode(Opc1, dl, VT, EltVal);
+      EltVal = getI32Imm(-16);
+      SDNode *Tmp2 = CurDAG->getMachineNode(Opc1, dl, VT, EltVal);
+      return CurDAG->getMachineNode(Opc3, dl, VT, SDValue(Tmp1, 0),
+                                    SDValue(Tmp2, 0));
+
+    } else {
+      // Elt is odd and negative, in the range [-31,-17].
+      //
+      // Convert: VADD_SPLAT elt, size
+      // Into:    tmp1 = VSPLTIS[BHW] elt+16
+      //          tmp2 = VSPLTIS[BHW] -16
+      //          VADDU[BHW]M tmp1, tmp2
+      SDValue EltVal = getI32Imm(Elt + 16);
+      SDNode *Tmp1 = CurDAG->getMachineNode(Opc1, dl, VT, EltVal);
+      EltVal = getI32Imm(-16);
+      SDNode *Tmp2 = CurDAG->getMachineNode(Opc1, dl, VT, EltVal);
+      return CurDAG->getMachineNode(Opc2, dl, VT, SDValue(Tmp1, 0),
+                                    SDValue(Tmp2, 0));
+    }
   }
   }
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 338d73f..6d2aacd 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -5025,11 +5025,17 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
   // Two instruction sequences.
 
   // If this value is in the range [-32,30] and is even, use:
-  //    tmp = VSPLTI[bhw], result = add tmp, tmp
-  if (SextVal >= -32 && SextVal <= 30 && (SextVal & 1) == 0) {
-    // To avoid having the optimization undone by constant folding, we
-    // convert to a pseudo that will be expanded later.
-    SDValue Elt = DAG.getConstant(SextVal >> 1, MVT::i32);
+  //     VSPLTI[bhw](val/2) + VSPLTI[bhw](val/2)
+  // If this value is in the range [17,31] and is odd, use:
+  //     VSPLTI[bhw](val-16) - VSPLTI[bhw](-16)
+  // If this value is in the range [-31,-17] and is odd, use:
+  //     VSPLTI[bhw](val+16) + VSPLTI[bhw](-16)
+  // Note the last two are three-instruction sequences.
+  if (SextVal >= -32 && SextVal <= 31) {
+    // To avoid having these optimizations undone by constant folding,
+    // we convert to a pseudo that will be expanded later into one of
+    // the above forms.
+    SDValue Elt = DAG.getConstant(SextVal, MVT::i32);
     EVT VT = Op.getValueType();
     int Size = VT == MVT::v16i8 ? 1 : (VT == MVT::v8i16 ? 2 : 4);
     SDValue EltSize = DAG.getConstant(Size, MVT::i32);
@@ -5129,25 +5135,6 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
     }
   }
 
-  // Three instruction sequences.
-
-  // Odd, in range [17,31]:  (vsplti C)-(vsplti -16).
-  // FIXME: Disabled because the add gets constant folded.
-  if (0 && SextVal >= 0 && SextVal <= 31) {
-    SDValue LHS = BuildSplatI(SextVal-16, SplatSize, MVT::Other, DAG, dl);
-    SDValue RHS = BuildSplatI(-16, SplatSize, MVT::Other, DAG, dl);
-    LHS = DAG.getNode(ISD::SUB, dl, LHS.getValueType(), LHS, RHS);
-    return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), LHS);
-  }
-  // Odd, in range [-31,-17]:  (vsplti C)+(vsplti -16).
-  // FIXME: Disabled because the add gets constant folded.
-  if (0 && SextVal >= -31 && SextVal <= 0) {
-    SDValue LHS = BuildSplatI(SextVal+16, SplatSize, MVT::Other, DAG, dl);
-    SDValue RHS = BuildSplatI(-16, SplatSize, MVT::Other, DAG, dl);
-    LHS = DAG.getNode(ISD::ADD, dl, LHS.getValueType(), LHS, RHS);
-    return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), LHS);
-  }
-
   return SDValue();
 }
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h
index 7cc2d1a..1fa88f3 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.h
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -238,8 +238,9 @@ namespace llvm {
       ADDI_DTPREL_L,
 
       /// VRRC = VADD_SPLAT Elt, EltSize - Temporary node to be expanded
-      /// into an ADD of a VSPLTI with itself during instruction selection.
-      /// Necessary to avoid losing this optimization due to constant folds.
+      /// during instruction selection to optimize a BUILD_VECTOR into
+      /// operations on splats.  This is necessary to avoid losing these
+      /// optimizations due to constant folding.
       VADD_SPLAT,
 
       /// STD_32 - This is the STD instruction for use with "32-bit" registers.
diff --git a/llvm/test/CodeGen/PowerPC/vaddsplat.ll b/llvm/test/CodeGen/PowerPC/vaddsplat.ll
index b4c16c1..e65148a 100644
--- a/llvm/test/CodeGen/PowerPC/vaddsplat.ll
+++ b/llvm/test/CodeGen/PowerPC/vaddsplat.ll
@@ -1,6 +1,6 @@
 ; RUN: llc -O0 -mcpu=pwr7 <%s | FileCheck %s
 
-; Test optimization of build_vector into vadd/vsplt for 6-bit immediates.
+; Test optimizations of build_vector for 6-bit immediates.
 
 target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
 target triple = "powerpc64-unknown-linux-gnu"
@@ -9,69 +9,141 @@ target triple = "powerpc64-unknown-linux-gnu"
 %v8i16 = type <8 x i16>
 %v16i8 = type <16 x i8>
 
-define void @test_v4i32_pos(%v4i32* %P, %v4i32* %S) {
+define void @test_v4i32_pos_even(%v4i32* %P, %v4i32* %S) {
   %p = load %v4i32* %P
   %r = add %v4i32 %p, < i32 18, i32 18, i32 18, i32 18 >
   store %v4i32 %r, %v4i32* %S
   ret void
 }
 
-; CHECK: test_v4i32_pos:
+; CHECK: test_v4i32_pos_even:
 ; CHECK: vspltisw [[REG1:[0-9]+]], 9
 ; CHECK: vadduwm {{[0-9]+}}, [[REG1]], [[REG1]]
 
-define void @test_v4i32_neg(%v4i32* %P, %v4i32* %S) {
+define void @test_v4i32_neg_even(%v4i32* %P, %v4i32* %S) {
   %p = load %v4i32* %P
   %r = add %v4i32 %p, < i32 -28, i32 -28, i32 -28, i32 -28 >
   store %v4i32 %r, %v4i32* %S
   ret void
 }
 
-; CHECK: test_v4i32_neg:
+; CHECK: test_v4i32_neg_even:
 ; CHECK: vspltisw [[REG1:[0-9]+]], -14
 ; CHECK: vadduwm {{[0-9]+}}, [[REG1]], [[REG1]]
 
-define void @test_v8i16_pos(%v8i16* %P, %v8i16* %S) {
+define void @test_v8i16_pos_even(%v8i16* %P, %v8i16* %S) {
   %p = load %v8i16* %P
   %r = add %v8i16 %p, < i16 30, i16 30, i16 30, i16 30, i16 30, i16 30, i16 30, i16 30 >
   store %v8i16 %r, %v8i16* %S
   ret void
 }
 
-; CHECK: test_v8i16_pos:
+; CHECK: test_v8i16_pos_even:
 ; CHECK: vspltish [[REG1:[0-9]+]], 15
 ; CHECK: vadduhm {{[0-9]+}}, [[REG1]], [[REG1]]
 
-define void @test_v8i16_neg(%v8i16* %P, %v8i16* %S) {
+define void @test_v8i16_neg_even(%v8i16* %P, %v8i16* %S) {
   %p = load %v8i16* %P
   %r = add %v8i16 %p, < i16 -32, i16 -32, i16 -32, i16 -32, i16 -32, i16 -32, i16 -32, i16 -32 >
   store %v8i16 %r, %v8i16* %S
   ret void
 }
 
-; CHECK: test_v8i16_neg:
+; CHECK: test_v8i16_neg_even:
 ; CHECK: vspltish [[REG1:[0-9]+]], -16
 ; CHECK: vadduhm {{[0-9]+}}, [[REG1]], [[REG1]]
 
-define void @test_v16i8_pos(%v16i8* %P, %v16i8* %S) {
+define void @test_v16i8_pos_even(%v16i8* %P, %v16i8* %S) {
   %p = load %v16i8* %P
   %r = add %v16i8 %p, < i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16 >
   store %v16i8 %r, %v16i8* %S
   ret void
 }
 
-; CHECK: test_v16i8_pos:
+; CHECK: test_v16i8_pos_even:
 ; CHECK: vspltisb [[REG1:[0-9]+]], 8
 ; CHECK: vaddubm {{[0-9]+}}, [[REG1]], [[REG1]]
 
-define void @test_v16i8_neg(%v16i8* %P, %v16i8* %S) {
+define void @test_v16i8_neg_even(%v16i8* %P, %v16i8* %S) {
   %p = load %v16i8* %P
   %r = add %v16i8 %p, < i8 -18, i8 -18, i8 -18, i8 -18, i8 -18, i8 -18, i8 -18, i8 -18, i8 -18, i8 -18, i8 -18, i8 -18, i8 -18, i8 -18, i8 -18, i8 -18 >
   store %v16i8 %r, %v16i8* %S
   ret void
 }
 
-; CHECK: test_v16i8_neg:
+; CHECK: test_v16i8_neg_even:
 ; CHECK: vspltisb [[REG1:[0-9]+]], -9
 ; CHECK: vaddubm {{[0-9]+}}, [[REG1]], [[REG1]]
 
+define void @test_v4i32_pos_odd(%v4i32* %P, %v4i32* %S) {
+  %p = load %v4i32* %P
+  %r = add %v4i32 %p, < i32 27, i32 27, i32 27, i32 27 >
+  store %v4i32 %r, %v4i32* %S
+  ret void
+}
+
+; CHECK: test_v4i32_pos_odd:
+; CHECK: vspltisw [[REG2:[0-9]+]], -16
+; CHECK: vspltisw [[REG1:[0-9]+]], 11
+; CHECK: vsubuwm {{[0-9]+}}, [[REG1]], [[REG2]]
+
+define void @test_v4i32_neg_odd(%v4i32* %P, %v4i32* %S) {
+  %p = load %v4i32* %P
+  %r = add %v4i32 %p, < i32 -27, i32 -27, i32 -27, i32 -27 >
+  store %v4i32 %r, %v4i32* %S
+  ret void
+}
+
+; CHECK: test_v4i32_neg_odd:
+; CHECK: vspltisw [[REG2:[0-9]+]], -16
+; CHECK: vspltisw [[REG1:[0-9]+]], -11
+; CHECK: vadduwm {{[0-9]+}}, [[REG1]], [[REG2]]
+
+define void @test_v8i16_pos_odd(%v8i16* %P, %v8i16* %S) {
+  %p = load %v8i16* %P
+  %r = add %v8i16 %p, < i16 31, i16 31, i16 31, i16 31, i16 31, i16 31, i16 31, i16 31 >
+  store %v8i16 %r, %v8i16* %S
+  ret void
+}
+
+; CHECK: test_v8i16_pos_odd:
+; CHECK: vspltish [[REG2:[0-9]+]], -16
+; CHECK: vspltish [[REG1:[0-9]+]], 15
+; CHECK: vsubuhm {{[0-9]+}}, [[REG1]], [[REG2]]
+
+define void @test_v8i16_neg_odd(%v8i16* %P, %v8i16* %S) {
+  %p = load %v8i16* %P
+  %r = add %v8i16 %p, < i16 -31, i16 -31, i16 -31, i16 -31, i16 -31, i16 -31, i16 -31, i16 -31 >
+  store %v8i16 %r, %v8i16* %S
+  ret void
+}
+
+; CHECK: test_v8i16_neg_odd:
+; CHECK: vspltish [[REG2:[0-9]+]], -16
+; CHECK: vspltish [[REG1:[0-9]+]], -15
+; CHECK: vadduhm {{[0-9]+}}, [[REG1]], [[REG2]]
+
+define void @test_v16i8_pos_odd(%v16i8* %P, %v16i8* %S) {
+  %p = load %v16i8* %P
+  %r = add %v16i8 %p, < i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17 >
+  store %v16i8 %r, %v16i8* %S
+  ret void
+}
+
+; CHECK: test_v16i8_pos_odd:
+; CHECK: vspltisb [[REG2:[0-9]+]], -16
+; CHECK: vspltisb [[REG1:[0-9]+]], 1
+; CHECK: vsububm {{[0-9]+}}, [[REG1]], [[REG2]]
+
+define void @test_v16i8_neg_odd(%v16i8* %P, %v16i8* %S) {
+  %p = load %v16i8* %P
+  %r = add %v16i8 %p, < i8 -17, i8 -17, i8 -17, i8 -17, i8 -17, i8 -17, i8 -17, i8 -17, i8 -17, i8 -17, i8 -17, i8 -17, i8 -17, i8 -17, i8 -17, i8 -17 >
+  store %v16i8 %r, %v16i8* %S
+  ret void
+}
+
+; CHECK: test_v16i8_neg_odd:
+; CHECK: vspltisb [[REG2:[0-9]+]], -16
+; CHECK: vspltisb [[REG1:[0-9]+]], -1
+; CHECK: vaddubm {{[0-9]+}}, [[REG1]], [[REG2]]
+
diff --git a/llvm/test/CodeGen/PowerPC/vec_constants.ll b/llvm/test/CodeGen/PowerPC/vec_constants.ll
index 71aa359..56b41b0 100644
--- a/llvm/test/CodeGen/PowerPC/vec_constants.ll
+++ b/llvm/test/CodeGen/PowerPC/vec_constants.ll
@@ -1,5 +1,4 @@
 ; RUN: llc < %s -march=ppc32 -mcpu=g5 | FileCheck %s
-; XFAIL: *
 
 define void @test1(<4 x i32>* %P1, <4 x i32>* %P2, <4 x float>* %P3) nounwind {
 	%tmp = load <4 x i32>* %P1		; <<4 x i32>> [#uses=1]
-- 
2.7.4