return;
}
}
+ case PPCISD::LD_SPLAT: {
+ // For v16i8 and v8i16, if target has no direct move, we can still handle
+ // this without using stack.
+ if (Subtarget->hasAltivec() && !Subtarget->hasDirectMove()) {
+ SDValue ZeroReg =
+ CurDAG->getRegister(Subtarget->isPPC64() ? PPC::ZERO8 : PPC::ZERO,
+ Subtarget->isPPC64() ? MVT::i64 : MVT::i32);
+ unsigned LIOpcode = Subtarget->isPPC64() ? PPC::LI8 : PPC::LI;
+ EVT Type = N->getValueType(0);
+ if (Type == MVT::v16i8 || Type == MVT::v8i16) {
+ // v16i8 LD_SPLAT addr
+ // ======>
+ // Mask = LVSR/LVSL 0, addr
+ // LoadLow = LVX 0, addr
+ // Perm = VPERM LoadLow, LoadLow, Mask
+ // Splat = VSPLTB 15/0, Perm
+ //
+ // v8i16 LD_SPLAT addr
+ // ======>
+ // Mask = LVSR/LVSL 0, addr
+ // LoadLow = LVX 0, addr
+ // LoadHigh = LVX (LI, 1), addr
+ // Perm = VPERM LoadLow, LoadHigh, Mask
+ // Splat = VSPLTH 7/0, Perm
+ unsigned SplatOp = (Type == MVT::v16i8) ? PPC::VSPLTB : PPC::VSPLTH;
+ unsigned SplatElemIndex =
+ Subtarget->isLittleEndian() ? ((Type == MVT::v16i8) ? 15 : 7) : 0;
+
+ SDNode *Mask = CurDAG->getMachineNode(
+ Subtarget->isLittleEndian() ? PPC::LVSR : PPC::LVSL, dl, Type,
+ ZeroReg, N->getOperand(1));
+
+ SDNode *LoadLow = CurDAG->getMachineNode(
+ PPC::LVX, dl, MVT::v16i8, MVT::Other,
+ {ZeroReg, N->getOperand(1), N->getOperand(0)});
+
+ SDNode *LoadHigh = LoadLow;
+ if (Type == MVT::v8i16) {
+ LoadHigh = CurDAG->getMachineNode(
+ PPC::LVX, dl, MVT::v16i8, MVT::Other,
+ {SDValue(CurDAG->getMachineNode(
+ LIOpcode, dl, MVT::i32,
+ CurDAG->getTargetConstant(1, dl, MVT::i8)),
+ 0),
+ N->getOperand(1), SDValue(LoadLow, 1)});
+ }
+
+ CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 1), SDValue(LoadHigh, 1));
+ transferMemOperands(N, LoadHigh);
+
+ SDNode *Perm =
+ CurDAG->getMachineNode(PPC::VPERM, dl, Type, SDValue(LoadLow, 0),
+ SDValue(LoadHigh, 0), SDValue(Mask, 0));
+ CurDAG->SelectNodeTo(
+ N, SplatOp, Type,
+ CurDAG->getTargetConstant(SplatElemIndex, dl, MVT::i8),
+ SDValue(Perm, 0));
+ return;
+ }
+ }
+ break;
+ }
}
SelectCode(N);
case PPCISD::EXTRACT_VSX_REG: return "PPCISD::EXTRACT_VSX_REG";
case PPCISD::XXMFACC: return "PPCISD::XXMFACC";
case PPCISD::LD_SPLAT: return "PPCISD::LD_SPLAT";
+ case PPCISD::ZEXT_LD_SPLAT: return "PPCISD::ZEXT_LD_SPLAT";
+ case PPCISD::SEXT_LD_SPLAT: return "PPCISD::SEXT_LD_SPLAT";
case PPCISD::FNMSUB: return "PPCISD::FNMSUB";
case PPCISD::STRICT_FADDRTZ:
return "PPCISD::STRICT_FADDRTZ";
return (!LosesInfo && !APFloatToConvert.isDenormal());
}
+// Returns true if the splatted operand of the given build_vector (Op) is an
+// unindexed load that we can lower to one of the PPC load-and-splat nodes.
+// Requires VSX. On the v2i64 path, Opcode may be rewritten from LD_SPLAT to
+// ZEXT_LD_SPLAT / SEXT_LD_SPLAT when the load is an extending i32 load, so
+// the extension can be folded into the splat (lfiwzx/lfiwax).
+static bool isValidSplatLoad(const PPCSubtarget &Subtarget, const SDValue &Op,
+ unsigned &Opcode) {
+ const SDNode *InputNode = Op.getOperand(0).getNode();
+ // Only a plain (unindexed) load can be folded into a splatting load.
+ if (!InputNode || !ISD::isUNINDEXEDLoad(InputNode))
+ return false;
+
+ // All of the load-and-splat lowerings below need VSX.
+ if (!Subtarget.hasVSX())
+ return false;
+
+ EVT Ty = Op->getValueType(0);
+ // Directly splattable element types: no extension handling needed.
+ if (Ty == MVT::v2f64 || Ty == MVT::v4f32 || Ty == MVT::v4i32 ||
+ Ty == MVT::v8i16 || Ty == MVT::v16i8)
+ return true;
+
+ if (Ty == MVT::v2i64) {
+ // check the extend type if the input is i32 while the output vector type is
+ // v2i64. If neither extension matches, Opcode is left untouched
+ // (plain LD_SPLAT).
+ if (cast<LoadSDNode>(Op.getOperand(0))->getMemoryVT() == MVT::i32) {
+ if (ISD::isZEXTLoad(InputNode))
+ Opcode = PPCISD::ZEXT_LD_SPLAT;
+ if (ISD::isSEXTLoad(InputNode))
+ Opcode = PPCISD::SEXT_LD_SPLAT;
+ }
+ return true;
+ }
+ return false;
+}
+
// If this is a case we can't handle, return null and let the default
// expansion code take care of it. If we CAN select this case, and if it
// selects to a single instruction, return Op. Otherwise, if we can codegen
}
if (!BVNIsConstantSplat || SplatBitSize > 32) {
+ unsigned NewOpcode = PPCISD::LD_SPLAT;
- bool IsPermutedLoad = false;
- const SDValue *InputLoad =
- getNormalLoadInput(Op.getOperand(0), IsPermutedLoad);
// Handle load-and-splat patterns as we have instructions that will do this
// in one go.
- if (InputLoad && DAG.isSplatValue(Op, true)) {
+ if (DAG.isSplatValue(Op, true) &&
+ isValidSplatLoad(Subtarget, Op, NewOpcode)) {
+ const SDValue *InputLoad = &Op.getOperand(0);
LoadSDNode *LD = cast<LoadSDNode>(*InputLoad);
- // We have handling for 4 and 8 byte elements.
- unsigned ElementSize = LD->getMemoryVT().getScalarSizeInBits();
+ unsigned ElementSize = LD->getMemoryVT().getScalarSizeInBits() *
+ ((NewOpcode == PPCISD::LD_SPLAT) ? 1 : 2);
// Checking for a single use of this load, we have to check for vector
// width (128 bits) / ElementSize uses (since each operand of the
for (SDValue BVInOp : Op->ops())
if (BVInOp.isUndef())
NumUsesOfInputLD--;
+
+ // Exclude some cases where LD_SPLAT is worse than scalar_to_vector:
+ // The below cases should also happen for "lfiwzx/lfiwax + LE target + index
+ // 1" and "lxvrhx + BE target + index 7" and "lxvrbx + BE target + index
+ // 15", but function isValidSplatLoad() currently only returns true when
+ // the data at index 0 is not undef. So we will not get into trouble for
+ // these cases.
+ //
+ // case 1 - lfiwzx/lfiwax
+ // 1.1: load result is i32 and is sign/zero extend to i64;
+ // 1.2: build a v2i64 vector type with above loaded value;
+ // 1.3: the vector has only one value at index 0, others are all undef;
+ // 1.4: on BE target, so that lfiwzx/lfiwax does not need any permute.
+ if (NumUsesOfInputLD == 1 &&
+ (Op->getValueType(0) == MVT::v2i64 && NewOpcode != PPCISD::LD_SPLAT &&
+ !Subtarget.isLittleEndian() && Subtarget.hasVSX() &&
+ Subtarget.hasLFIWAX()))
+ return SDValue();
+
+ // case 2 - lxvrbx
+ // 2.1: load result is i8;
+ // 2.2: build a v16i8 vector with above loaded value;
+ // 2.3: the vector has only one value at index 0, others are all undef;
+ // 2.4: on LE target, so that lxvrbx does not need any permute.
+ if (NumUsesOfInputLD == 1 && Subtarget.isLittleEndian() &&
+ Subtarget.isISA3_1() && Op->getValueType(0) == MVT::v16i8)
+ return SDValue();
+
+ // case 3 - lxvrhx
+ // 3.1: load result is i16;
+ // 3.2: build a v8i16 vector with above loaded value;
+ // 3.3: the vector has only one value at index 0, others are all undef;
+ // 3.4: on LE target, so that lxvrhx does not need any permute.
+ if (NumUsesOfInputLD == 1 && Subtarget.isLittleEndian() &&
+ Subtarget.isISA3_1() && Op->getValueType(0) == MVT::v8i16)
+ return SDValue();
+
assert(NumUsesOfInputLD > 0 && "No uses of input LD of a build_vector?");
if (InputLoad->getNode()->hasNUsesOfValue(NumUsesOfInputLD, 0) &&
- ((Subtarget.hasVSX() && ElementSize == 64) ||
- (Subtarget.hasP9Vector() && ElementSize == 32))) {
+ Subtarget.hasVSX()) {
SDValue Ops[] = {
LD->getChain(), // Chain
LD->getBasePtr(), // Ptr
DAG.getValueType(Op.getValueType()) // VT
};
SDValue LdSplt = DAG.getMemIntrinsicNode(
- PPCISD::LD_SPLAT, dl, DAG.getVTList(Op.getValueType(), MVT::Other),
- Ops, LD->getMemoryVT(), LD->getMemOperand());
+ NewOpcode, dl, DAG.getVTList(Op.getValueType(), MVT::Other), Ops,
+ LD->getMemoryVT(), LD->getMemOperand());
// Replace all uses of the output chain of the original load with the
// output chain of the new load.
DAG.ReplaceAllUsesOfValueWith(InputLoad->getValue(1),
/// instructions such as LXVDSX, LXVWSX.
LD_SPLAT,
+ /// VSRC, CHAIN = ZEXT_LD_SPLAT, CHAIN, Ptr - a splatting load memory
+ /// that zero-extends.
+ ZEXT_LD_SPLAT,
+
+ /// VSRC, CHAIN = SEXT_LD_SPLAT, CHAIN, Ptr - a splatting load memory
+ /// that sign-extends.
+ SEXT_LD_SPLAT,
+
/// CHAIN = STXVD2X CHAIN, VSRC, Ptr - Occurs only for little endian.
/// Maps directly to an stxvd2x instruction that will be preceded by
/// an xxswapd.
[SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
def PPCldsplat : SDNode<"PPCISD::LD_SPLAT", SDT_PPCldsplat,
[SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+def PPCzextldsplat : SDNode<"PPCISD::ZEXT_LD_SPLAT", SDT_PPCldsplat,
+ [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+def PPCsextldsplat : SDNode<"PPCISD::SEXT_LD_SPLAT", SDT_PPCldsplat,
+ [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
def PPCSToV : SDNode<"PPCISD::SCALAR_TO_VECTOR_PERMUTED",
SDTypeProfile<1, 1, []>, []>;
def : Pat<(v4f32 (build_vector f32:$A, f32:$A, f32:$A, f32:$A)),
(v4f32 (XXSPLTW (v4f32 (XSCVDPSPN $A)), 0))>;
+
+// Splat loads.
def : Pat<(v2f64 (PPCldsplat ForceXForm:$A)),
(v2f64 (LXVDSX ForceXForm:$A))>;
+def : Pat<(v4f32 (PPCldsplat ForceXForm:$A)),
+ (v4f32 (XXSPLTW (SUBREG_TO_REG (i64 1), (LFIWZX ForceXForm:$A), sub_64), 1))>;
def : Pat<(v2i64 (PPCldsplat ForceXForm:$A)),
(v2i64 (LXVDSX ForceXForm:$A))>;
+def : Pat<(v4i32 (PPCldsplat ForceXForm:$A)),
+ (v4i32 (XXSPLTW (SUBREG_TO_REG (i64 1), (LFIWZX ForceXForm:$A), sub_64), 1))>;
+def : Pat<(v2i64 (PPCzextldsplat ForceXForm:$A)),
+ (v2i64 (XXPERMDIs (LFIWZX ForceXForm:$A), 0))>;
+def : Pat<(v2i64 (PPCsextldsplat ForceXForm:$A)),
+ (v2i64 (XXPERMDIs (LFIWAX ForceXForm:$A), 0))>;
// Build vectors of floating point converted to i64.
def : Pat<(v2i64 (build_vector FltToLong.A, FltToLong.A)),
def : Pat<(v4i32 (build_vector immSExt5NonZero:$A, immSExt5NonZero:$A,
immSExt5NonZero:$A, immSExt5NonZero:$A)),
(v4i32 (VSPLTISW imm:$A))>;
+
+// Splat loads.
+// Note that we use MTVSRD without checking PPC64 because we only care about
+// the lowest 16/8 bits.
+def : Pat<(v8i16 (PPCldsplat ForceXForm:$A)),
+ (v8i16 (VSPLTHs 3, (MTVSRD (INSERT_SUBREG (i64 (IMPLICIT_DEF)), (LHZX ForceXForm:$A), sub_32))))>;
+def : Pat<(v16i8 (PPCldsplat ForceXForm:$A)),
+ (v16i8 (VSPLTBs 7, (MTVSRD (INSERT_SUBREG (i64 (IMPLICIT_DEF)), (LBZX ForceXForm:$A), sub_32))))>;
} // HasVSX, HasDirectMove
// Big endian VSX subtarget with direct moves.
(v4f32 (LXVWSX ForceXForm:$A))>;
def : Pat<(v4i32 (PPCldsplat ForceXForm:$A)),
(v4i32 (LXVWSX ForceXForm:$A))>;
+def : Pat<(v8i16 (PPCldsplat ForceXForm:$A)),
+ (v8i16 (VSPLTHs 3, (LXSIHZX ForceXForm:$A)))>;
+def : Pat<(v16i8 (PPCldsplat ForceXForm:$A)),
+ (v16i8 (VSPLTBs 7, (LXSIBZX ForceXForm:$A)))>;
} // HasVSX, HasP9Vector
// Any Power9 VSX subtarget with equivalent length but better Power10 VSX
ToErase = &MI;
Simplified = true;
}
- } else if ((Immed == 0 || Immed == 3) && DefOpc == PPC::XXPERMDIs &&
+ } else if ((Immed == 0 || Immed == 3 || Immed == 2) &&
+ DefOpc == PPC::XXPERMDIs &&
(DefMI->getOperand(2).getImm() == 0 ||
DefMI->getOperand(2).getImm() == 3)) {
+ ToErase = &MI;
+ Simplified = true;
+ // Swap of a splat, convert to copy.
+ if (Immed == 2) {
+ LLVM_DEBUG(dbgs() << "Optimizing swap(splat) => copy(splat): ");
+ LLVM_DEBUG(MI.dump());
+ BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(PPC::COPY),
+ MI.getOperand(0).getReg())
+ .add(MI.getOperand(1));
+ break;
+ }
// Splat fed by another splat - switch the output of the first
// and remove the second.
DefMI->getOperand(0).setReg(MI.getOperand(0).getReg());
- ToErase = &MI;
- Simplified = true;
LLVM_DEBUG(dbgs() << "Removing redundant splat: ");
LLVM_DEBUG(MI.dump());
}
define dso_local <16 x i8> @no_RAUW_in_combine_during_legalize(i32* nocapture readonly %ptr, i32 signext %offset) local_unnamed_addr #0 {
; CHECK-P8-LABEL: no_RAUW_in_combine_during_legalize:
; CHECK-P8: # %bb.0: # %entry
-; CHECK-P8-NEXT: addis r5, r2, .LCPI16_0@toc@ha
; CHECK-P8-NEXT: sldi r4, r4, 2
-; CHECK-P8-NEXT: xxlxor v4, v4, v4
-; CHECK-P8-NEXT: addi r5, r5, .LCPI16_0@toc@l
-; CHECK-P8-NEXT: lxsiwzx v2, r3, r4
-; CHECK-P8-NEXT: lvx v3, 0, r5
-; CHECK-P8-NEXT: vperm v2, v4, v2, v3
+; CHECK-P8-NEXT: xxlxor v3, v3, v3
+; CHECK-P8-NEXT: lfiwzx f0, r3, r4
+; CHECK-P8-NEXT: xxspltd v2, f0, 0
+; CHECK-P8-NEXT: vmrglb v2, v3, v2
; CHECK-P8-NEXT: blr
;
; CHECK-P9-LABEL: no_RAUW_in_combine_during_legalize:
; CHECK-P9: # %bb.0: # %entry
; CHECK-P9-NEXT: sldi r4, r4, 2
-; CHECK-P9-NEXT: xxlxor v4, v4, v4
-; CHECK-P9-NEXT: lxsiwzx v2, r3, r4
-; CHECK-P9-NEXT: addis r3, r2, .LCPI16_0@toc@ha
-; CHECK-P9-NEXT: addi r3, r3, .LCPI16_0@toc@l
-; CHECK-P9-NEXT: lxv v3, 0(r3)
-; CHECK-P9-NEXT: vperm v2, v4, v2, v3
+; CHECK-P9-NEXT: xxlxor v3, v3, v3
+; CHECK-P9-NEXT: lfiwzx f0, r3, r4
+; CHECK-P9-NEXT: xxspltd v2, f0, 0
+; CHECK-P9-NEXT: vmrglb v2, v3, v2
; CHECK-P9-NEXT: blr
;
; CHECK-P9-BE-LABEL: no_RAUW_in_combine_during_legalize:
; CHECK-P7-LABEL: no_RAUW_in_combine_during_legalize:
; CHECK-P7: # %bb.0: # %entry
; CHECK-P7-NEXT: sldi r4, r4, 2
-; CHECK-P7-NEXT: addi r5, r1, -16
; CHECK-P7-NEXT: xxlxor v3, v3, v3
-; CHECK-P7-NEXT: lwzx r3, r3, r4
-; CHECK-P7-NEXT: std r3, -16(r1)
-; CHECK-P7-NEXT: lxvd2x vs0, 0, r5
-; CHECK-P7-NEXT: xxswapd v2, vs0
+; CHECK-P7-NEXT: lfiwzx f0, r3, r4
+; CHECK-P7-NEXT: xxspltd v2, f0, 0
; CHECK-P7-NEXT: vmrglb v2, v3, v2
; CHECK-P7-NEXT: blr
entry:
define dso_local void @testByteSplat() #0 {
; CHECK-P8-LABEL: testByteSplat:
; CHECK-P8: # %bb.0: # %entry
-; CHECK-P8-NEXT: lbz r3, 0(r3)
+; CHECK-P8-NEXT: lbzx r3, 0, r3
; CHECK-P8-NEXT: mtvsrd v2, r3
; CHECK-P8-NEXT: vspltb v2, v2, 7
; CHECK-P8-NEXT: stvx v2, 0, r3
;
; CHECK-P7-LABEL: testByteSplat:
; CHECK-P7: # %bb.0: # %entry
-; CHECK-P7-NEXT: lbz r3, 0(r3)
-; CHECK-P7-NEXT: stb r3, -16(r1)
-; CHECK-P7-NEXT: addi r3, r1, -16
-; CHECK-P7-NEXT: lvx v2, 0, r3
+; CHECK-P7-NEXT: lvsr v2, 0, r3
+; CHECK-P7-NEXT: lvx v3, 0, r3
+; CHECK-P7-NEXT: vperm v2, v3, v3, v2
; CHECK-P7-NEXT: vspltb v2, v2, 15
; CHECK-P7-NEXT: stvx v2, 0, r3
; CHECK-P7-NEXT: blr
;
; P7-LABEL: test2:
; P7: # %bb.0: # %entry
-; P7-NEXT: lwz r4, 12(r4)
-; P7-NEXT: addi r5, r1, -16
-; P7-NEXT: stw r4, -16(r1)
-; P7-NEXT: lxvw4x vs0, 0, r5
-; P7-NEXT: xxspltw vs0, vs0, 0
+; P7-NEXT: addi r4, r4, 12
+; P7-NEXT: lfiwzx f0, 0, r4
+; P7-NEXT: xxspltw vs0, vs0, 1
; P7-NEXT: stxvw4x vs0, 0, r3
; P7-NEXT: blr
entry:
;
; P7-LABEL: test3:
; P7: # %bb.0: # %entry
-; P7-NEXT: lwz r4, 12(r4)
-; P7-NEXT: addi r5, r1, -16
-; P7-NEXT: stw r4, -16(r1)
-; P7-NEXT: lxvw4x vs0, 0, r5
-; P7-NEXT: xxspltw vs0, vs0, 0
+; P7-NEXT: addi r4, r4, 12
+; P7-NEXT: lfiwzx f0, 0, r4
+; P7-NEXT: xxspltw vs0, vs0, 1
; P7-NEXT: stxvw4x vs0, 0, r3
; P7-NEXT: blr
entry:
ret void
}
+
; v2i64
define dso_local void @test4(<2 x i64>* nocapture %c, i64* nocapture readonly %a) local_unnamed_addr {
; P9-LABEL: test4:
; P9-LABEL: test5:
; P9: # %bb.0: # %entry
; P9-NEXT: lfiwax f0, 0, r4
-; P9-NEXT: xxspltd vs0, vs0, 0
+; P9-NEXT: xxspltd vs0, f0, 0
; P9-NEXT: stxv vs0, 0(r3)
; P9-NEXT: blr
;
; P8-LABEL: test5:
; P8: # %bb.0: # %entry
; P8-NEXT: lfiwax f0, 0, r4
-; P8-NEXT: xxspltd vs0, vs0, 0
+; P8-NEXT: xxspltd vs0, f0, 0
; P8-NEXT: stxvd2x vs0, 0, r3
; P8-NEXT: blr
;
; P7-LABEL: test5:
; P7: # %bb.0: # %entry
-; P7-NEXT: lwa r4, 0(r4)
-; P7-NEXT: addi r5, r1, -16
-; P7-NEXT: std r4, -8(r1)
-; P7-NEXT: std r4, -16(r1)
-; P7-NEXT: lxvd2x vs0, 0, r5
+; P7-NEXT: lfiwax f0, 0, r4
+; P7-NEXT: xxspltd vs0, f0, 0
; P7-NEXT: stxvd2x vs0, 0, r3
; P7-NEXT: blr
entry:
; P9-LABEL: test6:
; P9: # %bb.0: # %entry
; P9-NEXT: lfiwzx f0, 0, r4
-; P9-NEXT: xxspltd vs0, vs0, 0
+; P9-NEXT: xxspltd vs0, f0, 0
; P9-NEXT: stxv vs0, 0(r3)
; P9-NEXT: blr
;
; P8-LABEL: test6:
; P8: # %bb.0: # %entry
; P8-NEXT: lfiwzx f0, 0, r4
-; P8-NEXT: xxspltd vs0, vs0, 0
+; P8-NEXT: xxspltd vs0, f0, 0
; P8-NEXT: stxvd2x vs0, 0, r3
; P8-NEXT: blr
;
; P7-LABEL: test6:
; P7: # %bb.0: # %entry
-; P7-NEXT: lwz r4, 0(r4)
-; P7-NEXT: addi r5, r1, -16
-; P7-NEXT: std r4, -8(r1)
-; P7-NEXT: std r4, -16(r1)
-; P7-NEXT: lxvd2x vs0, 0, r5
+; P7-NEXT: lfiwzx f0, 0, r4
+; P7-NEXT: xxspltd vs0, f0, 0
; P7-NEXT: stxvd2x vs0, 0, r3
; P7-NEXT: blr
entry:
;
; P8-LABEL: test7:
; P8: # %bb.0: # %entry
-; P8-NEXT: lhz r4, 0(r4)
+; P8-NEXT: lhzx r4, 0, r4
; P8-NEXT: mtvsrd v2, r4
; P8-NEXT: vsplth v2, v2, 3
; P8-NEXT: stvx v2, 0, r3
;
; P7-LABEL: test7:
; P7: # %bb.0: # %entry
-; P7-NEXT: lhz r4, 0(r4)
-; P7-NEXT: addi r5, r1, -16
-; P7-NEXT: sth r4, -16(r1)
-; P7-NEXT: lxvw4x v2, 0, r5
+; P7-NEXT: li r5, 1
+; P7-NEXT: lvx v2, 0, r4
+; P7-NEXT: lvsl v4, 0, r4
+; P7-NEXT: lvx v3, r5, r4
+; P7-NEXT: vperm v2, v2, v3, v4
; P7-NEXT: vsplth v2, v2, 0
; P7-NEXT: stxvw4x v2, 0, r3
; P7-NEXT: blr
;
; P8-LABEL: test8:
; P8: # %bb.0: # %entry
-; P8-NEXT: lbz r4, 0(r4)
+; P8-NEXT: lbzx r4, 0, r4
; P8-NEXT: mtvsrd v2, r4
; P8-NEXT: vspltb v2, v2, 7
; P8-NEXT: stvx v2, 0, r3
;
; P7-LABEL: test8:
; P7: # %bb.0: # %entry
-; P7-NEXT: lbz r4, 0(r4)
-; P7-NEXT: addi r5, r1, -16
-; P7-NEXT: stb r4, -16(r1)
-; P7-NEXT: lxvw4x v2, 0, r5
+; P7-NEXT: lvsl v2, 0, r4
+; P7-NEXT: lvx v3, 0, r4
+; P7-NEXT: vperm v2, v3, v3, v2
; P7-NEXT: vspltb v2, v2, 0
; P7-NEXT: stxvw4x v2, 0, r3
; P7-NEXT: blr
; P9LE-LABEL: s2v_test6:
; P9LE: # %bb.0: # %entry
; P9LE-NEXT: lfiwax f0, 0, r3
-; P9LE-NEXT: xxspltd v2, vs0, 0
+; P9LE-NEXT: xxspltd v2, f0, 0
; P9LE-NEXT: blr
;
; P9BE-LABEL: s2v_test6:
; P9BE: # %bb.0: # %entry
; P9BE-NEXT: lfiwax f0, 0, r3
-; P9BE-NEXT: xxspltd v2, vs0, 0
+; P9BE-NEXT: xxspltd v2, f0, 0
; P9BE-NEXT: blr
;
; P8LE-LABEL: s2v_test6:
; P8LE: # %bb.0: # %entry
; P8LE-NEXT: lfiwax f0, 0, r3
-; P8LE-NEXT: xxspltd v2, vs0, 0
+; P8LE-NEXT: xxspltd v2, f0, 0
; P8LE-NEXT: blr
;
; P8BE-LABEL: s2v_test6:
; P8BE: # %bb.0: # %entry
; P8BE-NEXT: lfiwax f0, 0, r3
-; P8BE-NEXT: xxspltd v2, vs0, 0
+; P8BE-NEXT: xxspltd v2, f0, 0
; P8BE-NEXT: blr
; P9LE-LABEL: s2v_test7:
; P9LE: # %bb.0: # %entry
; P9LE-NEXT: lfiwax f0, 0, r3
-; P9LE-NEXT: xxspltd v2, vs0, 0
+; P9LE-NEXT: xxspltd v2, f0, 0
; P9LE-NEXT: blr
;
; P9BE-LABEL: s2v_test7:
; P9BE: # %bb.0: # %entry
; P9BE-NEXT: lfiwax f0, 0, r3
-; P9BE-NEXT: xxspltd v2, vs0, 0
+; P9BE-NEXT: xxspltd v2, f0, 0
; P9BE-NEXT: blr
;
; P8LE-LABEL: s2v_test7:
; P8LE: # %bb.0: # %entry
; P8LE-NEXT: lfiwax f0, 0, r3
-; P8LE-NEXT: xxspltd v2, vs0, 0
+; P8LE-NEXT: xxspltd v2, f0, 0
; P8LE-NEXT: blr
;
; P8BE-LABEL: s2v_test7:
; P8BE: # %bb.0: # %entry
; P8BE-NEXT: lfiwax f0, 0, r3
-; P8BE-NEXT: xxspltd v2, vs0, 0
+; P8BE-NEXT: xxspltd v2, f0, 0
; P8BE-NEXT: blr