ReplaceNode(Node, Extract.getNode());
return;
}
+  case RISCVISD::VMV_V_X_VL:
+  case RISCVISD::VFMV_V_F_VL: {
+    // Try to match splat of a scalar load to a strided load with stride of x0.
+    SDValue Src = Node->getOperand(0);
+    auto *Ld = dyn_cast<LoadSDNode>(Src);
+    if (!Ld)
+      break;
+    EVT MemVT = Ld->getMemoryVT();
+    // The memory VT should be the same size as the element type.
+    if (MemVT.getStoreSize() != VT.getVectorElementType().getStoreSize())
+      break;
+    if (!IsProfitableToFold(Src, Node, Node) ||
+        !IsLegalToFold(Src, Node, Node, TM.getOptLevel()))
+      break;
+
+    SDValue VL;
+    selectVLOp(Node->getOperand(1), VL);
+
+    unsigned ScalarSize = VT.getScalarSizeInBits();
+    SDValue SEW = CurDAG->getTargetConstant(ScalarSize, DL, XLenVT);
+
+    SDValue Operands[] = {Ld->getBasePtr(),
+                          CurDAG->getRegister(RISCV::X0, XLenVT), VL, SEW,
+                          Ld->getChain()};
+
+    RISCVVLMUL LMUL = RISCVTargetLowering::getLMUL(VT);
+    const RISCV::VLEPseudo *P = RISCV::getVLEPseudo(
+        /*IsMasked*/ false, /*IsStrided*/ true, /*FF*/ false, ScalarSize,
+        static_cast<unsigned>(LMUL));
+    // The strided-load pseudo reads memory, so it must carry a chain result;
+    // Node's own VT list has no MVT::Other, so build the list explicitly.
+    MachineSDNode *Load = CurDAG->getMachineNode(
+        P->Pseudo, DL, CurDAG->getVTList({VT, MVT::Other}), Operands);
+
+    // Attach the original load's memory operand. Note that Node is the splat
+    // node, never a MemSDNode, so the memrefs must come from Ld directly.
+    CurDAG->setNodeMemRefs(Load, {Ld->getMemOperand()});
+
+    // Reroute users of the folded load's chain to the new load's chain.
+    ReplaceUses(Src.getValue(1), SDValue(Load, 1));
+    ReplaceNode(Node, Load);
+    return;
+  }
}
// Select the default instruction.
SDValue NewAddr = DAG.getMemBasePlusOffset(Ld->getBasePtr(),
TypeSize::Fixed(Offset), DL);
- SDVTList VTs = DAG.getVTList({ContainerVT, MVT::Other});
- SDValue IntID =
- DAG.getTargetConstant(Intrinsic::riscv_vlse, DL, XLenVT);
- SDValue Ops[] = {Ld->getChain(), IntID, NewAddr,
- DAG.getRegister(RISCV::X0, XLenVT), VL};
- SDValue NewLoad = DAG.getMemIntrinsicNode(
- ISD::INTRINSIC_W_CHAIN, DL, VTs, Ops, SVT,
- DAG.getMachineFunction().getMachineMemOperand(
- Ld->getMemOperand(), Offset, SVT.getStoreSize()));
- DAG.makeEquivalentMemoryOrdering(Ld, NewLoad);
- return convertFromScalableVector(VT, NewLoad, DAG, Subtarget);
+ // If this is SEW=64 on RV32, use a strided load with a stride of x0.
+ if (SVT.isInteger() && SVT.bitsGT(XLenVT)) {
+ SDVTList VTs = DAG.getVTList({ContainerVT, MVT::Other});
+ SDValue IntID =
+ DAG.getTargetConstant(Intrinsic::riscv_vlse, DL, XLenVT);
+ SDValue Ops[] = {Ld->getChain(), IntID, NewAddr,
+ DAG.getRegister(RISCV::X0, XLenVT), VL};
+ SDValue NewLoad = DAG.getMemIntrinsicNode(
+ ISD::INTRINSIC_W_CHAIN, DL, VTs, Ops, SVT,
+ DAG.getMachineFunction().getMachineMemOperand(
+ Ld->getMemOperand(), Offset, SVT.getStoreSize()));
+ DAG.makeEquivalentMemoryOrdering(Ld, NewLoad);
+ return convertFromScalableVector(VT, NewLoad, DAG, Subtarget);
+ }
+
+ // Otherwise use a scalar load and splat. This will give the best
+ // opportunity to fold a splat into the operation. ISel can turn it into
+ // the x0 strided load if we aren't able to fold away the select.
+ if (SVT.isFloatingPoint())
+ V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
+ Ld->getPointerInfo().getWithOffset(Offset),
+ Ld->getOriginalAlign(),
+ Ld->getMemOperand()->getFlags());
+ else
+ V = DAG.getExtLoad(ISD::SEXTLOAD, DL, XLenVT, Ld->getChain(), NewAddr,
+ Ld->getPointerInfo().getWithOffset(Offset), SVT,
+ Ld->getOriginalAlign(),
+ Ld->getMemOperand()->getFlags());
+ DAG.makeEquivalentMemoryOrdering(Ld, V);
+
+ unsigned Opc =
+ VT.isFloatingPoint() ? RISCVISD::VFMV_V_F_VL : RISCVISD::VMV_V_X_VL;
+ SDValue Splat = DAG.getNode(Opc, DL, ContainerVT, V, VL);
+ return convertFromScalableVector(VT, Splat, DAG, Subtarget);
}
V1 = convertToScalableVector(ContainerVT, V1, DAG, Subtarget);
define void @buildvec_dominant0_v4f32(<4 x float>* %x) {
; CHECK-LABEL: buildvec_dominant0_v4f32:
; CHECK: # %bb.0:
-; CHECK-NEXT: lui a1, %hi(.LCPI1_0)
-; CHECK-NEXT: flw ft0, %lo(.LCPI1_0)(a1)
-; CHECK-NEXT: fmv.w.x ft1, zero
; CHECK-NEXT: vsetivli a1, 4, e32,m1,ta,mu
-; CHECK-NEXT: vfmv.s.f v25, ft1
-; CHECK-NEXT: vfmv.v.f v26, ft0
+; CHECK-NEXT: lui a1, %hi(.LCPI1_0)
+; CHECK-NEXT: addi a1, a1, %lo(.LCPI1_0)
+; CHECK-NEXT: vlse32.v v25, (a1), zero
+; CHECK-NEXT: fmv.w.x ft0, zero
+; CHECK-NEXT: vfmv.s.f v26, ft0
; CHECK-NEXT: vsetivli a1, 3, e32,m1,tu,mu
-; CHECK-NEXT: vslideup.vi v26, v25, 2
+; CHECK-NEXT: vslideup.vi v25, v26, 2
; CHECK-NEXT: vsetivli a1, 4, e32,m1,ta,mu
-; CHECK-NEXT: vse32.v v26, (a0)
+; CHECK-NEXT: vse32.v v25, (a0)
; CHECK-NEXT: ret
store <4 x float> <float 2.0, float 2.0, float 0.0, float 2.0>, <4 x float>* %x
ret void
; RV32-LABEL: vrgather_shuffle_xv_v4f64:
; RV32: # %bb.0:
; RV32-NEXT: addi a0, zero, 12
-; RV32-NEXT: lui a1, %hi(.LCPI7_0)
-; RV32-NEXT: fld ft0, %lo(.LCPI7_0)(a1)
; RV32-NEXT: vsetivli a1, 1, e8,m1,ta,mu
; RV32-NEXT: vmv.s.x v0, a0
-; RV32-NEXT: vsetivli a0, 4, e64,m2,ta,mu
-; RV32-NEXT: vfmv.v.f v26, ft0
+; RV32-NEXT: lui a0, %hi(.LCPI7_0)
+; RV32-NEXT: addi a0, a0, %lo(.LCPI7_0)
+; RV32-NEXT: vsetivli a1, 4, e64,m2,ta,mu
+; RV32-NEXT: vlse64.v v26, (a0), zero
; RV32-NEXT: lui a0, %hi(.LCPI7_1)
; RV32-NEXT: addi a0, a0, %lo(.LCPI7_1)
; RV32-NEXT: vsetivli a1, 4, e16,m1,ta,mu
; RV64-NEXT: vmv.s.x v0, a0
; RV64-NEXT: lui a0, %hi(.LCPI7_0)
; RV64-NEXT: addi a0, a0, %lo(.LCPI7_0)
-; RV64-NEXT: lui a1, %hi(.LCPI7_1)
-; RV64-NEXT: fld ft0, %lo(.LCPI7_1)(a1)
; RV64-NEXT: vsetivli a1, 4, e64,m2,ta,mu
; RV64-NEXT: vle64.v v28, (a0)
-; RV64-NEXT: vfmv.v.f v26, ft0
+; RV64-NEXT: lui a0, %hi(.LCPI7_1)
+; RV64-NEXT: addi a0, a0, %lo(.LCPI7_1)
+; RV64-NEXT: vlse64.v v26, (a0), zero
; RV64-NEXT: vsetivli a0, 4, e64,m2,tu,mu
; RV64-NEXT: vrgather.vv v26, v8, v28, v0.t
; RV64-NEXT: vmv2r.v v8, v26
; RV32-NEXT: vsetivli a1, 4, e16,m1,ta,mu
; RV32-NEXT: vmv.s.x v25, a0
; RV32-NEXT: vmv.v.i v28, 0
-; RV32-NEXT: lui a0, %hi(.LCPI8_0)
-; RV32-NEXT: fld ft0, %lo(.LCPI8_0)(a0)
; RV32-NEXT: vsetivli a0, 2, e16,m1,tu,mu
; RV32-NEXT: vslideup.vi v28, v25, 1
-; RV32-NEXT: vsetivli a0, 4, e64,m2,ta,mu
-; RV32-NEXT: vfmv.v.f v26, ft0
+; RV32-NEXT: lui a0, %hi(.LCPI8_0)
+; RV32-NEXT: addi a0, a0, %lo(.LCPI8_0)
+; RV32-NEXT: vsetivli a1, 4, e64,m2,ta,mu
+; RV32-NEXT: vlse64.v v26, (a0), zero
; RV32-NEXT: vsetivli a0, 4, e64,m2,tu,mu
; RV32-NEXT: vrgatherei16.vv v26, v8, v28, v0.t
; RV32-NEXT: vmv2r.v v8, v26
; RV64-NEXT: vmv.v.i v28, 0
; RV64-NEXT: vsetivli a1, 2, e64,m2,tu,mu
; RV64-NEXT: vslideup.vi v28, v26, 1
-; RV64-NEXT: lui a1, %hi(.LCPI8_0)
-; RV64-NEXT: fld ft0, %lo(.LCPI8_0)(a1)
; RV64-NEXT: vsetivli a1, 1, e8,m1,ta,mu
; RV64-NEXT: vmv.s.x v0, a0
-; RV64-NEXT: vsetivli a0, 4, e64,m2,ta,mu
-; RV64-NEXT: vfmv.v.f v26, ft0
+; RV64-NEXT: lui a0, %hi(.LCPI8_0)
+; RV64-NEXT: addi a0, a0, %lo(.LCPI8_0)
+; RV64-NEXT: vsetivli a1, 4, e64,m2,ta,mu
+; RV64-NEXT: vlse64.v v26, (a0), zero
; RV64-NEXT: vsetivli a0, 4, e64,m2,tu,mu
; RV64-NEXT: vrgather.vv v26, v8, v28, v0.t
; RV64-NEXT: vmv2r.v v8, v26
; RV32-NEXT: vmv.v.i v28, 0
; RV32-NEXT: vsetivli a3, 2, e64,m2,tu,mu
; RV32-NEXT: vslideup.vi v28, v26, 0
-; RV32-NEXT: lw a3, 20(a0)
+; RV32-NEXT: addi a3, a0, 20
; RV32-NEXT: vsetivli a4, 4, e32,m1,ta,mu
-; RV32-NEXT: lw a4, 16(a0)
-; RV32-NEXT: vmv.v.x v26, a3
-; RV32-NEXT: vmv.s.x v26, a4
+; RV32-NEXT: vlse32.v v26, (a3), zero
+; RV32-NEXT: lw a3, 16(a0)
+; RV32-NEXT: vmv.s.x v26, a3
; RV32-NEXT: vsetivli a3, 4, e64,m2,tu,mu
; RV32-NEXT: vslideup.vi v28, v26, 2
; RV32-NEXT: vsetivli a3, 2, e32,m2,ta,mu