OPCODE(OC);
OPCODE(XC);
OPCODE(CLC);
+ OPCODE(MEMSET_MVC);
OPCODE(STPCPY);
OPCODE(STRCMP);
OPCODE(SEARCH_STRING);
return MBB;
}
-MachineBasicBlock *SystemZTargetLowering::emitMemMemWrapper(
- MachineInstr &MI, MachineBasicBlock *MBB, unsigned Opcode) const {
+MachineBasicBlock *
+SystemZTargetLowering::emitMemMemWrapper(MachineInstr &MI,
+ MachineBasicBlock *MBB,
+ unsigned Opcode, bool IsMemset) const {
MachineFunction &MF = *MBB->getParent();
const SystemZInstrInfo *TII =
static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo());
MachineOperand DestBase = earlyUseOperand(MI.getOperand(0));
uint64_t DestDisp = MI.getOperand(1).getImm();
- MachineOperand SrcBase = earlyUseOperand(MI.getOperand(2));
- uint64_t SrcDisp = MI.getOperand(3).getImm();
- MachineOperand &LengthMO = MI.getOperand(4);
+ MachineOperand SrcBase = MachineOperand::CreateReg(0U, false);
+ uint64_t SrcDisp;
+
+  // If Disp does not fit in an unsigned 12-bit field, compute Base + Disp into
+  // a new virtual address register (inserted before MI) and continue with that
+  // register as Base and a displacement of zero.
+  auto foldDisplIfNeeded = [&](MachineOperand &Base, uint64_t &Disp) -> void {
+    if (isUInt<12>(Disp))
+      return;
+    Register FoldedReg =
+        MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass);
+    // Pick the load-address opcode variant that can encode this displacement.
+    unsigned LoadAddrOpc = TII->getOpcodeForOffset(SystemZ::LA, Disp);
+    BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(LoadAddrOpc),
+            FoldedReg)
+        .add(Base)
+        .addImm(Disp)
+        .addReg(0);
+    Base = MachineOperand::CreateReg(FoldedReg, false);
+    Disp = 0;
+  };
+
+ if (!IsMemset) {
+ SrcBase = earlyUseOperand(MI.getOperand(2));
+ SrcDisp = MI.getOperand(3).getImm();
+ } else {
+ SrcBase = DestBase;
+ SrcDisp = DestDisp++;
+ foldDisplIfNeeded(DestBase, DestDisp);
+ }
+
+ MachineOperand &LengthMO = MI.getOperand(IsMemset ? 2 : 4);
bool IsImmForm = LengthMO.isImm();
bool IsRegForm = !IsImmForm;
+  // Build and insert one Opcode of Length, with special treatment for memset.
+  //
+  // The new instructions go into InsMBB before InsPos. DBase/DDisp and
+  // SBase/SDisp are the destination and source addresses; Length is the number
+  // of bytes and must be in the range 1..256. For memset, the byte operand is
+  // first stored at the source address (MVI for an immediate byte, STC for a
+  // register byte) and counted as one byte of Length; Opcode is emitted only if
+  // bytes remain after that store.
+  auto insertMemMemOp = [&](MachineBasicBlock *InsMBB,
+                            MachineBasicBlock::iterator InsPos,
+                            MachineOperand DBase, uint64_t DDisp,
+                            MachineOperand SBase, uint64_t SDisp,
+                            unsigned Length) -> void {
+    assert(Length > 0 && Length <= 256 && "Building memory op with bad length.");
+    if (IsMemset) {
+      MachineOperand ByteMO = earlyUseOperand(MI.getOperand(3));
+      if (ByteMO.isImm())
+        BuildMI(*InsMBB, InsPos, DL, TII->get(SystemZ::MVI))
+          .add(SBase).addImm(SDisp).add(ByteMO);
+      else
+        BuildMI(*InsMBB, InsPos, DL, TII->get(SystemZ::STC))
+          .add(ByteMO).add(SBase).addImm(SDisp).addReg(0);
+      // The store above accounts for one byte of Length.
+      if (--Length == 0)
+        return;
+    }
+    // Use InsMBB here as well: InsPos is an iterator into InsMBB, so building
+    // into the captured MBB would be wrong if the two ever differed.
+    BuildMI(*InsMBB, InsPos, DL, TII->get(Opcode))
+      .add(DBase).addImm(DDisp).addImm(Length)
+      .add(SBase).addImm(SDisp)
+      .setMemRefs(MI.memoperands());
+  };
+
bool NeedsLoop = false;
uint64_t ImmLength = 0;
- Register LenMinus1Reg = SystemZ::NoRegister;
+ Register LenAdjReg = SystemZ::NoRegister;
if (IsImmForm) {
ImmLength = LengthMO.getImm();
- ImmLength++; // Add back the '1' subtracted originally.
+ ImmLength += IsMemset ? 2 : 1; // Add back the subtracted adjustment.
if (ImmLength == 0) {
MI.eraseFromParent();
return MBB;
NeedsLoop = true;
} else {
NeedsLoop = true;
- LenMinus1Reg = LengthMO.getReg();
+ LenAdjReg = LengthMO.getReg();
}
// When generating more than one CLC, all but the last will need to
ImmLength &= 255;
} else {
BuildMI(*MBB, MI, DL, TII->get(SystemZ::SRLG), StartCountReg)
- .addReg(LenMinus1Reg)
+ .addReg(LenAdjReg)
.addReg(0)
.addImm(8);
}
+ bool HaveSingleBase = DestBase.isIdenticalTo(SrcBase);
auto loadZeroAddress = [&]() -> MachineOperand {
Register Reg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass);
BuildMI(*MBB, MI, DL, TII->get(SystemZ::LGHI), Reg).addImm(0);
return MachineOperand::CreateReg(Reg, false);
};
- bool HaveSingleBase = DestBase.isIdenticalTo(SrcBase);
if (DestBase.isReg() && DestBase.getReg() == SystemZ::NoRegister)
DestBase = loadZeroAddress();
if (SrcBase.isReg() && SrcBase.getReg() == SystemZ::NoRegister)
DoneMBB = SystemZ::emitBlockAfter(NextMBB);
// MBB:
- // # Jump to AllDoneMBB if LenMinus1Reg is -1, or fall thru to StartMBB.
+  // # Jump to AllDoneMBB if LenAdjReg encodes a length of 0 (-1, or -2 for
+  // # memset), or fall thru to StartMBB (to MemsetOneCheckMBB for memset).
BuildMI(MBB, DL, TII->get(SystemZ::CGHI))
- .addReg(LenMinus1Reg).addImm(-1);
+ .addReg(LenAdjReg).addImm(IsMemset ? -2 : -1);
BuildMI(MBB, DL, TII->get(SystemZ::BRC))
.addImm(SystemZ::CCMASK_ICMP).addImm(SystemZ::CCMASK_CMP_EQ)
.addMBB(AllDoneMBB);
MBB->addSuccessor(AllDoneMBB);
- MBB->addSuccessor(StartMBB);
+ if (!IsMemset)
+ MBB->addSuccessor(StartMBB);
+ else {
+ // MemsetOneCheckMBB:
+ // # Jump to MemsetOneMBB for a memset of length 1, or
+ // # fall thru to StartMBB.
+ MachineBasicBlock *MemsetOneCheckMBB = SystemZ::emitBlockAfter(MBB);
+ MachineBasicBlock *MemsetOneMBB = SystemZ::emitBlockAfter(&*MF.rbegin());
+ MBB->addSuccessor(MemsetOneCheckMBB);
+ MBB = MemsetOneCheckMBB;
+ BuildMI(MBB, DL, TII->get(SystemZ::CGHI))
+ .addReg(LenAdjReg).addImm(-1);
+ BuildMI(MBB, DL, TII->get(SystemZ::BRC))
+ .addImm(SystemZ::CCMASK_ICMP).addImm(SystemZ::CCMASK_CMP_EQ)
+ .addMBB(MemsetOneMBB);
+ MBB->addSuccessor(MemsetOneMBB, {10, 100});
+ MBB->addSuccessor(StartMBB, {90, 100});
+
+ // MemsetOneMBB:
+ // # Jump back to AllDoneMBB after a single MVI or STC.
+ MBB = MemsetOneMBB;
+ insertMemMemOp(MBB, MBB->end(),
+ MachineOperand::CreateReg(StartDestReg, false), DestDisp,
+ MachineOperand::CreateReg(StartSrcReg, false), SrcDisp,
+ 1);
+ BuildMI(MBB, DL, TII->get(SystemZ::J)).addMBB(AllDoneMBB);
+ MBB->addSuccessor(AllDoneMBB);
+ }
// StartMBB:
// # Jump to DoneMBB if %StartCountReg is zero, or fall through to LoopMBB.
if (Opcode == SystemZ::MVC)
BuildMI(MBB, DL, TII->get(SystemZ::PFD))
.addImm(SystemZ::PFD_WRITE)
- .addReg(ThisDestReg).addImm(DestDisp + 768).addReg(0);
- BuildMI(MBB, DL, TII->get(Opcode))
- .addReg(ThisDestReg).addImm(DestDisp).addImm(256)
- .addReg(ThisSrcReg).addImm(SrcDisp);
+ .addReg(ThisDestReg).addImm(DestDisp - IsMemset + 768).addReg(0);
+ insertMemMemOp(MBB, MBB->end(),
+ MachineOperand::CreateReg(ThisDestReg, false), DestDisp,
+ MachineOperand::CreateReg(ThisSrcReg, false), SrcDisp, 256);
if (EndMBB) {
BuildMI(MBB, DL, TII->get(SystemZ::BRC))
.addImm(SystemZ::CCMASK_ICMP).addImm(SystemZ::CCMASK_CMP_NE)
// # Make PHIs for RemDestReg/RemSrcReg as the loop may or may not run.
// # Use EXecute Relative Long for the remainder of the bytes. The target
// instruction of the EXRL will have a length field of 1 since 0 is an
- // illegal value. The number of bytes processed becomes (%LenMinus1Reg &
+ // illegal value. The number of bytes processed becomes (%LenAdjReg &
// 0xff) + 1.
// # Fall through to AllDoneMBB.
Register RemSrcReg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass);
BuildMI(MBB, DL, TII->get(SystemZ::PHI), RemSrcReg)
.addReg(StartSrcReg).addMBB(StartMBB)
.addReg(NextSrcReg).addMBB(NextMBB);
+ if (IsMemset)
+ insertMemMemOp(MBB, MBB->end(),
+ MachineOperand::CreateReg(RemDestReg, false), DestDisp,
+ MachineOperand::CreateReg(RemSrcReg, false), SrcDisp, 1);
MachineInstrBuilder EXRL_MIB =
BuildMI(MBB, DL, TII->get(SystemZ::EXRL_Pseudo))
.addImm(Opcode)
- .addReg(LenMinus1Reg)
+ .addReg(LenAdjReg)
.addReg(RemDestReg).addImm(DestDisp)
.addReg(RemSrcReg).addImm(SrcDisp);
MBB->addSuccessor(AllDoneMBB);
while (ImmLength > 0) {
uint64_t ThisLength = std::min(ImmLength, uint64_t(256));
// The previous iteration might have created out-of-range displacements.
- // Apply them using LAY if so.
- if (!isUInt<12>(DestDisp)) {
- Register Reg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass);
- BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(SystemZ::LAY), Reg)
- .add(DestBase)
- .addImm(DestDisp)
- .addReg(0);
- DestBase = MachineOperand::CreateReg(Reg, false);
- DestDisp = 0;
- }
- if (!isUInt<12>(SrcDisp)) {
- Register Reg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass);
- BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(SystemZ::LAY), Reg)
- .add(SrcBase)
- .addImm(SrcDisp)
- .addReg(0);
- SrcBase = MachineOperand::CreateReg(Reg, false);
- SrcDisp = 0;
- }
- BuildMI(*MBB, MI, DL, TII->get(Opcode))
- .add(DestBase)
- .addImm(DestDisp)
- .addImm(ThisLength)
- .add(SrcBase)
- .addImm(SrcDisp)
- .setMemRefs(MI.memoperands());
+ // Apply them using LA/LAY if so.
+ foldDisplIfNeeded(DestBase, DestDisp);
+ foldDisplIfNeeded(SrcBase, SrcDisp);
+ insertMemMemOp(MBB, MI, DestBase, DestDisp, SrcBase, SrcDisp, ThisLength);
DestDisp += ThisLength;
SrcDisp += ThisLength;
ImmLength -= ThisLength;
case SystemZ::CLCImm:
case SystemZ::CLCReg:
return emitMemMemWrapper(MI, MBB, SystemZ::CLC);
+ case SystemZ::MemsetImmImm:
+ case SystemZ::MemsetImmReg:
+ case SystemZ::MemsetRegImm:
+ case SystemZ::MemsetRegReg:
+ return emitMemMemWrapper(MI, MBB, SystemZ::MVC, true/*IsMemset*/);
case SystemZ::CLSTLoop:
return emitStringWrapper(MI, MBB, SystemZ::CLST);
case SystemZ::MVSTLoop:
// as for MVC.
CLC,
+ // Use MVC to set a block of memory after storing the first byte.
+ MEMSET_MVC,
+
// Use an MVST-based sequence to implement stpcpy().
STPCPY,
MachineBasicBlock *emitAtomicCmpSwapW(MachineInstr &MI,
MachineBasicBlock *BB) const;
MachineBasicBlock *emitMemMemWrapper(MachineInstr &MI, MachineBasicBlock *BB,
- unsigned Opcode) const;
+ unsigned Opcode,
+ bool IsMemset = false) const;
MachineBasicBlock *emitStringWrapper(MachineInstr &MI, MachineBasicBlock *BB,
unsigned Opcode) const;
MachineBasicBlock *emitTransactionBegin(MachineInstr &MI,
let Constraints = "$R1 = $R1src";
}
+// Pseudo matching z_memset_mvc for a given kind of length operand (lenop) and
+// byte operand (byteop). It is expanded by a custom inserter.
+class MemsetPseudo<DAGOperand lenop, DAGOperand byteop>
+  : Pseudo<(outs), (ins bdaddr12only:$dest, lenop:$length, byteop:$B),
+           [(z_memset_mvc bdaddr12only:$dest, lenop:$length, byteop:$B)]> {
+  // Expansion-related properties.
+  let usesCustomInserter = 1;
+  let hasNoSchedulingInfo = 1;
+  // Side effects of the pseudo.
+  let mayLoad = 1;
+  let mayStore = 1;
+  let Defs = [CC];
+}
+
//===----------------------------------------------------------------------===//
// Multiclasses that emit both real and pseudo instructions
//===----------------------------------------------------------------------===//
def MVCLU : SideEffectTernaryMemMemRSY<"mvclu", 0xEB8E, GR128, GR128>;
}
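+// Memset pseudos, named Memset<Length><Byte>: the length and the byte value
+// are each either an immediate (Imm) or a register (Reg).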
+def MemsetImmImm : MemsetPseudo<imm64, imm32zx8trunc>;
+def MemsetImmReg : MemsetPseudo<imm64, GR32>;
+def MemsetRegImm : MemsetPseudo<ADDR64, imm32zx8trunc>;
+def MemsetRegReg : MemsetPseudo<ADDR64, GR32>;
+
// Move right.
let Predicates = [FeatureMiscellaneousExtensions3],
mayLoad = 1, mayStore = 1, Uses = [R0L] in
SDTCisPtrTy<1>,
SDTCisPtrTy<2>,
SDTCisVT<3, i64>]>;
+def SDT_ZMemsetMVC : SDTypeProfile<0, 3, // No results, three operands.
+ [SDTCisPtrTy<0>, // Operand 0: pointer-typed destination address.
+ SDTCisVT<1, i64>, // Operand 1: i64 adjusted length.
+ SDTCisVT<2, i32>]>; // Operand 2: i32 byte value to store.
def SDT_ZString : SDTypeProfile<1, 3,
[SDTCisPtrTy<0>,
SDTCisPtrTy<1>,
[SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
def z_clc : SDNode<"SystemZISD::CLC", SDT_ZMemMemLengthCC,
[SDNPHasChain, SDNPMayLoad]>;
+def z_memset_mvc : SDNode<"SystemZISD::MEMSET_MVC", SDT_ZMemsetMVC,
+ [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>; // Chained node that may store and load memory.
def z_strcmp : SDNode<"SystemZISD::STRCMP", SDT_ZStringCC,
[SDNPHasChain, SDNPMayLoad]>;
def z_stpcpy : SDNode<"SystemZISD::STPCPY", SDT_ZString,
#define DEBUG_TYPE "systemz-selectiondag-info"
-static SDVTList getMemMemVTs(unsigned Op, SelectionDAG &DAG) {
- return Op == SystemZISD::CLC ? DAG.getVTList(MVT::i32, MVT::Other)
- : DAG.getVTList(MVT::Other);
+// Amount subtracted from the byte count to form the length operand of a
+// mem-mem node: two for MEMSET_MVC, one for every other opcode.
+static unsigned getMemMemLenAdj(unsigned Op) {
+  if (Op == SystemZISD::MEMSET_MVC)
+    return 2;
+  return 1;
}
-// Emit a mem-mem operation after subtracting one from size, which will be
-// added back during pseudo expansion. As the Reg case emitted here may be
-// converted by DAGCombiner into having an Imm length, they are both emitted
-// the same way.
+// Build the DAG node for the mem-mem operation Op. MEMSET_MVC takes the
+// operands (Chain, Dst, LenAdj, Byte) and does not use Src; every other opcode
+// takes (Chain, Dst, Src, LenAdj) and does not use Byte. CLC produces an i32
+// result in addition to the chain; the other opcodes produce only the chain.
+static SDValue createMemMemNode(SelectionDAG &DAG, const SDLoc &DL, unsigned Op,
+                                SDValue Chain, SDValue Dst, SDValue Src,
+                                SDValue LenAdj, SDValue Byte) {
+  const bool HasI32Result = Op == SystemZISD::CLC;
+  SDVTList ResultVTs = HasI32Result ? DAG.getVTList(MVT::i32, MVT::Other)
+                                    : DAG.getVTList(MVT::Other);
+
+  const bool IsMemsetNode = Op == SystemZISD::MEMSET_MVC;
+  SmallVector<SDValue, 6> Operands = {Chain, Dst,
+                                      IsMemsetNode ? LenAdj : Src,
+                                      IsMemsetNode ? Byte : LenAdj};
+  return DAG.getNode(Op, DL, ResultVTs, Operands);
+}
+
+// Emit a mem-mem operation after subtracting one (or two for memset) from
+// size, which will be added back during pseudo expansion. As the Reg case
+// emitted here may be converted by DAGCombiner into having an Imm length,
+// they are both emitted the same way.
static SDValue emitMemMemImm(SelectionDAG &DAG, const SDLoc &DL, unsigned Op,
SDValue Chain, SDValue Dst, SDValue Src,
- uint64_t Size) {
- return DAG.getNode(Op, DL, getMemMemVTs(Op, DAG), Chain, Dst, Src,
- DAG.getConstant(Size - 1, DL, Src.getValueType()));
+                             uint64_t Size, SDValue Byte = SDValue()) {
+  // The length operand carries Size minus the per-opcode adjustment.
+  const unsigned LenBias = getMemMemLenAdj(Op);
+  assert(LenBias <= Size && "Adjusted length overflow.");
+  // The constant takes its type from Dst; Src may be an empty SDValue.
+  SDValue LenOperand =
+      DAG.getConstant(Size - LenBias, DL, Dst.getValueType());
+  return createMemMemNode(DAG, DL, Op, Chain, Dst, Src, LenOperand, Byte);
}
static SDValue emitMemMemReg(SelectionDAG &DAG, const SDLoc &DL, unsigned Op,
SDValue Chain, SDValue Dst, SDValue Src,
- SDValue Size) {
- SDValue LenMinus1 = DAG.getNode(ISD::ADD, DL, MVT::i64,
- DAG.getZExtOrTrunc(Size, DL, MVT::i64),
- DAG.getConstant(-1, DL, MVT::i64));
- return DAG.getNode(Op, DL, getMemMemVTs(Op, DAG), Chain, Dst, Src, LenMinus1);
+                             SDValue Size, SDValue Byte = SDValue()) {
+  // Negate in a signed 64-bit type so the added constant is the i64 value
+  // -1 (or -2 for memset).
+  const int64_t NegLenBias = -static_cast<int64_t>(getMemMemLenAdj(Op));
+  SDValue BiasedLen = DAG.getNode(ISD::ADD, DL, MVT::i64,
+                                  DAG.getZExtOrTrunc(Size, DL, MVT::i64),
+                                  DAG.getConstant(NegLenBias, DL, MVT::i64));
+  return createMemMemNode(DAG, DL, Op, Chain, Dst, Src, BiasedLen, Byte);
}
SDValue SystemZSelectionDAGInfo::EmitTargetCodeForMemcpy(
if (CByte && CByte->getZExtValue() == 0)
return emitMemMemImm(DAG, DL, SystemZISD::XC, Chain, Dst, Dst, Bytes);
- // Copy the byte to the first location and then use MVC to copy
- // it to the rest.
- Chain = DAG.getStore(Chain, DL, Byte, Dst, DstPtrInfo, Alignment);
- SDValue DstPlus1 = DAG.getNode(ISD::ADD, DL, PtrVT, Dst,
- DAG.getConstant(1, DL, PtrVT));
- return emitMemMemImm(DAG, DL, SystemZISD::MVC, Chain, DstPlus1, Dst,
- Bytes - 1);
+ return emitMemMemImm(DAG, DL, SystemZISD::MEMSET_MVC, Chain, Dst, SDValue(),
+ Bytes, DAG.getAnyExtOrTrunc(Byte, DL, MVT::i32));
}
// Variable length
// Handle the special case of a variable length memset of 0 with XC.
return emitMemMemReg(DAG, DL, SystemZISD::XC, Chain, Dst, Dst, Size);
- return SDValue();
+ return emitMemMemReg(DAG, DL, SystemZISD::MEMSET_MVC, Chain, Dst, SDValue(),
+ Size, DAG.getAnyExtOrTrunc(Byte, DL, MVT::i32));
}
// Convert the current CC value into an integer that is 0 if CC == 0,
define void @f9(i8* %dest, i8 %val) {
; CHECK-LABEL: f9:
; CHECK: stc %r3, 0(%r2)
-; CHECK: mvc 1(256,%r2), 0(%r2)
+; CHECK: mvc 1(255,%r2), 0(%r2)
+; CHECK: stc %r3, 256(%r2)
; CHECK: br %r14
call void @llvm.memset.p0i8.i32(i8* %dest, i8 %val, i32 257, i1 false)
ret void
define void @f10(i8* %dest, i8 %val) {
; CHECK-LABEL: f10:
; CHECK: stc %r3, 0(%r2)
-; CHECK: mvc 1(256,%r2), 0(%r2)
+; CHECK: mvc 1(255,%r2), 0(%r2)
+; CHECK: stc %r3, 256(%r2)
; CHECK: br %r14
call void @llvm.memset.p0i8.i64(i8* %dest, i8 %val, i64 257, i1 false)
ret void
define void @f11(i8* %dest, i8 %val) {
; CHECK-LABEL: f11:
; CHECK: stc %r3, 0(%r2)
-; CHECK: mvc 1(256,%r2), 0(%r2)
+; CHECK: mvc 1(255,%r2), 0(%r2)
+; CHECK: stc %r3, 256(%r2)
; CHECK: mvc 257(1,%r2), 256(%r2)
; CHECK: br %r14
call void @llvm.memset.p0i8.i32(i8* %dest, i8 %val, i32 258, i1 false)
define void @f12(i8* %dest, i8 %val) {
; CHECK-LABEL: f12:
; CHECK: stc %r3, 0(%r2)
-; CHECK: mvc 1(256,%r2), 0(%r2)
+; CHECK: mvc 1(255,%r2), 0(%r2)
+; CHECK: stc %r3, 256(%r2)
; CHECK: mvc 257(1,%r2), 256(%r2)
; CHECK: br %r14
call void @llvm.memset.p0i8.i64(i8* %dest, i8 %val, i64 258, i1 false)
define void @f13(i8* %dest, i8 %val) {
; CHECK-LABEL: f13:
; CHECK: stc %r3, 0(%r2)
-; CHECK: mvc 1(256,%r2), 0(%r2)
-; CHECK: mvc 257(256,%r2), 256(%r2)
-; CHECK: mvc 513(256,%r2), 512(%r2)
-; CHECK: mvc 769(256,%r2), 768(%r2)
-; CHECK: mvc 1025(256,%r2), 1024(%r2)
-; CHECK: mvc 1281(256,%r2), 1280(%r2)
+; CHECK: mvc 1(255,%r2), 0(%r2)
+; CHECK: stc %r3, 256(%r2)
+; CHECK: mvc 257(255,%r2), 256(%r2)
+; CHECK: stc %r3, 512(%r2)
+; CHECK: mvc 513(255,%r2), 512(%r2)
+; CHECK: stc %r3, 768(%r2)
+; CHECK: mvc 769(255,%r2), 768(%r2)
+; CHECK: stc %r3, 1024(%r2)
+; CHECK: mvc 1025(255,%r2), 1024(%r2)
+; CHECK: stc %r3, 1280(%r2)
+; CHECK: mvc 1281(255,%r2), 1280(%r2)
; CHECK: br %r14
- call void @llvm.memset.p0i8.i64(i8* %dest, i8 %val, i64 1537, i1 false)
+ call void @llvm.memset.p0i8.i64(i8* %dest, i8 %val, i64 1536, i1 false)
ret void
}
; Test the next size up, which uses a loop. We leave the other corner
-; cases to memcpy-01.ll.
+; cases to memcpy-01.ll and memset-07.ll.
define void @f14(i8* %dest, i8 %val) {
; CHECK-LABEL: f14:
-; CHECK: stc %r3, 0(%r2)
; CHECK: lghi [[COUNT:%r[0-5]]], 6
; CHECK: [[LABEL:\.L[^:]*]]:
-; CHECK: pfd 2, 769(%r2)
-; CHECK: mvc 1(256,%r2), 0(%r2)
+; CHECK: pfd 2, 768(%r2)
+; CHECK: stc %r3, 0(%r2)
+; CHECK: mvc 1(255,%r2), 0(%r2)
; CHECK: la %r2, 256(%r2)
; CHECK: brctg [[COUNT]], [[LABEL]]
-; CHECK: mvc 1(1,%r2), 0(%r2)
-; CHECK: br %r14
- call void @llvm.memset.p0i8.i64(i8* %dest, i8 %val, i64 1538, i1 false)
+; CHECK: stc %r3, 0(%r2)
+; CHECK-NEXT: br %r14
+ call void @llvm.memset.p0i8.i64(i8* %dest, i8 %val, i64 1537, i1 false)
ret void
}
+
+; Test (no) folding of displacement: Begins with max(uint12) - 1.
+define void @f15(i8* %dest, i8 %val) {
+; CHECK-LABEL: f15:
+; CHECK-NOT: la {{.*}}%r2
+ %addr = getelementptr i8, i8* %dest, i64 4094
+ call void @llvm.memset.p0i8.i64(i8* %addr, i8 %val, i64 256, i1 false)
+ ret void
+}
+
+; Test folding of displacement: Begins with max(uint12).
+define void @f16(i8* %dest, i8 %val) {
+; CHECK-LABEL: f16:
+; CHECK-DAG: lay %r1, 4096(%r2)
+; CHECK-DAG: stc %r3, 4095(%r2)
+ %addr = getelementptr i8, i8* %dest, i64 4095
+ call void @llvm.memset.p0i8.i64(i8* %addr, i8 %val, i64 256, i1 false)
+ ret void
+}
+
+; Test folding of only the destination displacement (with LAY): The first two
+; ops are in range.
+define void @f17(i8* %dest, i8 %val) {
+; CHECK-LABEL: f17:
+; CHECK: stc %r3, 3583(%r2)
+; CHECK-NEXT: mvc 3584(255,%r2), 3583(%r2)
+; CHECK-NEXT: stc %r3, 3839(%r2)
+; CHECK-NEXT: mvc 3840(255,%r2), 3839(%r2)
+; CHECK-NEXT: lay %r1, 4096(%r2)
+; CHECK-NEXT: stc %r3, 4095(%r2)
+; CHECK-NEXT: mvc 0(1,%r1), 4095(%r2)
+; CHECK-NEXT: br %r14
+ %addr = getelementptr i8, i8* %dest, i64 3583
+ call void @llvm.memset.p0i8.i64(i8* %addr, i8 %val, i64 514, i1 false)
+ ret void
+}
+
+; Test folding of both the destination and the source displacement with LAY:
+; The first two ops are in range.
+define void @f18(i8* %dest, i8 %val) {
+; CHECK-LABEL: f18:
+; CHECK: stc %r3, 3584(%r2)
+; CHECK-NEXT: mvc 3585(255,%r2), 3584(%r2)
+; CHECK-NEXT: stc %r3, 3840(%r2)
+; CHECK-NEXT: mvc 3841(255,%r2), 3840(%r2)
+; CHECK-NEXT: lay %r1, 4097(%r2)
+; CHECK-NEXT: lay %r2, 4096(%r2)
+; CHECK-NEXT: stc %r3, 0(%r2)
+; CHECK-NEXT: mvc 0(1,%r1), 0(%r2)
+; CHECK-NEXT: br %r14
+ %addr = getelementptr i8, i8* %dest, i64 3584
+ call void @llvm.memset.p0i8.i64(i8* %addr, i8 %val, i64 514, i1 false)
+ ret void
+}
+
define void @f13(i8* %dest) {
; CHECK-LABEL: f13:
; CHECK: mvi 0(%r2), 128
-; CHECK: mvc 1(256,%r2), 0(%r2)
+; CHECK: mvc 1(255,%r2), 0(%r2)
+; CHECK: mvi 256(%r2), 128
; CHECK: br %r14
call void @llvm.memset.p0i8.i32(i8* %dest, i8 128, i32 257, i1 false)
ret void
define void @f14(i8* %dest) {
; CHECK-LABEL: f14:
; CHECK: mvi 0(%r2), 128
-; CHECK: mvc 1(256,%r2), 0(%r2)
+; CHECK: mvc 1(255,%r2), 0(%r2)
+; CHECK: mvi 256(%r2), 128
; CHECK: br %r14
call void @llvm.memset.p0i8.i64(i8* %dest, i8 128, i64 257, i1 false)
ret void
define void @f15(i8* %dest) {
; CHECK-LABEL: f15:
; CHECK: mvi 0(%r2), 128
-; CHECK: mvc 1(256,%r2), 0(%r2)
+; CHECK: mvc 1(255,%r2), 0(%r2)
+; CHECK: mvi 256(%r2), 128
; CHECK: mvc 257(1,%r2), 256(%r2)
; CHECK: br %r14
call void @llvm.memset.p0i8.i32(i8* %dest, i8 128, i32 258, i1 false)
define void @f16(i8* %dest) {
; CHECK-LABEL: f16:
; CHECK: mvi 0(%r2), 128
-; CHECK: mvc 1(256,%r2), 0(%r2)
+; CHECK: mvc 1(255,%r2), 0(%r2)
+; CHECK: mvi 256(%r2), 128
; CHECK: mvc 257(1,%r2), 256(%r2)
; CHECK: br %r14
call void @llvm.memset.p0i8.i64(i8* %dest, i8 128, i64 258, i1 false)
define void @f37(i8* %dest) {
; CHECK-LABEL: f37:
; CHECK: mvi 0(%r2), 255
-; CHECK: mvc 1(256,%r2), 0(%r2)
+; CHECK: mvc 1(255,%r2), 0(%r2)
+; CHECK: mvi 256(%r2), 255
; CHECK: br %r14
call void @llvm.memset.p0i8.i32(i8* %dest, i8 -1, i32 257, i1 false)
ret void
define void @f38(i8* %dest) {
; CHECK-LABEL: f38:
; CHECK: mvi 0(%r2), 255
-; CHECK: mvc 1(256,%r2), 0(%r2)
+; CHECK: mvc 1(255,%r2), 0(%r2)
+; CHECK: mvi 256(%r2), 255
; CHECK: br %r14
call void @llvm.memset.p0i8.i64(i8* %dest, i8 -1, i64 257, i1 false)
ret void
define void @f39(i8* %dest) {
; CHECK-LABEL: f39:
; CHECK: mvi 0(%r2), 255
-; CHECK: mvc 1(256,%r2), 0(%r2)
+; CHECK: mvc 1(255,%r2), 0(%r2)
+; CHECK: mvi 256(%r2), 255
; CHECK: mvc 257(1,%r2), 256(%r2)
; CHECK: br %r14
call void @llvm.memset.p0i8.i32(i8* %dest, i8 -1, i32 258, i1 false)
define void @f40(i8* %dest) {
; CHECK-LABEL: f40:
; CHECK: mvi 0(%r2), 255
-; CHECK: mvc 1(256,%r2), 0(%r2)
+; CHECK: mvc 1(255,%r2), 0(%r2)
+; CHECK: mvi 256(%r2), 255
; CHECK: mvc 257(1,%r2), 256(%r2)
; CHECK: br %r14
call void @llvm.memset.p0i8.i64(i8* %dest, i8 -1, i64 258, i1 false)
--- /dev/null
+; Test memset in cases where a loop is used.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+
+declare void @llvm.memset.p0i8.i32(i8 *nocapture, i8, i32, i1) nounwind
+declare void @llvm.memset.p0i8.i64(i8 *nocapture, i8, i64, i1) nounwind
+
+; Constant length: 6 iterations and 2 bytes remainder.
+define void @f1(i8* %dest, i8 %val) {
+; CHECK-LABEL: f1:
+; CHECK: lghi [[COUNT:%r[0-5]]], 6
+; CHECK: [[LABEL:\.L[^:]*]]:
+; CHECK: pfd 2, 768(%r2)
+; CHECK: stc %r3, 0(%r2)
+; CHECK: mvc 1(255,%r2), 0(%r2)
+; CHECK: la %r2, 256(%r2)
+; CHECK: brctg [[COUNT]], [[LABEL]]
+; CHECK: stc %r3, 0(%r2)
+; CHECK-NEXT: mvc 1(1,%r2), 0(%r2)
+; CHECK-NEXT: br %r14
+ call void @llvm.memset.p0i8.i64(i8* %dest, i8 %val, i64 1538, i1 false)
+ ret void
+}
+
+; Constant length: 6 iterations and 255 bytes remainder.
+define void @f2(i8* %dest) {
+; CHECK-LABEL: f2:
+; CHECK: lghi [[COUNT:%r[0-5]]], 6
+; CHECK: [[LABEL:\.L[^:]*]]:
+; CHECK: pfd 2, 768(%r2)
+; CHECK: mvi 0(%r2), 1
+; CHECK: mvc 1(255,%r2), 0(%r2)
+; CHECK: la %r2, 256(%r2)
+; CHECK: brctg [[COUNT]], [[LABEL]]
+; CHECK: mvi 0(%r2), 1
+; CHECK-NEXT: mvc 1(254,%r2), 0(%r2)
+; CHECK-NEXT: br %r14
+ call void @llvm.memset.p0i8.i32(i8* %dest, i8 1, i32 1791, i1 false)
+ ret void
+}
+
+; Variable length, byte in register.
+define void @f3(i8* %dest, i8 %val, i64 %Len) {
+; CHECK-LABEL: f3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: aghi %r4, -2
+; CHECK-NEXT: cgibe %r4, -2, 0(%r14)
+; CHECK-NEXT: .LBB2_1:
+; CHECK-NEXT: cgije %r4, -1, .LBB2_5
+; CHECK-NEXT:# %bb.2:
+; CHECK-NEXT: srlg %r0, %r4, 8
+; CHECK-NEXT: cgije %r0, 0, .LBB2_4
+; CHECK-NEXT:.LBB2_3: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: pfd 2, 768(%r2)
+; CHECK-NEXT: stc %r3, 0(%r2)
+; CHECK-NEXT: mvc 1(255,%r2), 0(%r2)
+; CHECK-NEXT: la %r2, 256(%r2)
+; CHECK-NEXT: brctg %r0, .LBB2_3
+; CHECK-NEXT:.LBB2_4:
+; CHECK-NEXT: stc %r3, 0(%r2)
+; CHECK-NEXT: exrl %r4, .Ltmp0
+; CHECK-NEXT: br %r14
+; CHECK-NEXT:.LBB2_5:
+; CHECK-NEXT: stc %r3, 0(%r2)
+; CHECK-NEXT: br %r14
+ call void @llvm.memset.p0i8.i64(i8* %dest, i8 %val, i64 %Len, i1 false)
+ ret void
+}
+
+; Variable length, immediate byte.
+define void @f4(i8* %dest, i32 %Len) {
+; CHECK-LABEL: f4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: llgfr %r1, %r3
+; CHECK-NEXT: aghi %r1, -2
+; CHECK-NEXT: cgibe %r1, -2, 0(%r14)
+; CHECK-NEXT:.LBB3_1:
+; CHECK-NEXT: cgije %r1, -1, .LBB3_5
+; CHECK-NEXT:# %bb.2:
+; CHECK-NEXT: srlg %r0, %r1, 8
+; CHECK-NEXT: cgije %r0, 0, .LBB3_4
+; CHECK-NEXT:.LBB3_3: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: pfd 2, 768(%r2)
+; CHECK-NEXT: mvi 0(%r2), 1
+; CHECK-NEXT: mvc 1(255,%r2), 0(%r2)
+; CHECK-NEXT: la %r2, 256(%r2)
+; CHECK-NEXT: brctg %r0, .LBB3_3
+; CHECK-NEXT:.LBB3_4:
+; CHECK-NEXT: mvi 0(%r2), 1
+; CHECK-NEXT: exrl %r1, .Ltmp0
+; CHECK-NEXT: br %r14
+; CHECK-NEXT:.LBB3_5:
+; CHECK-NEXT: mvi 0(%r2), 1
+; CHECK-NEXT: br %r14
+ call void @llvm.memset.p0i8.i32(i8* %dest, i8 1, i32 %Len, i1 false)
+ ret void
+}
+
+; CHECK: .Ltmp0:
+; CHECK-NEXT: mvc 1(1,%r2), 0(%r2)
; CHECK: jg memset
define void @tail_memset(i8* nocapture %p, i8 %c, i32 %n) #0 {
entry:
- tail call void @llvm.memset.p0i8.i32(i8* %p, i8 %c, i32 %n, i1 false)
+ tail call void @llvm.memset.p0i8.i32(i8* %p, i8 %c, i32 %n, i1 true)
ret void
}