MachinePointerInfo DstPtrInfo) const {
return SDValue();
}
+
+ /// EmitTargetCodeForMemcmp - Emit target-specific code that performs a
+ /// memcmp, in cases where that is faster than a libcall. The first
+ /// returned SDValue is the result of the memcmp and the second is
+ /// the chain. Both SDValues can be null if a normal libcall should
+ /// be used.
+ virtual std::pair<SDValue, SDValue>
+ EmitTargetCodeForMemcmp(SelectionDAG &DAG, SDLoc dl,
+ SDValue Chain,
+ SDValue Op1, SDValue Op2,
+ SDValue Op3, MachinePointerInfo Op1PtrInfo,
+ MachinePointerInfo Op2PtrInfo) const {
+ return std::make_pair(SDValue(), SDValue());
+ }
};
} // end llvm namespace
#include "llvm/Target/TargetLibraryInfo.h"
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetSelectionDAGInfo.h"
#include <algorithm>
using namespace llvm;
return false;
const ConstantInt *Size = dyn_cast<ConstantInt>(I.getArgOperand(2));
+ if (Size && Size->getZExtValue() == 0) {
+ EVT CallVT = TM.getTargetLowering()->getValueType(I.getType(), true);
+ setValue(&I, DAG.getConstant(0, CallVT));
+ return true;
+ }
+
+ const Value *Arg0 = I.getArgOperand(0);
+ const Value *Arg1 = I.getArgOperand(1);
+ const Value *Arg2 = I.getArgOperand(2);
+ const TargetSelectionDAGInfo &TSI = DAG.getSelectionDAGInfo();
+ std::pair<SDValue, SDValue> Res =
+ TSI.EmitTargetCodeForMemcmp(DAG, getCurSDLoc(), DAG.getRoot(),
+ getValue(Arg0), getValue(Arg1), getValue(Arg2),
+ MachinePointerInfo(Arg0),
+ MachinePointerInfo(Arg1));
+ if (Res.first.getNode()) {
+ setValue(&I, Res.first);
+ DAG.setRoot(Res.second);
+ return true;
+ }
// memcmp(S1,S2,2) != 0 -> (*(short*)LHS != *(short*)RHS) != 0
// memcmp(S1,S2,4) != 0 -> (*(int*)LHS != *(int*)RHS) != 0
--
We don't optimize block memory operations, except using single MVCs
-for memcpy.
+for memcpy and single CLCs for memcmp.
-It's definitely worth using things like CLC, NC, XC and OC with
+It's definitely worth using things like NC, XC and OC with
constant lengths. MVCIN may be worthwhile too.
-We should probably implement things like memcpy using MVC with EXECUTE.
+We should probably implement general memcpy using MVC with EXECUTE.
Likewise memcmp and CLC. MVCLE and CLCLE could be useful too.
--
OPCODE(UDIVREM64);
OPCODE(MVC);
OPCODE(CLC);
+ OPCODE(IPM);
OPCODE(ATOMIC_SWAPW);
OPCODE(ATOMIC_LOADW_ADD);
OPCODE(ATOMIC_LOADW_SUB);
}
MachineBasicBlock *
-SystemZTargetLowering::emitMVCWrapper(MachineInstr *MI,
- MachineBasicBlock *MBB) const {
+SystemZTargetLowering::emitMemMemWrapper(MachineInstr *MI,
+ MachineBasicBlock *MBB,
+ unsigned Opcode) const {
const SystemZInstrInfo *TII = TM.getInstrInfo();
DebugLoc DL = MI->getDebugLoc();
uint64_t SrcDisp = MI->getOperand(3).getImm();
uint64_t Length = MI->getOperand(4).getImm();
- BuildMI(*MBB, MI, DL, TII->get(SystemZ::MVC))
+ BuildMI(*MBB, MI, DL, TII->get(Opcode))
.addOperand(DestBase).addImm(DestDisp).addImm(Length)
.addOperand(SrcBase).addImm(SrcDisp);
case SystemZ::ATOMIC_CMP_SWAPW:
return emitAtomicCmpSwapW(MI, MBB);
case SystemZ::MVCWrapper:
- return emitMVCWrapper(MI, MBB);
+ return emitMemMemWrapper(MI, MBB, SystemZ::MVC);
+ case SystemZ::CLCWrapper:
+ return emitMemMemWrapper(MI, MBB, SystemZ::CLC);
default:
llvm_unreachable("Unexpected instr type to insert");
}
// as for MVC.
CLC,
+ // Store the CC value in bits 29 and 28 of an integer.
+ IPM,
+
// Wrappers around the inner loop of an 8- or 16-bit ATOMIC_SWAP or
// ATOMIC_LOAD_<op>.
//
unsigned BitSize) const;
MachineBasicBlock *emitAtomicCmpSwapW(MachineInstr *MI,
MachineBasicBlock *BB) const;
- MachineBasicBlock *emitMVCWrapper(MachineInstr *MI,
- MachineBasicBlock *BB) const;
+ MachineBasicBlock *emitMemMemWrapper(MachineInstr *MI,
+ MachineBasicBlock *BB,
+ unsigned Opcode) const;
};
} // end namespace llvm
return Count;
}
+bool SystemZInstrInfo::analyzeCompare(const MachineInstr *MI,
+ unsigned &SrcReg, unsigned &SrcReg2,
+ int &Mask, int &Value) const {
+ assert(MI->isCompare() && "Caller should have checked for a comparison");
+
+ if (MI->getNumExplicitOperands() == 2 &&
+ MI->getOperand(0).isReg() &&
+ MI->getOperand(1).isImm()) {
+ SrcReg = MI->getOperand(0).getReg();
+ SrcReg2 = 0;
+ Value = MI->getOperand(1).getImm();
+ Mask = ~0;
+ return true;
+ }
+
+ return false;
+}
+
+// If Reg is a virtual register that is used by only a single non-debug
+// instruction, return the defining instruction, otherwise return null.
+static MachineInstr *getDefSingleUse(const MachineRegisterInfo *MRI,
+ unsigned Reg) {
+ if (TargetRegisterInfo::isPhysicalRegister(Reg))
+ return 0;
+
+ MachineRegisterInfo::use_nodbg_iterator I = MRI->use_nodbg_begin(Reg);
+ MachineRegisterInfo::use_nodbg_iterator E = MRI->use_nodbg_end();
+ if (I == E || llvm::next(I) != E)
+ return 0;
+
+ return MRI->getUniqueVRegDef(Reg);
+}
+
+// Return true if MI is a shift of type Opcode by Imm bits.
+static bool isShift(MachineInstr *MI, int Opcode, int64_t Imm) {
+ return (MI->getOpcode() == Opcode &&
+ !MI->getOperand(2).getReg() &&
+ MI->getOperand(3).getImm() == Imm);
+}
+
+// Compare compares SrcReg against zero. Check whether SrcReg contains
+// the result of an IPM sequence that is only used by Compare. Try to
+// delete both of them if so and return true if a change was made.
+static bool removeIPM(MachineInstr *Compare, unsigned SrcReg,
+ const MachineRegisterInfo *MRI,
+ const TargetRegisterInfo *TRI) {
+ MachineInstr *SRA = getDefSingleUse(MRI, SrcReg);
+ if (!SRA || !isShift(SRA, SystemZ::SRA, 30))
+ return false;
+
+ MachineInstr *SLL = getDefSingleUse(MRI, SRA->getOperand(1).getReg());
+ if (!SLL || !isShift(SLL, SystemZ::SLL, 2))
+ return false;
+
+ MachineInstr *IPM = getDefSingleUse(MRI, SLL->getOperand(1).getReg());
+ if (!IPM || IPM->getOpcode() != SystemZ::IPM)
+ return false;
+
+ // Check that there are no assignments to CC between the IPM and Compare,
+ // except for the SRA that we'd like to delete. We can ignore SLL because
+ // it does not assign to CC. We can also ignore uses of the SRA CC result,
+ // since it is effectively restoring CC to the value it had before IPM
+ // (for all current use cases).
+ if (IPM->getParent() != Compare->getParent())
+ return false;
+ MachineBasicBlock::iterator MBBI = IPM, MBBE = Compare;
+ for (++MBBI; MBBI != MBBE; ++MBBI) {
+ MachineInstr *MI = MBBI;
+ if (MI != SRA && MI->modifiesRegister(SystemZ::CC, TRI))
+ return false;
+ }
+
+ IPM->eraseFromParent();
+ SLL->eraseFromParent();
+ SRA->eraseFromParent();
+ Compare->eraseFromParent();
+ return true;
+}
+
+bool
+SystemZInstrInfo::optimizeCompareInstr(MachineInstr *Compare,
+ unsigned SrcReg, unsigned SrcReg2,
+ int Mask, int Value,
+ const MachineRegisterInfo *MRI) const {
+ assert(!SrcReg2 && "Only optimizing constant comparisons so far");
+ bool IsLogical = (Compare->getDesc().TSFlags & SystemZII::IsLogical) != 0;
+ if (Value == 0 &&
+ !IsLogical &&
+ removeIPM(Compare, SrcReg, MRI, TM.getRegisterInfo()))
+ return true;
+ return false;
+}
+
// If Opcode is a move that has a conditional variant, return that variant,
// otherwise return 0.
static unsigned getConditionalMove(unsigned Opcode) {
MachineBasicBlock *FBB,
const SmallVectorImpl<MachineOperand> &Cond,
DebugLoc DL) const LLVM_OVERRIDE;
+ bool analyzeCompare(const MachineInstr *MI, unsigned &SrcReg,
+ unsigned &SrcReg2, int &Mask, int &Value) const
+ LLVM_OVERRIDE;
+ bool optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg,
+ unsigned SrcReg2, int Mask, int Value,
+ const MachineRegisterInfo *MRI) const LLVM_OVERRIDE;
virtual bool isPredicable(MachineInstr *MI) const LLVM_OVERRIDE;
virtual bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCycles,
unsigned ExtraPredCycles,
// Extract CC into bits 29 and 28 of a register.
let Uses = [CC] in
- def IPM : InherentRRE<"ipm", 0xB222, GR32, (null_frag)>;
+ def IPM : InherentRRE<"ipm", 0xB222, GR32, (z_ipm)>;
// Read a 32-bit access register into a GR32. As with all GR32 operations,
// the upper 32 bits of the enclosing GR64 remain unchanged, which is useful
[SDTCisPtrTy<0>,
SDTCisPtrTy<1>,
SDTCisVT<2, i32>]>;
+def SDT_ZI32Intrinsic : SDTypeProfile<1, 0, [SDTCisVT<0, i32>]>;
//===----------------------------------------------------------------------===//
// Node definitions
def z_mvc : SDNode<"SystemZISD::MVC", SDT_ZMemMemLength,
[SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
def z_clc : SDNode<"SystemZISD::CLC", SDT_ZMemMemLength,
- [SDNPHasChain, SDNPMayLoad]>;
+ [SDNPHasChain, SDNPOutGlue, SDNPMayLoad]>;
+def z_ipm : SDNode<"SystemZISD::IPM", SDT_ZI32Intrinsic,
+ [SDNPInGlue]>;
//===----------------------------------------------------------------------===//
// Pattern fragments
}
return SDValue();
}
+
+std::pair<SDValue, SDValue> SystemZSelectionDAGInfo::
+EmitTargetCodeForMemcmp(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
+ SDValue Src1, SDValue Src2, SDValue Size,
+ MachinePointerInfo Op1PtrInfo,
+ MachinePointerInfo Op2PtrInfo) const {
+ if (ConstantSDNode *CSize = dyn_cast<ConstantSDNode>(Size)) {
+ uint64_t Bytes = CSize->getZExtValue();
+ if (Bytes >= 1 && Bytes <= 0x100) {
+ // A single CLC.
+ SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
+ Chain = DAG.getNode(SystemZISD::CLC, DL, VTs, Chain,
+ Src1, Src2, Size);
+ SDValue Glue = Chain.getValue(1);
+ // IPM inserts the CC value into bits 29 and 28, with 0 meaning "equal",
+ // 1 meaning "greater" and 2 meaning "less". Convert them into an
+ // integer that is respectively equal, greater or less than 0.
+ SDValue IPM = DAG.getNode(SystemZISD::IPM, DL, MVT::i32, Glue);
+ SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, IPM,
+ DAG.getConstant(2, MVT::i32));
+ SDValue SRA = DAG.getNode(ISD::SRA, DL, MVT::i32, SHL,
+ DAG.getConstant(30, MVT::i32));
+ return std::make_pair(SRA, Chain);
+ }
+ }
+ return std::make_pair(SDValue(), SDValue());
+}
EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc DL,
SDValue Chain, SDValue Dst, SDValue Byte,
SDValue Size, unsigned Align, bool IsVolatile,
- MachinePointerInfo DstPtrInfo) const;
+ MachinePointerInfo DstPtrInfo) const LLVM_OVERRIDE;
+
+ virtual std::pair<SDValue, SDValue>
+ EmitTargetCodeForMemcmp(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
+ SDValue Src1, SDValue Src2, SDValue Size,
+ MachinePointerInfo Op1PtrInfo,
+ MachinePointerInfo Op2PtrInfo) const LLVM_OVERRIDE;
};
}
--- /dev/null
+; Test memcmp using CLC.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+
+declare signext i32 @memcmp(i8 *%src1, i8 *%src2, i64 %size)
+
+; Zero-length comparisons should be optimized away.
+define i32 @f1(i8 *%src1, i8 *%src2) {
+; CHECK-LABEL: f1:
+; CHECK: lhi %r2, 0
+; CHECK: br %r14
+ %res = call i32 @memcmp(i8 *%src1, i8 *%src2, i64 0)
+ ret i32 %res
+}
+
+; Check a case where the result is used as an integer.
+define i32 @f2(i8 *%src1, i8 *%src2) {
+; CHECK-LABEL: f2:
+; CHECK: clc 0(2,%r2), 0(%r3)
+; CHECK: ipm %r2
+; CHECK: sll %r2, 2
+; CHECK: sra %r2, 30
+; CHECK: br %r14
+ %res = call i32 @memcmp(i8 *%src1, i8 *%src2, i64 2)
+ ret i32 %res
+}
+
+; Check a case where the result is tested for equality.
+define void @f3(i8 *%src1, i8 *%src2, i32 *%dest) {
+; CHECK-LABEL: f3:
+; CHECK: clc 0(3,%r2), 0(%r3)
+; CHECK-NEXT: je {{\..*}}
+; CHECK: br %r14
+ %res = call i32 @memcmp(i8 *%src1, i8 *%src2, i64 3)
+ %cmp = icmp eq i32 %res, 0
+ br i1 %cmp, label %exit, label %store
+
+store:
+ store i32 0, i32 *%dest
+ br label %exit
+
+exit:
+ ret void
+}
+
+; Check a case where the result is tested for inequality.
+define void @f4(i8 *%src1, i8 *%src2, i32 *%dest) {
+; CHECK-LABEL: f4:
+; CHECK: clc 0(4,%r2), 0(%r3)
+; CHECK-NEXT: jlh {{\..*}}
+; CHECK: br %r14
+entry:
+ %res = call i32 @memcmp(i8 *%src1, i8 *%src2, i64 4)
+ %cmp = icmp ne i32 %res, 0
+ br i1 %cmp, label %exit, label %store
+
+store:
+ store i32 0, i32 *%dest
+ br label %exit
+
+exit:
+ ret void
+}
+
+; Check a case where the result is tested via slt.
+define void @f5(i8 *%src1, i8 *%src2, i32 *%dest) {
+; CHECK-LABEL: f5:
+; CHECK: clc 0(5,%r2), 0(%r3)
+; CHECK-NEXT: jl {{\..*}}
+; CHECK: br %r14
+entry:
+ %res = call i32 @memcmp(i8 *%src1, i8 *%src2, i64 5)
+ %cmp = icmp slt i32 %res, 0
+ br i1 %cmp, label %exit, label %store
+
+store:
+ store i32 0, i32 *%dest
+ br label %exit
+
+exit:
+ ret void
+}
+
+; Check a case where the result is tested for sgt.
+define void @f6(i8 *%src1, i8 *%src2, i32 *%dest) {
+; CHECK-LABEL: f6:
+; CHECK: clc 0(6,%r2), 0(%r3)
+; CHECK-NEXT: jh {{\..*}}
+; CHECK: br %r14
+entry:
+ %res = call i32 @memcmp(i8 *%src1, i8 *%src2, i64 6)
+ %cmp = icmp sgt i32 %res, 0
+ br i1 %cmp, label %exit, label %store
+
+store:
+ store i32 0, i32 *%dest
+ br label %exit
+
+exit:
+ ret void
+}
+
+; Check the upper end of the CLC range. Here the result is used both as
+; an integer and for branching, but it's better to branch on the result
+; of the SRA.
+define i32 @f7(i8 *%src1, i8 *%src2, i32 *%dest) {
+; CHECK-LABEL: f7:
+; CHECK: clc 0(256,%r2), 0(%r3)
+; CHECK: ipm %r2
+; CHECK: sll %r2, 2
+; CHECK: sra %r2, 30
+; CHECK: jl {{.L*}}
+; CHECK: br %r14
+entry:
+ %res = call i32 @memcmp(i8 *%src1, i8 *%src2, i64 256)
+ %cmp = icmp slt i32 %res, 0
+ br i1 %cmp, label %exit, label %store
+
+store:
+ store i32 0, i32 *%dest
+ br label %exit
+
+exit:
+ ret i32 %res
+}
+
+; 257 bytes is too big for a single CLC. For now expect a call instead.
+define i32 @f8(i8 *%src1, i8 *%src2) {
+; CHECK-LABEL: f8:
+; CHECK: brasl %r14, memcmp@PLT
+; CHECK: br %r14
+ %res = call i32 @memcmp(i8 *%src1, i8 *%src2, i64 257)
+ ret i32 %res
+}