The IR/MIR pseudo probe intrinsics don't get materialized into real machine instructions and therefore they don't incur runtime cost directly. However, they come with indirect cost by blocking certain optimizations. Some of the blocking is intentional (such as blocking code merge) for better counts quality, while the rest is accidental. This change unblocks perf-critical optimizations that do not affect counts quality. They include:
1. IR InstCombine, sinking load operation to shorten lifetimes.
2. MIR LiveRangeShrink, similar to #1
3. MIR TwoAddressInstructionPass, i.e., the opeq transform
4. MIR function argument copy elision
5. IR stack protection (not perf-critical, but nice to have).
Reviewed By: wmi
Differential Revision: https://reviews.llvm.org/D95982
return getOpcode() == TargetOpcode::CFI_INSTRUCTION;
}
+ /// Returns true if this is a pseudo probe (TargetOpcode::PSEUDO_PROBE).
+ /// Pseudo probes are never materialized into real machine instructions;
+ /// they only carry profiling information.
+ bool isPseudoProbe() const {
+ return getOpcode() == TargetOpcode::PSEUDO_PROBE;
+ }
+
// True if the instruction represents a position in the function.
bool isPosition() const { return isLabel() || isCFIInstruction(); }
bool isDebugInstr() const {
return isDebugValue() || isDebugLabel() || isDebugRef();
}
+ /// Returns true if this is a debug instruction or a pseudo probe, i.e. an
+ /// instruction that does not lower to real machine code and so should not
+ /// be counted by passes that limit or reorder real instructions.
+ bool isDebugOrPseudoInstr() const {
+ return isDebugInstr() || isPseudoProbe();
+ }
bool isDebugOffsetImm() const { return getDebugOffset().isImm(); }
/// llvm.lifetime.end marker.
bool isLifetimeStartOrEnd() const;
+ /// Return true if the instruction is a DbgInfoIntrinsic or PseudoProbeInst.
+ bool isDebugOrPseudoInst() const;
+
/// Return a pointer to the next non-debug instruction in the same basic
/// block as 'this', or nullptr if no such instruction exists. Skip any pseudo
/// operations if \c SkipPseudoOp is true.
// If MI has side effects, it should become a barrier for code motion.
// IOM is rebuild from the next instruction to prevent later
// instructions from being moved before this MI.
- if (MI.hasUnmodeledSideEffects() && Next != MBB.end()) {
+ if (MI.hasUnmodeledSideEffects() && !MI.isPseudoProbe() &&
+ Next != MBB.end()) {
BuildInstOrderMap(Next, IOM);
SawStore = false;
}
}
bool MachineInstr::isLoadFoldBarrier() const {
- return mayStore() || isCall() || hasUnmodeledSideEffects();
+ // A pseudo probe carries the unmodeled-side-effect flag only to keep it
+ // from being deleted by optimizations; since it never becomes a real
+ // instruction, it must not act as a barrier to load folding.
+ return mayStore() || isCall() ||
+ (hasUnmodeledSideEffects() && !isPseudoProbe());
}
/// allDefsAreDead - Return true if all the defs of this instruction are dead.
// We will look through cast uses, so ignore them completely.
if (I.isCast())
continue;
- // Ignore debug info intrinsics, they don't escape or store to allocas.
- if (isa<DbgInfoIntrinsic>(I))
+ // Ignore debug info and pseudo op intrinsics, they don't escape or store
+ // to allocas.
+ if (I.isDebugOrPseudoInst())
continue;
// This is an unknown instruction. Assume it escapes or writes to all
// static alloca operands.
// Ignore intrinsics that do not become real instructions.
// TODO: Narrow this to intrinsics that have store-like effects.
const auto *CI = cast<CallInst>(I);
- if (!isa<DbgInfoIntrinsic>(CI) && !CI->isLifetimeStartOrEnd())
+ if (!CI->isDebugOrPseudoInst() && !CI->isLifetimeStartOrEnd())
return true;
break;
}
MachineBasicBlock::iterator KillPos = KillMI;
++KillPos;
for (MachineInstr &OtherMI : make_range(End, KillPos)) {
- // Debug instructions cannot be counted against the limit.
- if (OtherMI.isDebugInstr())
+ // Debug or pseudo instructions cannot be counted against the limit.
+ if (OtherMI.isDebugOrPseudoInstr())
continue;
if (NumVisited > 10) // FIXME: Arbitrary limit to reduce compile time cost.
return false;
unsigned NumVisited = 0;
for (MachineInstr &OtherMI :
make_range(mi, MachineBasicBlock::iterator(KillMI))) {
- // Debug instructions cannot be counted against the limit.
- if (OtherMI.isDebugInstr())
+ // Debug or pseudo instructions cannot be counted against the limit.
+ if (OtherMI.isDebugOrPseudoInstr())
continue;
if (NumVisited > 10) // FIXME: Arbitrary limit to reduce compile time cost.
return false;
return ID == Intrinsic::lifetime_start || ID == Intrinsic::lifetime_end;
}
+// Returns true if the instruction is a DbgInfoIntrinsic or a PseudoProbeInst,
+// i.e. an intrinsic call that does not translate to a real instruction.
+bool Instruction::isDebugOrPseudoInst() const {
+ return isa<DbgInfoIntrinsic>(this) || isa<PseudoProbeInst>(this);
+}
+
const Instruction *
Instruction::getNextNonDebugInstruction(bool SkipPseudoOp) const {
for (const Instruction *I = getNextNode(); I; I = I->getNextNode())
if (isNoModRef(MRI))
continue;
+ // A pseudo probe call shouldn't change any function attribute since it
+ // doesn't translate to a real instruction. It comes with a memory access
+ // tag to prevent itself being removed by optimizations and not block
+ // other instructions being optimized.
+ if (isa<PseudoProbeInst>(I))
+ continue;
+
if (!AliasAnalysis::onlyAccessesArgPointees(MRB)) {
// The call could access any memory. If that includes writes, note it.
if (isModSet(MRI))
BasicBlock::iterator BBI = L->getIterator(), E = L->getParent()->end();
for (++BBI; BBI != E; ++BBI)
- if (BBI->mayWriteToMemory())
+ if (BBI->mayWriteToMemory()) {
+ // Calls that only access inaccessible memory do not block sinking the
+ // load.
+ if (auto *CB = dyn_cast<CallBase>(BBI))
+ if (CB->onlyAccessesInaccessibleMemory())
+ continue;
return false;
+ }
// Check for non-address taken alloca. If not address-taken already, it isn't
// profitable to do this xform.
}
}
- // Skip processing debug intrinsics in InstCombine. Processing these call instructions
- // consumes non-trivial amount of time and provides no value for the optimization.
- if (!isa<DbgInfoIntrinsic>(Inst)) {
+ // Skip processing debug and pseudo intrinsics in InstCombine. Processing
+ // these call instructions consumes non-trivial amount of time and
+ // provides no value for the optimization.
+ if (!Inst->isDebugOrPseudoInst()) {
InstrsForInstCombineWorklist.push_back(Inst);
SeenAliasScopes.analyse(Inst);
}
--- /dev/null
+; RUN: opt -passes=instcombine -S < %s | FileCheck %s
+
+%struct.nonbonded = type { [2 x %struct.CompAtom*], [2 x %struct.CompAtomExt*], [2 x %struct.CompAtom*], [2 x %class.Vector*], [2 x %class.Vector*], [2 x i32], %class.Vector, double*, double*, %class.ComputeNonbondedWorkArrays*, %class.Pairlists*, i32, i32, double, double, i32, i32, i32, i32 }
+%struct.CompAtomExt = type { i32 }
+%struct.CompAtom = type { %class.Vector, float, i16, i8, i8 }
+%class.Vector = type { double, double, double }
+%class.ComputeNonbondedWorkArrays = type { %class.ResizeArray, %class.ResizeArray.0, %class.ResizeArray, %class.ResizeArray, %class.ResizeArray, %class.ResizeArray, %class.ResizeArray, %class.ResizeArray, %class.ResizeArray, %class.ResizeArray, %class.ResizeArray, %class.ResizeArray, %class.ResizeArray, %class.ResizeArray, %class.ResizeArray, %class.ResizeArray, %class.ResizeArray, %class.ResizeArray.2, %class.ResizeArray.2 }
+%class.ResizeArray.0 = type { i32 (...)**, %class.ResizeArrayRaw.1* }
+%class.ResizeArrayRaw.1 = type <{ double*, i8*, i32, i32, i32, float, i32, [4 x i8] }>
+%class.ResizeArray = type { i32 (...)**, %class.ResizeArrayRaw* }
+%class.ResizeArrayRaw = type <{ i16*, i8*, i32, i32, i32, float, i32, [4 x i8] }>
+%class.ResizeArray.2 = type { i32 (...)**, %class.ResizeArrayRaw.3* }
+%class.ResizeArrayRaw.3 = type <{ %class.Vector*, i8*, i32, i32, i32, float, i32, [4 x i8] }>
+%class.Pairlists = type { i16*, i32, i32 }
+
+;; Check the minPart4 and minPart assignments are merged.
+; CHECK-COUNT-1: getelementptr inbounds %struct.nonbonded, %struct.nonbonded* %params, i64 0, i32 16
+; CHECK-NOT: getelementptr inbounds %struct.nonbonded, %struct.nonbonded* %params, i64 0, i32 16
+
+;; Two blocks (%if.end109.thread and %if.end109) each compute the same
+;; GEP (field 16 of %params) and load, with a pseudo probe in between.
+;; InstCombine should still be able to sink/merge the two loads into the
+;; common successor despite the probes (checked by CHECK-COUNT-1/CHECK-NOT
+;; above).
+define dso_local void @_ZN20ComputeNonbondedUtil9calc_pairEP9nonbonded(%struct.nonbonded* nocapture readonly %params) local_unnamed_addr align 2 {
+entry:
+ %savePairlists3 = getelementptr inbounds %struct.nonbonded, %struct.nonbonded* %params, i64 0, i32 11
+ %0 = load i32, i32* %savePairlists3, align 8
+ %usePairlists4 = getelementptr inbounds %struct.nonbonded, %struct.nonbonded* %params, i64 0, i32 12
+ %1 = load i32, i32* %usePairlists4, align 4
+ %tobool54.not = icmp eq i32 %0, 0
+ br i1 %tobool54.not, label %lor.lhs.false55, label %if.end109
+
+lor.lhs.false55: ; preds = %entry
+ %tobool56.not = icmp eq i32 %1, 0
+ br i1 %tobool56.not, label %if.end109, label %if.end109.thread
+
+if.end109.thread: ; preds = %lor.lhs.false55
+ %minPart4 = getelementptr inbounds %struct.nonbonded, %struct.nonbonded* %params, i64 0, i32 16
+ %2 = load i32, i32* %minPart4, align 4
+ call void @llvm.pseudoprobe(i64 -6172701105289426098, i64 2, i32 0, i64 -1)
+ br label %if.then138
+
+if.end109: ; preds = %lor.lhs.false55, %entry
+ %minPart = getelementptr inbounds %struct.nonbonded, %struct.nonbonded* %params, i64 0, i32 16
+ %3 = load i32, i32* %minPart, align 4
+ call void @llvm.pseudoprobe(i64 -6172701105289426098, i64 3, i32 0, i64 -1)
+ %tobool116.not = icmp eq i32 %1, 0
+ br i1 %tobool116.not, label %if.then117, label %if.then138
+
+if.then117: ; preds = %if.end109
+ ret void
+
+if.then138: ; preds = %if.end109.thread, %if.end109
+ %4 = phi i32 [ %2, %if.end109.thread ], [ %3, %if.end109 ]
+ %tobool139.not = icmp eq i32 %4, 0
+ br i1 %tobool139.not, label %if.else147, label %if.then140
+
+if.then140: ; preds = %if.then138
+ ret void
+
+if.else147: ; preds = %if.then138
+ ret void
+}
+
+declare dso_local void @_ZN9Pairlists8addIndexEv() align 2
+
+; Function Attrs: inaccessiblememonly nounwind willreturn
+declare void @llvm.pseudoprobe(i64, i64, i32, i64) #0
+
+attributes #0 = { inaccessiblememonly nounwind willreturn }
--- /dev/null
+; PR1075
+; RUN: llc < %s -mcpu=generic -mtriple=x86_64-apple-darwin -pseudo-probe-for-profiling -O3 | FileCheck %s
+
+;; Four fmuls feed a chain of fadds with a pseudo probe in between. The
+;; CHECK lines expect mulss/addss to interleave, i.e. the probe must not
+;; act as a scheduling barrier for the x86 backend.
+define float @foo(float %x) #0 {
+ %tmp1 = fmul float %x, 3.000000e+00
+ %tmp3 = fmul float %x, 5.000000e+00
+ %tmp5 = fmul float %x, 7.000000e+00
+ %tmp7 = fmul float %x, 1.100000e+01
+ call void @llvm.pseudoprobe(i64 6699318081062747564, i64 1, i32 0, i64 -1)
+ %tmp10 = fadd float %tmp1, %tmp3
+ %tmp12 = fadd float %tmp10, %tmp5
+ %tmp14 = fadd float %tmp12, %tmp7
+ ret float %tmp14
+; CHECK: mulss
+; CHECK: mulss
+; CHECK: addss
+; CHECK: mulss
+; CHECK: addss
+; CHECK: mulss
+; CHECK: addss
+; CHECK: ret
+}
+
+; Function Attrs: inaccessiblememonly nounwind willreturn
+declare void @llvm.pseudoprobe(i64, i64, i32, i64) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { inaccessiblememonly nounwind willreturn }
+
+!llvm.pseudo_probe_desc = !{!0}
+
+!0 = !{i64 6699318081062747564, i64 4294967295, !"foo", null}
+
--- /dev/null
+; RUN: llc -mtriple=x86_64-- -stop-after=peephole-opt -o - %s | FileCheck %s
+
+;; peephole-opt should fold the load into the compare even though a pseudo
+;; probe sits between the load and its use.
+define internal i32 @arc_compare() {
+entry:
+ %0 = load i64, i64* undef, align 8
+ br i1 undef, label %return, label %if.end
+
+if.end: ; preds = %entry
+; Check that the register copy has been sunk into the compare instruction
+; (the intervening PSEUDO_PROBE must not block the fold).
+; CHECK: %[[#REG:]]:gr64 = IMPLICIT_DEF
+; CHECK-NOT: %[[#]]:gr64 = MOV64rm %[[#REG]]
+; CHECK: PSEUDO_PROBE 5116412291814990879, 3, 0, 0
+; CHECK: CMP64mr %[[#REG]], 1
+ call void @llvm.pseudoprobe(i64 5116412291814990879, i64 3, i32 0, i64 -1)
+ %cmp4 = icmp slt i64 %0, undef
+ br i1 %cmp4, label %return, label %if.end6
+
+if.end6: ; preds = %if.end
+ call void @llvm.pseudoprobe(i64 5116412291814990879, i64 5, i32 0, i64 -1)
+ br label %return
+
+return: ; preds = %if.end6, %if.end, %entry
+ ret i32 undef
+}
+
+; Function Attrs: inaccessiblememonly nounwind willreturn
+declare void @llvm.pseudoprobe(i64, i64, i32, i64) #0
+
+attributes #0 = { inaccessiblememonly nounwind willreturn }
--- /dev/null
+; RUN: llc -stop-after=twoaddressinstruction -mtriple=x86_64-- -o - %s | FileCheck %s
+
+
+;; The add in the unrolled loop body should still be turned into its opeq
+;; (two-address) form by TwoAddressInstructionPass even though a pseudo
+;; probe precedes it.
+define dso_local double @twoaddressinstruction() local_unnamed_addr {
+for.end:
+ %0 = load i64, i64* undef, align 8
+ br label %for.body14.preheader
+
+for.body14.preheader: ; preds = %for.end
+ br i1 undef, label %for.cond25.preheader.loopexit.unr-lcssa, label %for.body14.preheader.new
+
+for.body14.preheader.new: ; preds = %for.body14.preheader
+ %unroll_iter136 = and i64 %0, -4
+ br label %for.body14
+
+for.cond25.preheader.loopexit.unr-lcssa: ; preds = %for.body14, %for.body14.preheader
+ %indvars.iv127.unr = phi i64 [ 1, %for.body14.preheader ], [ %indvars.iv.next128.3, %for.body14 ]
+ ret double undef
+
+for.body14: ; preds = %for.body14, %for.body14.preheader.new
+ %indvars.iv127 = phi i64 [ 1, %for.body14.preheader.new ], [ %indvars.iv.next128.3, %for.body14 ]
+ %niter137 = phi i64 [ %unroll_iter136, %for.body14.preheader.new ], [ %niter137.nsub.3, %for.body14 ]
+ %indvars.iv.next128.3 = add nuw nsw i64 %indvars.iv127, 4
+; CHECK: PSEUDO_PROBE -6878943695821059507, 9, 0, 0
+ call void @llvm.pseudoprobe(i64 -6878943695821059507, i64 9, i32 0, i64 -1)
+;; Check an opeq form of instruction is created.
+; CHECK: %[[#REG:]]:gr64_nosp = COPY killed %[[#]]
+; CHECK: %[[#REG]]:gr64_nosp = nuw ADD64ri8 %[[#REG]], 4, implicit-def dead $eflags
+ %niter137.nsub.3 = add i64 %niter137, -4
+ %niter137.ncmp.3 = icmp eq i64 %niter137.nsub.3, 0
+ br i1 %niter137.ncmp.3, label %for.cond25.preheader.loopexit.unr-lcssa, label %for.body14
+}
+
+; Function Attrs: inaccessiblememonly nounwind willreturn
+declare void @llvm.pseudoprobe(i64, i64, i32, i64) #0
+
+attributes #0 = { inaccessiblememonly nounwind willreturn }
\ No newline at end of file