Per the documentation in Support/InstructionCost.h, the purpose of an invalid cost is to let clients change their behavior on inputs that are impossible to cost. CodeMetrics was instead asserting that invalid costs never occurred.
On a target with an incomplete cost model - e.g. RISCV - this meant that transformations would crash on constructs the cost model (falsely) reported as impossible to cost - e.g. scalable vectors. While we certainly should improve the cost model - and I plan to do so in the near future - we also shouldn't be crashing, as that violates the explicitly stated purpose of an invalid InstructionCost.
I updated all of the "easy" consumers, where the bailouts were locally obvious. I plan to address loop unroll in a follow-up change.
Differential Revision: https://reviews.llvm.org/D127131
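
For context, the consumer pattern this change moves towards is roughly the following - a minimal sketch against the public InstructionCost API (isValid/getValue), not code from this patch. getValue() returns an Optional, so it must only be dereferenced after isValid() has been checked:

    // Sketch: bail out conservatively instead of asserting when an
    // instruction cannot be costed. Threshold is an illustrative parameter.
    bool isProfitable(const InstructionCost &Cost, unsigned Threshold) {
      if (!Cost.isValid())
        return false; // Unknown cost: refuse the transform, don't crash.
      return *Cost.getValue() < Threshold;
    }
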
#define LLVM_ANALYSIS_CODEMETRICS_H
#include "llvm/ADT/DenseMap.h"
+#include "llvm/Support/InstructionCost.h"
namespace llvm {
class AssumptionCache;
/// True if this function calls alloca (in the C sense).
bool usesDynamicAlloca = false;
- /// Number of instructions in the analyzed blocks.
- unsigned NumInsts = false;
+ /// Code size cost of the analyzed blocks.
+ InstructionCost NumInsts = 0;
/// Number of analyzed blocks.
unsigned NumBlocks = false;
/// Keeps track of basic block code size estimates.
- DenseMap<const BasicBlock *, unsigned> NumBBInsts;
+ DenseMap<const BasicBlock *, InstructionCost> NumBBInsts;
/// Keep track of the number of calls to 'big' functions.
unsigned NumCalls = false;
const BasicBlock *BB, const TargetTransformInfo &TTI,
const SmallPtrSetImpl<const Value *> &EphValues, bool PrepareForLTO) {
++NumBlocks;
- // Use a proxy variable for NumInsts of type InstructionCost, so that it can
- // use InstructionCost's arithmetic properties such as saturation when this
- // feature is added to InstructionCost.
- // When storing the value back to NumInsts, we can assume all costs are Valid
- // because the IR should not contain any nodes that cannot be costed. If that
- // happens the cost-model is broken.
- InstructionCost NumInstsProxy = NumInsts;
InstructionCost NumInstsBeforeThisBB = NumInsts;
for (const Instruction &I : *BB) {
// Skip ephemeral values.
if (InvI->cannotDuplicate())
notDuplicatable = true;
- NumInstsProxy += TTI.getUserCost(&I, TargetTransformInfo::TCK_CodeSize);
- NumInsts = *NumInstsProxy.getValue();
+ NumInsts += TTI.getUserCost(&I, TargetTransformInfo::TCK_CodeSize);
}
if (isa<ReturnInst>(BB->getTerminator()))
  ++NumRets;
notDuplicatable |= isa<IndirectBrInst>(BB->getTerminator());
// Remember NumInsts for this BB.
- InstructionCost NumInstsThisBB = NumInstsProxy - NumInstsBeforeThisBB;
- NumBBInsts[BB] = *NumInstsThisBB.getValue();
+ InstructionCost NumInstsThisBB = NumInsts - NumInstsBeforeThisBB;
+ NumBBInsts[BB] = NumInstsThisBB;
}
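
Dropping the proxy variable and the eager getValue() dereference is safe because InstructionCost's arithmetic already propagates the invalid state: once any instruction in the block is uncostable, NumInsts itself becomes and stays invalid. A minimal sketch of that propagation, using only the public API:

    InstructionCost Sum = 0;
    Sum += InstructionCost(4);            // Valid: Sum is 4.
    Sum += InstructionCost::getInvalid(); // Sum is now invalid.
    Sum += InstructionCost(2);            // Invalid is sticky.
    assert(!Sum.isValid());
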
// shouldn't specialize it. Set the specialization cost to Invalid.
// Or if the line count implies that this function is easy to get
// inlined, in which case we shouldn't specialize it.
- if (Metrics.notDuplicatable ||
+ if (Metrics.notDuplicatable || !Metrics.NumInsts.isValid() ||
(!ForceFunctionSpecialization &&
- Metrics.NumInsts < SmallFunctionThreshold)) {
+ *Metrics.NumInsts.getValue() < SmallFunctionThreshold)) {
InstructionCost C{};
C.setInvalid();
return C;
});
return false;
}
+
+ if (!Metrics.NumInsts.isValid()) {
+ LLVM_DEBUG(dbgs() << "DFA Jump Threading: Not jump threading, contains "
+ << "instructions with invalid cost.\n");
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "InvalidCost", Switch)
+ << "Contains instructions with invalid cost.";
+ });
+ return false;
+ }
}
unsigned DuplicationCost = 0;
// using binary search, hence the LogBase2().
unsigned CondBranches =
APInt(32, Switch->getNumSuccessors()).ceilLogBase2();
- DuplicationCost = Metrics.NumInsts / CondBranches;
+ DuplicationCost = *Metrics.NumInsts.getValue() / CondBranches;
} else {
// Compared with jump tables, the DFA optimizer removes an indirect branch
// on each loop iteration, thus making branch prediction more precise. The
// predictor to make a mistake, and the more benefit there is in the DFA
// optimizer. Thus, the more branch targets there are, the lower is the
// cost of the DFA opt.
- DuplicationCost = Metrics.NumInsts / JumpTableSize;
+ DuplicationCost = *Metrics.NumInsts.getValue() / JumpTableSize;
}
LLVM_DEBUG(dbgs() << "\nDFA Jump Threading: Cost to jump thread block "
}
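
To make the cost formula above concrete - a worked example with illustrative numbers, not taken from the patch: a switch with five successors lowered as a balanced binary search needs ceil(log2(5)) = 3 conditional branches, so a block with a code-size cost of 30 gets a DuplicationCost of 30 / 3 = 10.

    // Illustrative only: 5 successors -> 3 conditional branches.
    unsigned CondBranches = APInt(32, 5).ceilLogBase2(); // == 3
    unsigned DuplicationCost = 30 / CondBranches;        // == 10
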
Metrics.analyzeBasicBlock(BB, *TTI, EphValues);
}
- unsigned LoopSize = Metrics.NumInsts;
+
+ if (!Metrics.NumInsts.isValid())
+ return MadeChange;
+
+ unsigned LoopSize = *Metrics.NumInsts.getValue();
if (!LoopSize)
LoopSize = 1;
NotDuplicatable = Metrics.notDuplicatable;
Convergent = Metrics.convergent;
- unsigned LoopSize = Metrics.NumInsts;
+ // FIXME: This will crash for invalid InstructionCost, we should update the
+ // callers to gracefully bailout in this case.
+ unsigned LoopSize = *Metrics.NumInsts.getValue();
// Don't allow an estimate of size zero. This would allows unrolling of loops
// with huge iteration counts, which is a compile time problem even if it's
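
As the FIXME above says, loop unroll still dereferences the cost unconditionally; per the commit message it is deferred to a later change. One possible shape for that bailout - purely a hypothetical sketch, names illustrative and not part of this patch - is to surface the unknown size to the callers:

    // Hypothetical: return None when no size estimate exists, so callers
    // can skip unrolling instead of crashing on an invalid cost.
    Optional<unsigned> approximateLoopSize(const CodeMetrics &Metrics) {
      if (!Metrics.NumInsts.isValid())
        return None;
      return *Metrics.NumInsts.getValue();
    }
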
L->dump());
return Rotated;
}
- if (Metrics.NumInsts > MaxHeaderSize) {
+ if (!Metrics.NumInsts.isValid()) {
+ LLVM_DEBUG(dbgs() << "LoopRotation: NOT rotating - contains instructions"
+ " with invalid cost: ";
+ L->dump());
+ return Rotated;
+ }
+ if (*Metrics.NumInsts.getValue() > MaxHeaderSize) {
LLVM_DEBUG(dbgs() << "LoopRotation: NOT rotating - contains "
<< Metrics.NumInsts
<< " instructions, which is more than the threshold ("
ret void
}
+; This demonstrates a case where a) loop rotate needs a cost estimate to
+; know if rotation is profitable, and b) there is no cost estimate available
+; due to invalid costs in the loop. We can't rotate this loop.
+define void @invalid_dup_required(<vscale x 1 x i8>* %p) nounwind ssp {
+; CHECK-LABEL: @invalid_dup_required(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[FOR_COND:%.*]]
+; CHECK: for.cond:
+; CHECK-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ]
+; CHECK-NEXT: [[A:%.*]] = load <vscale x 1 x i8>, <vscale x 1 x i8>* [[P:%.*]], align 1
+; CHECK-NEXT: [[B:%.*]] = add <vscale x 1 x i8> [[A]], [[A]]
+; CHECK-NEXT: store <vscale x 1 x i8> [[B]], <vscale x 1 x i8>* [[P]], align 1
+; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100
+; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: call void @f()
+; CHECK-NEXT: [[INC]] = add nsw i32 [[I_0]], 1
+; CHECK-NEXT: br label [[FOR_COND]]
+; CHECK: for.end:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %for.cond
+
+for.cond: ; preds = %for.body, %entry
+ %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+ %a = load <vscale x 1 x i8>, <vscale x 1 x i8>* %p
+ %b = add <vscale x 1 x i8> %a, %a
+ store <vscale x 1 x i8> %b, <vscale x 1 x i8>* %p
+ %cmp = icmp slt i32 %i.0, 100
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body: ; preds = %for.cond
+ call void @f()
+ %inc = add nsw i32 %i.0, 1
+ br label %for.cond
+
+for.end: ; preds = %for.cond
+ ret void
+}
+
+declare void @f()