At the IR level, we generally assume that constants are free to materialize. However, on RISCV, due to some quirks of the ISA, materializing arbitrary constants can be rather expensive. We frequently fall back to constant pool loads.
We've been slowly moving in the direction of modeling the cost of the remat as part of the instruction cost. This has the effect of disincentivizing vectorization - mostly SLP - when we'd have to materialize an expensive constant.
We need better modeling of which constants are expensive and which are not, but for the moment let's be consistent with how we model arithmetic and memory instructions. The difference between the two is that arithmetic can sometimes fold a splat operation, which stores cannot.
Differential Revision: https://reviews.llvm.org/D138941
return !XC;
}
-bool RISCVTargetLowering::canSplatOperand(Instruction *I, int Operand) const {
- if (!I->getType()->isVectorTy() || !Subtarget.hasVInstructions())
- return false;
-
- switch (I->getOpcode()) {
+bool RISCVTargetLowering::canSplatOperand(unsigned Opcode, int Operand) const {
+ switch (Opcode) {
case Instruction::Add:
case Instruction::Sub:
case Instruction::Mul:
case Instruction::URem:
case Instruction::SRem:
return Operand == 1;
- case Instruction::Call:
- if (auto *II = dyn_cast<IntrinsicInst>(I)) {
- switch (II->getIntrinsicID()) {
- case Intrinsic::fma:
- case Intrinsic::vp_fma:
- return Operand == 0 || Operand == 1;
- case Intrinsic::vp_shl:
- case Intrinsic::vp_lshr:
- case Intrinsic::vp_ashr:
- case Intrinsic::vp_udiv:
- case Intrinsic::vp_sdiv:
- case Intrinsic::vp_urem:
- case Intrinsic::vp_srem:
- return Operand == 1;
- // These intrinsics are commutative.
- case Intrinsic::vp_add:
- case Intrinsic::vp_mul:
- case Intrinsic::vp_and:
- case Intrinsic::vp_or:
- case Intrinsic::vp_xor:
- case Intrinsic::vp_fadd:
- case Intrinsic::vp_fmul:
- // These intrinsics have 'vr' versions.
- case Intrinsic::vp_sub:
- case Intrinsic::vp_fsub:
- case Intrinsic::vp_fdiv:
- return Operand == 0 || Operand == 1;
- default:
- return false;
- }
- }
+ default:
return false;
+ }
+}
+
+
+bool RISCVTargetLowering::canSplatOperand(Instruction *I, int Operand) const {
+ if (!I->getType()->isVectorTy() || !Subtarget.hasVInstructions())
+ return false;
+
+ if (canSplatOperand(I->getOpcode(), Operand))
+ return true;
+
+ auto *II = dyn_cast<IntrinsicInst>(I);
+ if (!II)
+ return false;
+
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::fma:
+ case Intrinsic::vp_fma:
+ return Operand == 0 || Operand == 1;
+ case Intrinsic::vp_shl:
+ case Intrinsic::vp_lshr:
+ case Intrinsic::vp_ashr:
+ case Intrinsic::vp_udiv:
+ case Intrinsic::vp_sdiv:
+ case Intrinsic::vp_urem:
+ case Intrinsic::vp_srem:
+ return Operand == 1;
+ // These intrinsics are commutative.
+ case Intrinsic::vp_add:
+ case Intrinsic::vp_mul:
+ case Intrinsic::vp_and:
+ case Intrinsic::vp_or:
+ case Intrinsic::vp_xor:
+ case Intrinsic::vp_fadd:
+ case Intrinsic::vp_fmul:
+ // These intrinsics have 'vr' versions.
+ case Intrinsic::vp_sub:
+ case Intrinsic::vp_fsub:
+ case Intrinsic::vp_fdiv:
+ return Operand == 0 || Operand == 1;
default:
return false;
}
/// Return true if the (vector) instruction I will be lowered to an instruction
/// with a scalar splat operand for the given Operand number.
bool canSplatOperand(Instruction *I, int Operand) const;
+ /// Return true if a vector instruction will lower to a target instruction
+ /// able to splat the given operand.
+ bool canSplatOperand(unsigned Opcode, int Operand) const;
bool shouldSinkOperands(Instruction *I,
SmallVectorImpl<Use *> &Ops) const override;
bool shouldScalarizeBinop(SDValue VecOp) const override;
return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
Args, CxtI);
+
+ auto getConstantMatCost =
+ [&](unsigned Operand, TTI::OperandValueInfo OpInfo) -> InstructionCost {
+ if (OpInfo.isUniform() && TLI->canSplatOperand(Opcode, Operand))
+ // Two sub-cases:
+ // * Has a 5 bit immediate operand which can be splatted.
+ // * Has a larger immediate which must be materialized in scalar register
+ // We return 0 for both as we currently ignore the cost of materializing
+ // scalar constants in GPRs.
+ return 0;
+
+ // Add a cost of address generation + the cost of the vector load. The
+ // address is expected to be a PC relative offset to a constant pool entry
+ // using auipc/addi.
+ return 2 + getMemoryOpCost(Instruction::Load, Ty, DL.getABITypeAlign(Ty),
+ /*AddressSpace=*/0, CostKind);
+ };
+
+ // Add the cost of materializing any constant vectors required.
+ InstructionCost ConstantMatCost = 0;
+ if (Op1Info.isConstant())
+ ConstantMatCost += getConstantMatCost(0, Op1Info);
+ if (Op2Info.isConstant())
+ ConstantMatCost += getConstantMatCost(1, Op2Info);
+
switch (TLI->InstructionOpcodeToISD(Opcode)) {
case ISD::ADD:
case ISD::SUB:
case ISD::FSUB:
case ISD::FMUL:
case ISD::FNEG: {
- // TODO: Add the cost of materializing any constant vectors required since
- // we otherwise treat constants as no-cost.
// TODO: We should be accounting for LMUL and scaling costs for LMUL > 1.
- return LT.first * 1;
+ return ConstantMatCost + LT.first * 1;
}
default:
- return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
+ return ConstantMatCost +
+ BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
Args, CxtI);
}
}
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %5 = add <4 x i32> <i32 1, i32 1, i32 1, i32 1>, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = add <2 x i64> <i64 1, i64 1>, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = add <4 x i32> <i32 4096, i32 4096, i32 4096, i32 4096>, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %8 = add <4 x i32> <i32 1, i32 1, i32 2, i32 1>, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %9 = add <4 x i32> <i32 2, i32 1, i32 1, i32 1>, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %10 = add <4 x i32> <i32 0, i32 1, i32 2, i32 3>, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %11 = add <4 x i32> <i32 1, i32 2, i32 3, i32 4>, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %12 = add <4 x i32> <i32 -1, i32 -2, i32 -3, i32 -4>, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %13 = add <4 x i32> <i32 2, i32 4, i32 6, i32 8>, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %14 = add <4 x i32> <i32 -1, i32 0, i32 2, i32 1>, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %15 = add <4 x i32> <i32 256, i32 4096, i32 57, i32 1>, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %8 = add <4 x i32> <i32 1, i32 1, i32 2, i32 1>, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %9 = add <4 x i32> <i32 2, i32 1, i32 1, i32 1>, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %10 = add <4 x i32> <i32 0, i32 1, i32 2, i32 3>, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %11 = add <4 x i32> <i32 1, i32 2, i32 3, i32 4>, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %12 = add <4 x i32> <i32 -1, i32 -2, i32 -3, i32 -4>, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %13 = add <4 x i32> <i32 2, i32 4, i32 6, i32 8>, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %14 = add <4 x i32> <i32 -1, i32 0, i32 2, i32 1>, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %15 = add <4 x i32> <i32 256, i32 4096, i32 57, i32 1>, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;