case RecurKind::FMul:
return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
Name);
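+  // FP min/max reduction ops are materialized as maxnum/minnum intrinsic
+  // calls rather than as compare+select sequences (contrast with the
+  // integer min/max cases below).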
+ case RecurKind::FMax:
+ return Builder.CreateBinaryIntrinsic(Intrinsic::maxnum, LHS, RHS);
+ case RecurKind::FMin:
+ return Builder.CreateBinaryIntrinsic(Intrinsic::minnum, LHS, RHS);
case RecurKind::SMax: {
Value *Cmp = Builder.CreateICmpSGT(LHS, RHS, Name);
if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind))
return true;
+ if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {
+ // FP min/max are associative except for NaN and -0.0. We do not
+ // have to rule out -0.0 here because the intrinsic semantics do not
+ // specify a fixed result for it.
+ // TODO: This is artificially restricted to fast because the code that
+ // creates reductions assumes/produces fast ops.
+ return I->getFastMathFlags().isFast();
+ }
+
return I->isAssociative();
}
if (match(I, m_FMul(m_Value(), m_Value())))
return OperationData(RecurKind::FMul);
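+  // Recognize fmax/fmin reductions rooted at maxnum/minnum intrinsic calls.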
+ if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(), m_Value())))
+ return OperationData(RecurKind::FMax);
+ if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(), m_Value())))
+ return OperationData(RecurKind::FMin);
+
if (match(I, m_SMax(m_Value(), m_Value())))
return OperationData(RecurKind::SMax);
if (match(I, m_SMin(m_Value(), m_Value())))
ScalarCost = TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy);
break;
}
+ case RecurKind::FMax:
+ case RecurKind::FMin: {
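+    // Vector cost: the target's min/max reduction cost. Scalar cost: model
+    // each scalar reduction step as an fcmp plus a select.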
+ auto *VecCondTy = cast<VectorType>(CmpInst::makeCmpResultType(VectorTy));
+ VectorCost =
+ TTI->getMinMaxReductionCost(VectorTy, VecCondTy,
+ /*pairwise=*/false, /*unsigned=*/false);
+ ScalarCost =
+ TTI->getCmpSelInstrCost(Instruction::FCmp, ScalarTy) +
+ TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy,
+ CmpInst::makeCmpResultType(ScalarTy));
+ break;
+ }
case RecurKind::SMax:
case RecurKind::SMin:
case RecurKind::UMax:
return nullptr;
}
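+/// Match a two-operand reduction candidate: a binary operator or a
+/// maxnum/minnum intrinsic call. On success, return true and capture the
+/// operands in \p V0 and \p V1.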
+static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1) {
+ if (match(I, m_BinOp(m_Value(V0), m_Value(V1))))
+ return true;
+ if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(V0), m_Value(V1))))
+ return true;
+ if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(V0), m_Value(V1))))
+ return true;
+ return false;
+}
+
/// Attempt to reduce a horizontal reduction.
/// If it is legal to match a horizontal reduction feeding the phi node \a P
/// with reduction operators \a Root (or one of its operands) in a basic block
unsigned Level;
std::tie(Inst, Level) = Stack.pop_back_val();
Value *B0, *B1;
- bool IsBinop = match(Inst, m_BinOp(m_Value(B0), m_Value(B1)));
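+    // Min/max intrinsic calls are treated like binary operators when
+    // searching for a reduction root.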
+ bool IsBinop = matchRdxBop(Inst, B0, B1);
bool IsSelect = match(Inst, m_Select(m_Value(), m_Value(), m_Value()));
if (IsBinop || IsSelect) {
HorizontalReduction HorRdx;
define float @fmin_v4i32(float* %p) #0 {
; CHECK-LABEL: @fmin_v4i32(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[P:%.*]], align 4, [[TBAA7]]
-; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds float, float* [[P]], i64 1
-; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX_1]], align 4, [[TBAA7]]
-; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.minnum.f32(float [[TMP1]], float [[TMP0]])
-; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds float, float* [[P]], i64 2
-; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[ARRAYIDX_2]], align 4, [[TBAA7]]
-; CHECK-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.minnum.f32(float [[TMP3]], float [[TMP2]])
-; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds float, float* [[P]], i64 3
-; CHECK-NEXT: [[TMP5:%.*]] = load float, float* [[ARRAYIDX_3]], align 4, [[TBAA7]]
-; CHECK-NEXT: [[TMP6:%.*]] = tail call fast float @llvm.minnum.f32(float [[TMP5]], float [[TMP4]])
-; CHECK-NEXT: ret float [[TMP6]]
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[P:%.*]] to <4 x float>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4, [[TBAA7]]
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp fast olt <4 x float> [[TMP1]], [[RDX_SHUF]]
+; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select fast <4 x i1> [[RDX_MINMAX_CMP]], <4 x float> [[TMP1]], <4 x float> [[RDX_SHUF]]
+; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <4 x float> [[RDX_MINMAX_SELECT]], <4 x float> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[RDX_MINMAX_CMP4:%.*]] = fcmp fast olt <4 x float> [[RDX_MINMAX_SELECT]], [[RDX_SHUF3]]
+; CHECK-NEXT: [[RDX_MINMAX_SELECT5:%.*]] = select fast <4 x i1> [[RDX_MINMAX_CMP4]], <4 x float> [[RDX_MINMAX_SELECT]], <4 x float> [[RDX_SHUF3]]
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[RDX_MINMAX_SELECT5]], i32 0
+; CHECK-NEXT: ret float [[TMP2]]
;
entry:
br label %for.cond
; CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds float, float* [[P:%.*]], i64 1
; CHECK-NEXT: [[G2:%.*]] = getelementptr inbounds float, float* [[P]], i64 2
; CHECK-NEXT: [[G3:%.*]] = getelementptr inbounds float, float* [[P]], i64 3
-; CHECK-NEXT: [[T0:%.*]] = load float, float* [[P]], align 4
-; CHECK-NEXT: [[T1:%.*]] = load float, float* [[G1]], align 4
-; CHECK-NEXT: [[T2:%.*]] = load float, float* [[G2]], align 4
-; CHECK-NEXT: [[T3:%.*]] = load float, float* [[G3]], align 4
-; CHECK-NEXT: [[M1:%.*]] = tail call fast float @llvm.maxnum.f32(float [[T1]], float [[T0]])
-; CHECK-NEXT: [[M2:%.*]] = tail call fast float @llvm.maxnum.f32(float [[T2]], float [[M1]])
-; CHECK-NEXT: [[M3:%.*]] = tail call fast float @llvm.maxnum.f32(float [[T3]], float [[M2]])
-; CHECK-NEXT: ret float [[M3]]
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[P]] to <4 x float>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = call fast float @llvm.vector.reduce.fmax.v4f32(<4 x float> [[TMP2]])
+; CHECK-NEXT: ret float [[TMP3]]
;
%g1 = getelementptr inbounds float, float* %p, i64 1
%g2 = getelementptr inbounds float, float* %p, i64 2
ret float %m3
}
+; TODO: This should become a reduce intrinsic.
+
define float @reduction_v4f32_nnan(float* %p) {
; CHECK-LABEL: @reduction_v4f32_nnan(
; CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds float, float* [[P:%.*]], i64 1
ret float %m3
}
+; Negative test - must have nnan.
+
define float @reduction_v4f32_not_fast(float* %p) {
; CHECK-LABEL: @reduction_v4f32_not_fast(
; CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds float, float* [[P:%.*]], i64 1
; CHECK-NEXT: [[G5:%.*]] = getelementptr inbounds float, float* [[P]], i64 5
; CHECK-NEXT: [[G6:%.*]] = getelementptr inbounds float, float* [[P]], i64 6
; CHECK-NEXT: [[G7:%.*]] = getelementptr inbounds float, float* [[P]], i64 7
-; CHECK-NEXT: [[T0:%.*]] = load float, float* [[P]], align 4
-; CHECK-NEXT: [[T1:%.*]] = load float, float* [[G1]], align 4
-; CHECK-NEXT: [[T2:%.*]] = load float, float* [[G2]], align 4
-; CHECK-NEXT: [[T3:%.*]] = load float, float* [[G3]], align 4
-; CHECK-NEXT: [[T4:%.*]] = load float, float* [[G4]], align 4
-; CHECK-NEXT: [[T5:%.*]] = load float, float* [[G5]], align 4
-; CHECK-NEXT: [[T6:%.*]] = load float, float* [[G6]], align 4
-; CHECK-NEXT: [[T7:%.*]] = load float, float* [[G7]], align 4
-; CHECK-NEXT: [[M1:%.*]] = tail call fast float @llvm.maxnum.f32(float [[T1]], float [[T0]])
-; CHECK-NEXT: [[M2:%.*]] = tail call fast float @llvm.maxnum.f32(float [[T2]], float [[M1]])
-; CHECK-NEXT: [[M3:%.*]] = tail call fast float @llvm.maxnum.f32(float [[T3]], float [[M2]])
-; CHECK-NEXT: [[M4:%.*]] = tail call fast float @llvm.maxnum.f32(float [[T4]], float [[M3]])
-; CHECK-NEXT: [[M5:%.*]] = tail call fast float @llvm.maxnum.f32(float [[M4]], float [[T6]])
-; CHECK-NEXT: [[M6:%.*]] = tail call fast float @llvm.maxnum.f32(float [[M5]], float [[T5]])
-; CHECK-NEXT: [[M7:%.*]] = tail call fast float @llvm.maxnum.f32(float [[M6]], float [[T7]])
-; CHECK-NEXT: ret float [[M7]]
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[P]] to <8 x float>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* [[TMP1]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = call fast float @llvm.vector.reduce.fmax.v8f32(<8 x float> [[TMP2]])
+; CHECK-NEXT: ret float [[TMP3]]
;
%g1 = getelementptr inbounds float, float* %p, i64 1
%g2 = getelementptr inbounds float, float* %p, i64 2
; CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds double, double* [[P:%.*]], i64 1
; CHECK-NEXT: [[G2:%.*]] = getelementptr inbounds double, double* [[P]], i64 2
; CHECK-NEXT: [[G3:%.*]] = getelementptr inbounds double, double* [[P]], i64 3
-; CHECK-NEXT: [[T0:%.*]] = load double, double* [[P]], align 4
-; CHECK-NEXT: [[T1:%.*]] = load double, double* [[G1]], align 4
-; CHECK-NEXT: [[T2:%.*]] = load double, double* [[G2]], align 4
-; CHECK-NEXT: [[T3:%.*]] = load double, double* [[G3]], align 4
-; CHECK-NEXT: [[M1:%.*]] = tail call fast double @llvm.maxnum.f64(double [[T1]], double [[T0]])
-; CHECK-NEXT: [[M2:%.*]] = tail call fast double @llvm.maxnum.f64(double [[T2]], double [[M1]])
-; CHECK-NEXT: [[M3:%.*]] = tail call fast double @llvm.maxnum.f64(double [[T3]], double [[M2]])
-; CHECK-NEXT: ret double [[M3]]
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast double* [[P]] to <4 x double>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* [[TMP1]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = call fast double @llvm.vector.reduce.fmax.v4f64(<4 x double> [[TMP2]])
+; CHECK-NEXT: ret double [[TMP3]]
;
%g1 = getelementptr inbounds double, double* %p, i64 1
%g2 = getelementptr inbounds double, double* %p, i64 2
ret double %m3
}
+; Negative test - must have nnan.
+
define double @reduction_v4f64_wrong_fmf(double* %p) {
; CHECK-LABEL: @reduction_v4f64_wrong_fmf(
; CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds double, double* [[P:%.*]], i64 1
; CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds float, float* [[P:%.*]], i64 1
; CHECK-NEXT: [[G2:%.*]] = getelementptr inbounds float, float* [[P]], i64 2
; CHECK-NEXT: [[G3:%.*]] = getelementptr inbounds float, float* [[P]], i64 3
-; CHECK-NEXT: [[T0:%.*]] = load float, float* [[P]], align 4
-; CHECK-NEXT: [[T1:%.*]] = load float, float* [[G1]], align 4
-; CHECK-NEXT: [[T2:%.*]] = load float, float* [[G2]], align 4
-; CHECK-NEXT: [[T3:%.*]] = load float, float* [[G3]], align 4
-; CHECK-NEXT: [[M1:%.*]] = tail call fast float @llvm.minnum.f32(float [[T1]], float [[T0]])
-; CHECK-NEXT: [[M2:%.*]] = tail call fast float @llvm.minnum.f32(float [[T2]], float [[M1]])
-; CHECK-NEXT: [[M3:%.*]] = tail call fast float @llvm.minnum.f32(float [[T3]], float [[M2]])
-; CHECK-NEXT: ret float [[M3]]
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[P]] to <4 x float>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = call fast float @llvm.vector.reduce.fmin.v4f32(<4 x float> [[TMP2]])
+; CHECK-NEXT: ret float [[TMP3]]
;
%g1 = getelementptr inbounds float, float* %p, i64 1
%g2 = getelementptr inbounds float, float* %p, i64 2
ret float %m3
}
+; TODO: This should become a reduce intrinsic.
+
define float @reduction_v4f32_nnan(float* %p) {
; CHECK-LABEL: @reduction_v4f32_nnan(
; CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds float, float* [[P:%.*]], i64 1
ret float %m3
}
+; Negative test - must have nnan.
+
define float @reduction_v4f32_wrong_fmf(float* %p) {
; CHECK-LABEL: @reduction_v4f32_wrong_fmf(
; CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds float, float* [[P:%.*]], i64 1
; CHECK-NEXT: [[G5:%.*]] = getelementptr inbounds float, float* [[P]], i64 5
; CHECK-NEXT: [[G6:%.*]] = getelementptr inbounds float, float* [[P]], i64 6
; CHECK-NEXT: [[G7:%.*]] = getelementptr inbounds float, float* [[P]], i64 7
-; CHECK-NEXT: [[T0:%.*]] = load float, float* [[P]], align 4
-; CHECK-NEXT: [[T1:%.*]] = load float, float* [[G1]], align 4
-; CHECK-NEXT: [[T2:%.*]] = load float, float* [[G2]], align 4
-; CHECK-NEXT: [[T3:%.*]] = load float, float* [[G3]], align 4
-; CHECK-NEXT: [[T4:%.*]] = load float, float* [[G4]], align 4
-; CHECK-NEXT: [[T5:%.*]] = load float, float* [[G5]], align 4
-; CHECK-NEXT: [[T6:%.*]] = load float, float* [[G6]], align 4
-; CHECK-NEXT: [[T7:%.*]] = load float, float* [[G7]], align 4
-; CHECK-NEXT: [[M1:%.*]] = tail call fast float @llvm.minnum.f32(float [[T1]], float [[T0]])
-; CHECK-NEXT: [[M2:%.*]] = tail call fast float @llvm.minnum.f32(float [[T2]], float [[M1]])
-; CHECK-NEXT: [[M3:%.*]] = tail call fast float @llvm.minnum.f32(float [[T3]], float [[M2]])
-; CHECK-NEXT: [[M4:%.*]] = tail call fast float @llvm.minnum.f32(float [[T4]], float [[M3]])
-; CHECK-NEXT: [[M5:%.*]] = tail call fast float @llvm.minnum.f32(float [[M4]], float [[T6]])
-; CHECK-NEXT: [[M6:%.*]] = tail call fast float @llvm.minnum.f32(float [[M5]], float [[T5]])
-; CHECK-NEXT: [[M7:%.*]] = tail call fast float @llvm.minnum.f32(float [[M6]], float [[T7]])
-; CHECK-NEXT: ret float [[M7]]
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[P]] to <8 x float>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* [[TMP1]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = call fast float @llvm.vector.reduce.fmin.v8f32(<8 x float> [[TMP2]])
+; CHECK-NEXT: ret float [[TMP3]]
;
%g1 = getelementptr inbounds float, float* %p, i64 1
%g2 = getelementptr inbounds float, float* %p, i64 2
; CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds double, double* [[P:%.*]], i64 1
; CHECK-NEXT: [[G2:%.*]] = getelementptr inbounds double, double* [[P]], i64 2
; CHECK-NEXT: [[G3:%.*]] = getelementptr inbounds double, double* [[P]], i64 3
-; CHECK-NEXT: [[T0:%.*]] = load double, double* [[P]], align 4
-; CHECK-NEXT: [[T1:%.*]] = load double, double* [[G1]], align 4
-; CHECK-NEXT: [[T2:%.*]] = load double, double* [[G2]], align 4
-; CHECK-NEXT: [[T3:%.*]] = load double, double* [[G3]], align 4
-; CHECK-NEXT: [[M1:%.*]] = tail call fast double @llvm.minnum.f64(double [[T1]], double [[T0]])
-; CHECK-NEXT: [[M2:%.*]] = tail call fast double @llvm.minnum.f64(double [[T2]], double [[M1]])
-; CHECK-NEXT: [[M3:%.*]] = tail call fast double @llvm.minnum.f64(double [[T3]], double [[M2]])
-; CHECK-NEXT: ret double [[M3]]
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast double* [[P]] to <4 x double>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* [[TMP1]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = call fast double @llvm.vector.reduce.fmin.v4f64(<4 x double> [[TMP2]])
+; CHECK-NEXT: ret double [[TMP3]]
;
%g1 = getelementptr inbounds double, double* %p, i64 1
%g2 = getelementptr inbounds double, double* %p, i64 2
ret double %m3
}
+; Negative test - must have nnan.
+
define double @reduction_v4f64_not_fast(double* %p) {
; CHECK-LABEL: @reduction_v4f64_not_fast(
; CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds double, double* [[P:%.*]], i64 1