[SLP] match maxnum/minnum intrinsics as FP reduction ops

author Sanjay Patel <spatel@rotateright.com>

Mon, 18 Jan 2021 21:18:57 +0000 (16:18 -0500)

committer Sanjay Patel <spatel@rotateright.com>

Mon, 18 Jan 2021 22:37:16 +0000 (17:37 -0500)
author Sanjay Patel <spatel@rotateright.com>
Mon, 18 Jan 2021 21:18:57 +0000 (16:18 -0500)
committer Sanjay Patel <spatel@rotateright.com>
Mon, 18 Jan 2021 22:37:16 +0000 (17:37 -0500)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

index 0323e02..0fee52d 100644 (file)
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -6455,6 +6455,10 @@ class HorizontalReduction {
        case RecurKind::FMul:
          return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
                                     Name);
+      case RecurKind::FMax:
+        return Builder.CreateBinaryIntrinsic(Intrinsic::maxnum, LHS, RHS);
+      case RecurKind::FMin:
+        return Builder.CreateBinaryIntrinsic(Intrinsic::minnum, LHS, RHS);
  
        case RecurKind::SMax: {
          Value *Cmp = Builder.CreateICmpSGT(LHS, RHS, Name);
@@ -6568,6 +6572,15 @@ class HorizontalReduction {
        if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind))
          return true;
  
+      if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {
+        // FP min/max are associative except for NaN and -0.0. We do not
+        // have to rule out -0.0 here because the intrinsic semantics do not
+        // specify a fixed result for it.
+        // TODO: This is artificially restricted to fast because the code that
+        //       creates reductions assumes/produces fast ops.
+        return I->getFastMathFlags().isFast();
+      }
+
        return I->isAssociative();
      }
  
@@ -6677,6 +6690,11 @@ class HorizontalReduction {
      if (match(I, m_FMul(m_Value(), m_Value())))
        return OperationData(RecurKind::FMul);
  
+    if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(), m_Value())))
+      return OperationData(RecurKind::FMax);
+    if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(), m_Value())))
+      return OperationData(RecurKind::FMin);
+
      if (match(I, m_SMax(m_Value(), m_Value())))
        return OperationData(RecurKind::SMax);
      if (match(I, m_SMin(m_Value(), m_Value())))
@@ -7076,6 +7094,18 @@ private:
        ScalarCost = TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy);
        break;
      }
+    case RecurKind::FMax:
+    case RecurKind::FMin: {
+      auto *VecCondTy = cast<VectorType>(CmpInst::makeCmpResultType(VectorTy));
+      VectorCost =
+          TTI->getMinMaxReductionCost(VectorTy, VecCondTy,
+                                      /*pairwise=*/false, /*unsigned=*/false);
+      ScalarCost =
+          TTI->getCmpSelInstrCost(Instruction::FCmp, ScalarTy) +
+          TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy,
+                                  CmpInst::makeCmpResultType(ScalarTy));
+      break;
+    }
      case RecurKind::SMax:
      case RecurKind::SMin:
      case RecurKind::UMax:
@@ -7307,6 +7337,16 @@ static Value *getReductionValue(const DominatorTree *DT, PHINode *P,
    return nullptr;
  }
  
+/// Match \p I as a candidate reduction operation: either a binary operator or
+/// a two-operand call to the maxnum/minnum intrinsic. On a successful match
+/// the operands are captured in \p V0 and \p V1 and true is returned;
+/// otherwise false is returned.
+static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1) {
+  return match(I, m_BinOp(m_Value(V0), m_Value(V1))) ||
+         match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(V0), m_Value(V1))) ||
+         match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(V0), m_Value(V1)));
+}
+
  /// Attempt to reduce a horizontal reduction.
  /// If it is legal to match a horizontal reduction feeding the phi node \a P
  /// with reduction operators \a Root (or one of its operands) in a basic block
@@ -7347,7 +7387,7 @@ static bool tryToVectorizeHorReductionOrInstOperands(
      unsigned Level;
      std::tie(Inst, Level) = Stack.pop_back_val();
      Value *B0, *B1;
-    bool IsBinop = match(Inst, m_BinOp(m_Value(B0), m_Value(B1)));
+    bool IsBinop = matchRdxBop(Inst, B0, B1);
      bool IsSelect = match(Inst, m_Select(m_Value(), m_Value(), m_Value()));
      if (IsBinop || IsSelect) {
        HorizontalReduction HorRdx;
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-expanded.ll b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-expanded.ll

index 2c3efaa..a12bd31 100644 (file)
--- a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-expanded.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-expanded.ll
@@ -330,17 +330,16 @@ for.end:
  define float @fmin_v4i32(float* %p) #0 {
  ; CHECK-LABEL: @fmin_v4i32(
  ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load float, float* [[P:%.*]], align 4, [[TBAA7]]
-; CHECK-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds float, float* [[P]], i64 1
-; CHECK-NEXT:    [[TMP1:%.*]] = load float, float* [[ARRAYIDX_1]], align 4, [[TBAA7]]
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call fast float @llvm.minnum.f32(float [[TMP1]], float [[TMP0]])
-; CHECK-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds float, float* [[P]], i64 2
-; CHECK-NEXT:    [[TMP3:%.*]] = load float, float* [[ARRAYIDX_2]], align 4, [[TBAA7]]
-; CHECK-NEXT:    [[TMP4:%.*]] = tail call fast float @llvm.minnum.f32(float [[TMP3]], float [[TMP2]])
-; CHECK-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds float, float* [[P]], i64 3
-; CHECK-NEXT:    [[TMP5:%.*]] = load float, float* [[ARRAYIDX_3]], align 4, [[TBAA7]]
-; CHECK-NEXT:    [[TMP6:%.*]] = tail call fast float @llvm.minnum.f32(float [[TMP5]], float [[TMP4]])
-; CHECK-NEXT:    ret float [[TMP6]]
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[P:%.*]] to <4 x float>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4, [[TBAA7]]
+; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; CHECK-NEXT:    [[RDX_MINMAX_CMP:%.*]] = fcmp fast olt <4 x float> [[TMP1]], [[RDX_SHUF]]
+; CHECK-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select fast <4 x i1> [[RDX_MINMAX_CMP]], <4 x float> [[TMP1]], <4 x float> [[RDX_SHUF]]
+; CHECK-NEXT:    [[RDX_SHUF3:%.*]] = shufflevector <4 x float> [[RDX_MINMAX_SELECT]], <4 x float> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[RDX_MINMAX_CMP4:%.*]] = fcmp fast olt <4 x float> [[RDX_MINMAX_SELECT]], [[RDX_SHUF3]]
+; CHECK-NEXT:    [[RDX_MINMAX_SELECT5:%.*]] = select fast <4 x i1> [[RDX_MINMAX_CMP4]], <4 x float> [[RDX_MINMAX_SELECT]], <4 x float> [[RDX_SHUF3]]
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[RDX_MINMAX_SELECT5]], i32 0
+; CHECK-NEXT:    ret float [[TMP2]]
  ;
  entry:
    br label %for.cond
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/fmaxnum.ll b/llvm/test/Transforms/SLPVectorizer/X86/fmaxnum.ll

index e275486..fc134aa 100644 (file)
--- a/llvm/test/Transforms/SLPVectorizer/X86/fmaxnum.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/fmaxnum.ll
@@ -343,14 +343,10 @@ define float @reduction_v4f32_fast(float* %p) {
  ; CHECK-NEXT:    [[G1:%.*]] = getelementptr inbounds float, float* [[P:%.*]], i64 1
  ; CHECK-NEXT:    [[G2:%.*]] = getelementptr inbounds float, float* [[P]], i64 2
  ; CHECK-NEXT:    [[G3:%.*]] = getelementptr inbounds float, float* [[P]], i64 3
-; CHECK-NEXT:    [[T0:%.*]] = load float, float* [[P]], align 4
-; CHECK-NEXT:    [[T1:%.*]] = load float, float* [[G1]], align 4
-; CHECK-NEXT:    [[T2:%.*]] = load float, float* [[G2]], align 4
-; CHECK-NEXT:    [[T3:%.*]] = load float, float* [[G3]], align 4
-; CHECK-NEXT:    [[M1:%.*]] = tail call fast float @llvm.maxnum.f32(float [[T1]], float [[T0]])
-; CHECK-NEXT:    [[M2:%.*]] = tail call fast float @llvm.maxnum.f32(float [[T2]], float [[M1]])
-; CHECK-NEXT:    [[M3:%.*]] = tail call fast float @llvm.maxnum.f32(float [[T3]], float [[M2]])
-; CHECK-NEXT:    ret float [[M3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[P]] to <4 x float>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = call fast float @llvm.vector.reduce.fmax.v4f32(<4 x float> [[TMP2]])
+; CHECK-NEXT:    ret float [[TMP3]]
  ;
    %g1 = getelementptr inbounds float, float* %p, i64 1
    %g2 = getelementptr inbounds float, float* %p, i64 2
@@ -365,6 +361,8 @@ define float @reduction_v4f32_fast(float* %p) {
    ret float %m3
  }
  
+; TODO: This should become a reduce intrinsic.
+
  define float @reduction_v4f32_nnan(float* %p) {
  ; CHECK-LABEL: @reduction_v4f32_nnan(
  ; CHECK-NEXT:    [[G1:%.*]] = getelementptr inbounds float, float* [[P:%.*]], i64 1
@@ -392,6 +390,8 @@ define float @reduction_v4f32_nnan(float* %p) {
    ret float %m3
  }
  
+; Negative test - must have nnan.
+
  define float @reduction_v4f32_not_fast(float* %p) {
  ; CHECK-LABEL: @reduction_v4f32_not_fast(
  ; CHECK-NEXT:    [[G1:%.*]] = getelementptr inbounds float, float* [[P:%.*]], i64 1
@@ -428,22 +428,10 @@ define float @reduction_v8f32_fast(float* %p) {
  ; CHECK-NEXT:    [[G5:%.*]] = getelementptr inbounds float, float* [[P]], i64 5
  ; CHECK-NEXT:    [[G6:%.*]] = getelementptr inbounds float, float* [[P]], i64 6
  ; CHECK-NEXT:    [[G7:%.*]] = getelementptr inbounds float, float* [[P]], i64 7
-; CHECK-NEXT:    [[T0:%.*]] = load float, float* [[P]], align 4
-; CHECK-NEXT:    [[T1:%.*]] = load float, float* [[G1]], align 4
-; CHECK-NEXT:    [[T2:%.*]] = load float, float* [[G2]], align 4
-; CHECK-NEXT:    [[T3:%.*]] = load float, float* [[G3]], align 4
-; CHECK-NEXT:    [[T4:%.*]] = load float, float* [[G4]], align 4
-; CHECK-NEXT:    [[T5:%.*]] = load float, float* [[G5]], align 4
-; CHECK-NEXT:    [[T6:%.*]] = load float, float* [[G6]], align 4
-; CHECK-NEXT:    [[T7:%.*]] = load float, float* [[G7]], align 4
-; CHECK-NEXT:    [[M1:%.*]] = tail call fast float @llvm.maxnum.f32(float [[T1]], float [[T0]])
-; CHECK-NEXT:    [[M2:%.*]] = tail call fast float @llvm.maxnum.f32(float [[T2]], float [[M1]])
-; CHECK-NEXT:    [[M3:%.*]] = tail call fast float @llvm.maxnum.f32(float [[T3]], float [[M2]])
-; CHECK-NEXT:    [[M4:%.*]] = tail call fast float @llvm.maxnum.f32(float [[T4]], float [[M3]])
-; CHECK-NEXT:    [[M5:%.*]] = tail call fast float @llvm.maxnum.f32(float [[M4]], float [[T6]])
-; CHECK-NEXT:    [[M6:%.*]] = tail call fast float @llvm.maxnum.f32(float [[M5]], float [[T5]])
-; CHECK-NEXT:    [[M7:%.*]] = tail call fast float @llvm.maxnum.f32(float [[M6]], float [[T7]])
-; CHECK-NEXT:    ret float [[M7]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[P]] to <8 x float>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x float>, <8 x float>* [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = call fast float @llvm.vector.reduce.fmax.v8f32(<8 x float> [[TMP2]])
+; CHECK-NEXT:    ret float [[TMP3]]
  ;
    %g1 = getelementptr inbounds float, float* %p, i64 1
    %g2 = getelementptr inbounds float, float* %p, i64 2
@@ -490,14 +478,10 @@ define double @reduction_v4f64_fast(double* %p) {
  ; CHECK-NEXT:    [[G1:%.*]] = getelementptr inbounds double, double* [[P:%.*]], i64 1
  ; CHECK-NEXT:    [[G2:%.*]] = getelementptr inbounds double, double* [[P]], i64 2
  ; CHECK-NEXT:    [[G3:%.*]] = getelementptr inbounds double, double* [[P]], i64 3
-; CHECK-NEXT:    [[T0:%.*]] = load double, double* [[P]], align 4
-; CHECK-NEXT:    [[T1:%.*]] = load double, double* [[G1]], align 4
-; CHECK-NEXT:    [[T2:%.*]] = load double, double* [[G2]], align 4
-; CHECK-NEXT:    [[T3:%.*]] = load double, double* [[G3]], align 4
-; CHECK-NEXT:    [[M1:%.*]] = tail call fast double @llvm.maxnum.f64(double [[T1]], double [[T0]])
-; CHECK-NEXT:    [[M2:%.*]] = tail call fast double @llvm.maxnum.f64(double [[T2]], double [[M1]])
-; CHECK-NEXT:    [[M3:%.*]] = tail call fast double @llvm.maxnum.f64(double [[T3]], double [[M2]])
-; CHECK-NEXT:    ret double [[M3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast double* [[P]] to <4 x double>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x double>, <4 x double>* [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = call fast double @llvm.vector.reduce.fmax.v4f64(<4 x double> [[TMP2]])
+; CHECK-NEXT:    ret double [[TMP3]]
  ;
    %g1 = getelementptr inbounds double, double* %p, i64 1
    %g2 = getelementptr inbounds double, double* %p, i64 2
@@ -512,6 +496,8 @@ define double @reduction_v4f64_fast(double* %p) {
    ret double %m3
  }
  
+; Negative test - must have nnan.
+
  define double @reduction_v4f64_wrong_fmf(double* %p) {
  ; CHECK-LABEL: @reduction_v4f64_wrong_fmf(
  ; CHECK-NEXT:    [[G1:%.*]] = getelementptr inbounds double, double* [[P:%.*]], i64 1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/fminnum.ll b/llvm/test/Transforms/SLPVectorizer/X86/fminnum.ll

index 15a7848..e5a4fc2 100644 (file)
--- a/llvm/test/Transforms/SLPVectorizer/X86/fminnum.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/fminnum.ll
@@ -343,14 +343,10 @@ define float @reduction_v4f32_fast(float* %p) {
  ; CHECK-NEXT:    [[G1:%.*]] = getelementptr inbounds float, float* [[P:%.*]], i64 1
  ; CHECK-NEXT:    [[G2:%.*]] = getelementptr inbounds float, float* [[P]], i64 2
  ; CHECK-NEXT:    [[G3:%.*]] = getelementptr inbounds float, float* [[P]], i64 3
-; CHECK-NEXT:    [[T0:%.*]] = load float, float* [[P]], align 4
-; CHECK-NEXT:    [[T1:%.*]] = load float, float* [[G1]], align 4
-; CHECK-NEXT:    [[T2:%.*]] = load float, float* [[G2]], align 4
-; CHECK-NEXT:    [[T3:%.*]] = load float, float* [[G3]], align 4
-; CHECK-NEXT:    [[M1:%.*]] = tail call fast float @llvm.minnum.f32(float [[T1]], float [[T0]])
-; CHECK-NEXT:    [[M2:%.*]] = tail call fast float @llvm.minnum.f32(float [[T2]], float [[M1]])
-; CHECK-NEXT:    [[M3:%.*]] = tail call fast float @llvm.minnum.f32(float [[T3]], float [[M2]])
-; CHECK-NEXT:    ret float [[M3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[P]] to <4 x float>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = call fast float @llvm.vector.reduce.fmin.v4f32(<4 x float> [[TMP2]])
+; CHECK-NEXT:    ret float [[TMP3]]
  ;
    %g1 = getelementptr inbounds float, float* %p, i64 1
    %g2 = getelementptr inbounds float, float* %p, i64 2
@@ -365,6 +361,8 @@ define float @reduction_v4f32_fast(float* %p) {
    ret float %m3
  }
  
+; TODO: This should become a reduce intrinsic.
+
  define float @reduction_v4f32_nnan(float* %p) {
  ; CHECK-LABEL: @reduction_v4f32_nnan(
  ; CHECK-NEXT:    [[G1:%.*]] = getelementptr inbounds float, float* [[P:%.*]], i64 1
@@ -392,6 +390,8 @@ define float @reduction_v4f32_nnan(float* %p) {
    ret float %m3
  }
  
+; Negative test - must have nnan.
+
  define float @reduction_v4f32_wrong_fmf(float* %p) {
  ; CHECK-LABEL: @reduction_v4f32_wrong_fmf(
  ; CHECK-NEXT:    [[G1:%.*]] = getelementptr inbounds float, float* [[P:%.*]], i64 1
@@ -428,22 +428,10 @@ define float @reduction_v8f32_fast(float* %p) {
  ; CHECK-NEXT:    [[G5:%.*]] = getelementptr inbounds float, float* [[P]], i64 5
  ; CHECK-NEXT:    [[G6:%.*]] = getelementptr inbounds float, float* [[P]], i64 6
  ; CHECK-NEXT:    [[G7:%.*]] = getelementptr inbounds float, float* [[P]], i64 7
-; CHECK-NEXT:    [[T0:%.*]] = load float, float* [[P]], align 4
-; CHECK-NEXT:    [[T1:%.*]] = load float, float* [[G1]], align 4
-; CHECK-NEXT:    [[T2:%.*]] = load float, float* [[G2]], align 4
-; CHECK-NEXT:    [[T3:%.*]] = load float, float* [[G3]], align 4
-; CHECK-NEXT:    [[T4:%.*]] = load float, float* [[G4]], align 4
-; CHECK-NEXT:    [[T5:%.*]] = load float, float* [[G5]], align 4
-; CHECK-NEXT:    [[T6:%.*]] = load float, float* [[G6]], align 4
-; CHECK-NEXT:    [[T7:%.*]] = load float, float* [[G7]], align 4
-; CHECK-NEXT:    [[M1:%.*]] = tail call fast float @llvm.minnum.f32(float [[T1]], float [[T0]])
-; CHECK-NEXT:    [[M2:%.*]] = tail call fast float @llvm.minnum.f32(float [[T2]], float [[M1]])
-; CHECK-NEXT:    [[M3:%.*]] = tail call fast float @llvm.minnum.f32(float [[T3]], float [[M2]])
-; CHECK-NEXT:    [[M4:%.*]] = tail call fast float @llvm.minnum.f32(float [[T4]], float [[M3]])
-; CHECK-NEXT:    [[M5:%.*]] = tail call fast float @llvm.minnum.f32(float [[M4]], float [[T6]])
-; CHECK-NEXT:    [[M6:%.*]] = tail call fast float @llvm.minnum.f32(float [[M5]], float [[T5]])
-; CHECK-NEXT:    [[M7:%.*]] = tail call fast float @llvm.minnum.f32(float [[M6]], float [[T7]])
-; CHECK-NEXT:    ret float [[M7]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[P]] to <8 x float>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x float>, <8 x float>* [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = call fast float @llvm.vector.reduce.fmin.v8f32(<8 x float> [[TMP2]])
+; CHECK-NEXT:    ret float [[TMP3]]
  ;
    %g1 = getelementptr inbounds float, float* %p, i64 1
    %g2 = getelementptr inbounds float, float* %p, i64 2
@@ -490,14 +478,10 @@ define double @reduction_v4f64_fast(double* %p) {
  ; CHECK-NEXT:    [[G1:%.*]] = getelementptr inbounds double, double* [[P:%.*]], i64 1
  ; CHECK-NEXT:    [[G2:%.*]] = getelementptr inbounds double, double* [[P]], i64 2
  ; CHECK-NEXT:    [[G3:%.*]] = getelementptr inbounds double, double* [[P]], i64 3
-; CHECK-NEXT:    [[T0:%.*]] = load double, double* [[P]], align 4
-; CHECK-NEXT:    [[T1:%.*]] = load double, double* [[G1]], align 4
-; CHECK-NEXT:    [[T2:%.*]] = load double, double* [[G2]], align 4
-; CHECK-NEXT:    [[T3:%.*]] = load double, double* [[G3]], align 4
-; CHECK-NEXT:    [[M1:%.*]] = tail call fast double @llvm.minnum.f64(double [[T1]], double [[T0]])
-; CHECK-NEXT:    [[M2:%.*]] = tail call fast double @llvm.minnum.f64(double [[T2]], double [[M1]])
-; CHECK-NEXT:    [[M3:%.*]] = tail call fast double @llvm.minnum.f64(double [[T3]], double [[M2]])
-; CHECK-NEXT:    ret double [[M3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast double* [[P]] to <4 x double>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x double>, <4 x double>* [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = call fast double @llvm.vector.reduce.fmin.v4f64(<4 x double> [[TMP2]])
+; CHECK-NEXT:    ret double [[TMP3]]
  ;
    %g1 = getelementptr inbounds double, double* %p, i64 1
    %g2 = getelementptr inbounds double, double* %p, i64 2
@@ -512,6 +496,8 @@ define double @reduction_v4f64_fast(double* %p) {
    ret double %m3
  }
  
+; Negative test - must have nnan.
+
  define double @reduction_v4f64_not_fast(double* %p) {
  ; CHECK-LABEL: @reduction_v4f64_not_fast(
  ; CHECK-NEXT:    [[G1:%.*]] = getelementptr inbounds double, double* [[P:%.*]], i64 1
author	Sanjay Patel <spatel@rotateright.com>
	Mon, 18 Jan 2021 21:18:57 +0000 (16:18 -0500)
committer	Sanjay Patel <spatel@rotateright.com>
	Mon, 18 Jan 2021 22:37:16 +0000 (17:37 -0500)
llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp		patch \| blob \| history
llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-expanded.ll		patch \| blob \| history
llvm/test/Transforms/SLPVectorizer/X86/fmaxnum.ll		patch \| blob \| history
llvm/test/Transforms/SLPVectorizer/X86/fminnum.ll		patch \| blob \| history